diff --git a/README.md b/README.md
index 7132a031..04f4d269 100644
--- a/README.md
+++ b/README.md
@@ -26,13 +26,24 @@ The full API is described in the documentation page [https://hyperion-ml.readthe
 ### Prerequisites

 We use anaconda or miniconda, though you should be able to make it work in other python distributions
- To start, you should create a new enviroment and install PyTorch>=1.9, (older versions are not supported any longer) e.g.:
+ To start, you should create a new environment and install PyTorch:
 ```
-conda create --name ${your_env} python=3.8
+conda create --name ${your_env} python=3.11
 conda activate ${your_env}
-conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=10.2 -c pytorch
+# We used PyTorch 2.0.1; other versions may work too
+conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.8 -c pytorch -c nvidia
+# If using k2 for ASR
+wget https://huggingface.co/csukuangfj/k2/resolve/main/ubuntu-cuda/k2-1.24.4.dev20240223+cuda11.8.torch2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+pip install k2-1.24.4.dev20240223+cuda11.8.torch2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
 ```
-In next Hyperion versions, we will upgrade to Pytorch>=1.9 and drop compatibility with older PyTorch versions.
+
+For older systems with a CUDA 10.2 driver:
+```
+conda create --name ${your_env} python=3.10
+conda activate ${your_env}
+conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=10.2 -c pytorch
+```
+

 ### Installing Hyperion

@@ -41,27 +52,12 @@ In next Hyperion versions, we will upgrade to Pytorch>=1.9 and drop compatibilit
 git clone https://github.com/hyperion-ml/hyperion.git
 ```

-- You can choose to install hyperion in the environment
+- Then install hyperion in the environment:
 ```bash
 cd hyperion
 pip install -e .
 ```

-- Or add the hyperion toolkit to the PYTHONPATH envirnoment variable
-  This option will allow you to share the same environment if you are working with several hyperion branches
-  at the same time, while installing it requires to have an enviroment per branch.
-  For this, you need to install the requirements
-```bash
-cd hyperion
-pip install -r requirements.txt
-```
-Then add these lines to your `~/.bashrc` or to each script that uses hyperion
-```bash
-HYP_ROOT= #substitute this by your hyperion location
-export PYTHONPATH=${HYP_ROOT}:$PYTHONPATH
-export PATH=${HYP_ROOT}/bin:$PATH
-```
-
 ## Recipes

 There are recipes for several tasks in the `./egs` directory.
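As a quick sanity check of the environment described above (a minimal sketch, assuming the conda and pip installs succeeded; these commands are not part of the recipes):

```bash
# Should print the PyTorch version you installed, e.g. 2.0.1
python -c "import torch; print(torch.__version__)"
# Optional: confirm k2 imports, if you installed it for the ASR recipes
python -c "import k2; print(k2.__file__)"
# After `pip install -e .`, hyperion itself should import cleanly
python -c "import hyperion"
```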
diff --git a/apps.txt b/apps.txt index 4bf4a173..837c064b 100644 --- a/apps.txt +++ b/apps.txt @@ -1,69 +1,14 @@ -apply-mvn-select-frames.py -compute-energy-vad.py -compute-mfcc-feats.py -copy-feats.py -eval-cos-1vs1.py -eval-linear-gbe-up.py -eval-linear-gbe.py -eval-linear-svmc.py -eval-logistic-regression.py -eval-plda-1vs1.py -eval-plda-nvs1.py -make-babble-noise-audio-files.py -merge-h5-files.py -pack-audio-files.py -pack-wav-rirs.py -plot-vector-hist.py -plot-vector-tsne.py -preprocess-audio-files.py -rttm-to-bin-vad.py -segments-to-bin-vad.py -torch-adv-finetune-xvec-from-wav.py -torch-adv-finetune-xvec.py -torch-compute-mfcc-feats.py -torch-eval-vae.py -torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py -torch-eval-xvec-cosine-scoring-from-adv-test-wav.py -torch-eval-xvec-cosine-scoring-from-art-test-wav.py -torch-eval-xvec-cosine-scoring-from-test-wav.py -torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py -torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py -torch-eval-xvec-logits-from-wav.py -torch-extract-xvectors-from-wav-with-rttm.py -torch-extract-xvectors-from-wav.py -torch-extract-xvectors-slidwin-from-wav.py -torch-extract-xvectors-slidwin.py -torch-extract-xvectors-vae-preproc.py -torch-extract-xvectors.py -torch-finetune-xvec-dfr-from-wav.py -torch-finetune-xvec-dfr.py -torch-finetune-xvec-from-wav.py -torch-finetune-xvec.py -torch-generate-adv-attacks-xvector-classif.py -torch-generate-adv-attacks-xvector-verif.py -torch-train-dvae.py -torch-train-efficientnet-xvec-from-wav.py -torch-train-efficientnet-xvec.py -torch-train-resnet-xvec-from-wav.py -torch-train-resnet-xvec.py -torch-train-spinenet-xvec-from-wav.py -torch-train-tdnn-xvec-from-wav.py -torch-train-tdnn-xvec.py -torch-train-transformer-xvec-v1-from-wav.py -torch-train-transformer-xvec-v1.py -torch-train-vae.py -torch-train-vq-dvae.py -torch-train-vq-vae.py -torch-train-xvec-from-wav.py -train-cw-up.py -train-cw.py -train-gaussianizer.py -train-lda.py -train-linear-gbe-up.py -train-linear-gbe.py -train-linear-svmc.py -train-logistic-regression.py -train-mvn.py -train-nda.py -train-pca.py -train-plda.py +compute_energy_vad.py +extract_wav2vec2xvectors.py +extract_xvectors_from_wav.py +finetune_wav2vec2xvector.py +finetune_xvector_dfr_from_feats.py +finetune_xvector_dfr_from_wav.py +finetune_xvector_from_feats.py +finetune_xvector_from_wav.py +make_babble_noise_audio_files.py +pack_wav_rirs.py +preprocess_audio_files.py +train_wav2vec2xvector.py +train_xvector_from_feats.py +train_xvector_from_wav.py diff --git a/egs/chime5_spkdet/v1/local/score_dcf.py b/egs/chime5_spkdet/v1/local/score_dcf.py index 1137e049..cba16610 100755 --- a/egs/chime5_spkdet/v1/local/score_dcf.py +++ b/egs/chime5_spkdet/v1/local/score_dcf.py @@ -15,7 +15,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/chime5_spkdet/v1/steps_be/eval-be-diar-v2.py b/egs/chime5_spkdet/v1/steps_be/eval-be-diar-v2.py index b77d3595..9ef02a02 100755 --- a/egs/chime5_spkdet/v1/steps_be/eval-be-diar-v2.py +++ b/egs/chime5_spkdet/v1/steps_be/eval-be-diar-v2.py @@ -17,7 +17,7 @@ from hyperion.utils import TrialScores from hyperion.helpers import MultiTestTrialDataReaderV2 as TDR from hyperion.helpers import PLDAFactory as F -from 
hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def combine_diar_scores(ndx, orig_seg, subseg_scores): diff --git a/egs/chime5_spkdet/v1/steps_be/eval-be-v1.py b/egs/chime5_spkdet/v1/steps_be/eval-be-v1.py index dc3e3f87..19d582e4 100755 --- a/egs/chime5_spkdet/v1/steps_be/eval-be-v1.py +++ b/egs/chime5_spkdet/v1/steps_be/eval-be-v1.py @@ -19,7 +19,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/chime5_spkdet/v1/steps_be/eval-calibration-v1.py b/egs/chime5_spkdet/v1/steps_be/eval-calibration-v1.py index fb5dd6f9..1cf80177 100755 --- a/egs/chime5_spkdet/v1/steps_be/eval-calibration-v1.py +++ b/egs/chime5_spkdet/v1/steps_be/eval-calibration-v1.py @@ -18,8 +18,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): diff --git a/egs/chime5_spkdet/v1/steps_be/train-be-v1.py b/egs/chime5_spkdet/v1/steps_be/train-be-v1.py index 55c412ac..6d1af604 100755 --- a/egs/chime5_spkdet/v1/steps_be/train-be-v1.py +++ b/egs/chime5_spkdet/v1/steps_be/train-be-v1.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F diff --git a/egs/chime5_spkdet/v1/steps_be/train-calibration-v1.py b/egs/chime5_spkdet/v1/steps_be/train-calibration-v1.py index fa1dfcf7..9eaa7187 100755 --- a/egs/chime5_spkdet/v1/steps_be/train-calibration-v1.py +++ b/egs/chime5_spkdet/v1/steps_be/train-calibration-v1.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, verbose): diff --git a/egs/dihard2019/v1/steps_diar/eval-ahc-v1.py b/egs/dihard2019/v1/steps_diar/eval-ahc-v1.py index 25282718..c45767b2 100755 --- a/egs/dihard2019/v1/steps_diar/eval-ahc-v1.py +++ b/egs/dihard2019/v1/steps_diar/eval-ahc-v1.py @@ -27,13 +27,13 @@ from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.io import VADReaderFactory as VRF from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList, PCA, LNorm -from hyperion.clustering import AHC -from hyperion.pdfs import GMMTiedDiagCov as GMM -from hyperion.diarization import DiarAHCPLDA as Diar +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.clustering import AHC +from hyperion.np.pdfs import GMMTiedDiagCov as 
GMM +from hyperion.np.diarization import DiarAHCPLDA as Diar -# from hyperion.pdfs import GMMDiagCov as GMM2 -# from hyperion.pdfs import GMM as GMM3 +# from hyperion.np.pdfs import GMMDiagCov as GMM2 +# from hyperion.np.pdfs import GMM as GMM3 def make_timestamps(n, win_start, win_length, win_shift, win_shrink): diff --git a/egs/dihard2019/v1/steps_diar/train-plda-v1.py b/egs/dihard2019/v1/steps_diar/train-plda-v1.py index c7589c8a..713798af 100755 --- a/egs/dihard2019/v1/steps_diar/train-plda-v1.py +++ b/egs/dihard2019/v1/steps_diar/train-plda-v1.py @@ -22,7 +22,7 @@ from hyperion.utils import Utt2Info # from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList, LDA, LNorm, PCA +from hyperion.np.transforms import TransformList, LDA, LNorm, PCA from hyperion.helpers import PLDAFactory as F from hyperion.io import RandomAccessDataReaderFactory as DRF diff --git a/egs/librispeech/v0/cmd.sh b/egs/librispeech/v0/cmd.sh new file mode 100755 index 00000000..89dbb7d8 --- /dev/null +++ b/egs/librispeech/v0/cmd.sh @@ -0,0 +1,28 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
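+# For example, on a single machine with no queueing system (a sketch, not used
+# by the configuration below), all three commands could simply be:
+#   export train_cmd="run.pl"
+#   export cuda_cmd="run.pl"
+#   export cuda_eval_cmd="run.pl"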
+ +if [ "$(hostname -d)" == "cm.gemini" ];then + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 10G" + #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --config conf/clsp.conf --mem 4G " + export cuda_cmd="queue.pl --config conf/clsp.conf --mem 20G" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/librispeech/v0/conf/clsp.conf b/egs/librispeech/v0/conf/clsp.conf new file mode 100644 index 00000000..959c62a7 --- /dev/null +++ b/egs/librispeech/v0/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[1345679]*|c2[12357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/librispeech/v0/conf/infer.yaml b/egs/librispeech/v0/conf/infer.yaml new file mode 100644 index 00000000..cd50a2cb --- /dev/null +++ b/egs/librispeech/v0/conf/infer.yaml @@ -0,0 +1,4 @@ +beam_width: 5 +decoding_method: time_sync_beam_search +#decoding_method: greedy +#decoding_method: align_length_sync_beam_search \ No newline at end of file diff --git a/egs/librispeech/v0/conf/reverb_noise20dB_aug.yaml b/egs/librispeech/v0/conf/reverb_noise20dB_aug.yaml new file mode 100644 index 00000000..23086ecb --- /dev/null +++ b/egs/librispeech/v0/conf/reverb_noise20dB_aug.yaml @@ -0,0 +1,34 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 10 + max_snr: 20 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 10 + max_snr: 20 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 10 + max_snr: 20 diff --git a/egs/librispeech/v0/conf/reverb_noise_aug.yaml b/egs/librispeech/v0/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/librispeech/v0/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 
3 + max_snr: 18 + diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v1.2.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v1.2.yaml new file mode 100644 index 00000000..fc5b833a --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v1.2.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.0.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.0.yaml new file mode 100644 index 00000000..c16a9e6d --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.0.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: local-scaled-dot-prod-v1 + att_context: 64 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.1.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.1.yaml new file mode 100644 index 00000000..9dd6a944 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.1.yaml @@ -0,0 +1,77 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: local-scaled-dot-prod-v1 + att_context: 64 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + pos_enc_type: abs + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.10.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.10.yaml new file mode 100644 index 00000000..43c2063d --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.10.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: block-scaled-dot-prod-v1 + att_context: 1 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.2.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.2.yaml new file mode 100644 index 00000000..3b3a83b4 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.2.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: local-scaled-dot-prod-v1 + att_context: 32 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.3.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.3.yaml new file mode 100644 index 00000000..9286657b --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.3.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: local-scaled-dot-prod-v1 + att_context: 16 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.1.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.1.yaml new file mode 100644 index 00000000..b4869ed3 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.1.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: local-scaled-dot-prod-v1 + att_context: 4 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.2.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.2.yaml new file mode 100644 index 00000000..645f784c --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.2.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: local-scaled-dot-prod-v1 + att_context: 2 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.yaml new file mode 100644 index 00000000..fbbac0c2 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: local-scaled-dot-prod-v1 + att_context: 8 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.5.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.5.yaml new file mode 100644 index 00000000..f1f8c414 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.5.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: block-scaled-dot-prod-v1 + att_context: 32 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.6.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.6.yaml new file mode 100644 index 00000000..44cb9642 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.6.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: block-scaled-dot-prod-v1 + att_context: 16 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.7.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.7.yaml new file mode 100644 index 00000000..031061f9 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.7.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: block-scaled-dot-prod-v1 + att_context: 8 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.8.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.8.yaml new file mode 100644 index 00000000..6cb61718 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.8.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: block-scaled-dot-prod-v1 + att_context: 4 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.9.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.9.yaml new file mode 100644 index 00000000..4b5e0e4d --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.9.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: block-scaled-dot-prod-v1 + att_context: 2 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_lstm_rnnt_k2_pruned_stage1_v1.2.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_lstm_rnnt_k2_pruned_stage1_v1.2.yaml new file mode 100644 index 00000000..91b5fccb --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_lstm_rnnt_k2_pruned_stage1_v1.2.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + rnn_type: lstm + num_layers: 1 + hid_feats: 512 + proj_feats: 0 + out_feats: 512 + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.0.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.0.yaml new file mode 100644 index 00000000..bdb33845 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 70. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 70. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + decoder: + rnnt_loss: k2_pruned + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.2.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.2.yaml new file mode 100644 index 00000000..cfd41553 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.2.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 70. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 70. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + decoder: + rnnt_loss: k2_pruned + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml new file mode 100644 index 00000000..2cf2d04c --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 70. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 70. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_stage1_v1.0.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_stage1_v1.0.yaml new file mode 100644 index 00000000..c66a1ca4 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_stage1_v1.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 70. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 70. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + decoder: + rnnt_loss: k2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_transducer_stage1_v5.0.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_transducer_stage1_v5.0.yaml new file mode 100644 index 00000000..c23a4f11 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_transducer_stage1_v5.0.yaml @@ -0,0 +1,53 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 75. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 75. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2base_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 42000 + hold_steps: 15000 + min_lr: 4e-5 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 1200 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_transducer_stage1_v6.1.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_transducer_stage1_v6.1.yaml new file mode 100644 index 00000000..c1490295 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_transducer_stage1_v6.1.yaml @@ -0,0 +1,53 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 75. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 75. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2base_rnn_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 1200 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v0/conf/train_wav2vec2base_transducer_stage1_v7.1.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_transducer_stage1_v7.1.yaml new file mode 100644 index 00000000..7381bb01 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2base_transducer_stage1_v7.1.yaml @@ -0,0 +1,53 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 75. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 75. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2base_rnnt_ta_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml new file mode 100644 index 00000000..edc0af5e --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml new file mode 100644 index 00000000..aefddc7e --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_enclast.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml new file mode 100644 index 00000000..49077fd6 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml new file mode 100644 index 00000000..9f070bbe --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml new file mode 100644 index 00000000..d787a373 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml new file mode 100644 index 00000000..76d676f2 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml new file mode 100644 index 00000000..35b2b47c --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml new file mode 100644 index 00000000..855bfc98 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80. 
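Note the interplay between `max_batch_length` and `eff_batch_size` in these trainers: an 80-second batch of LibriSpeech utterances holds only a handful of segments, so an effective batch of 128 presumably comes from gradient accumulation. A rough sanity check, assuming `eff_batch_size` counts utterances and the trainer accumulates gradients to reach it (an assumption about hyperion's bookkeeping, not a documented contract):

```python
def accum_steps(eff_batch_size, utts_per_batch):
    # Accumulate over enough mini-batches to emulate eff_batch_size utterances.
    return max(1, round(eff_batch_size / utts_per_batch))

# ~80 s of audio at ~10 s per utterance is about 8 utterances per batch:
print(accum_steps(128, 8))    # -> 16 accumulation steps
print(accum_steps(128, 128))  # -> 1 (no accumulation needed)
```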
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 42000 + hold_steps: 15000 + min_lr: 4e-5 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 1200 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml new file mode 100644 index 00000000..69c489b0 --- /dev/null +++ b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml @@ -0,0 +1,61 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + transducer: + decoder: + override_dropouts: true + embedding_dropout_rate: 0.3 + rnn_dropout_rate: 0.3 + +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v0/conf/wav2vec2base_rnn_transducer_do0.4.yaml b/egs/librispeech/v0/conf/wav2vec2base_rnn_transducer_do0.4.yaml new file mode 100644 index 00000000..6ddc7259 --- /dev/null +++ b/egs/librispeech/v0/conf/wav2vec2base_rnn_transducer_do0.4.yaml @@ -0,0 +1,12 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h +transducer: + decoder: + embed_dim: 1024 + num_pred_layers: 2 + pred_hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/librispeech/v0/conf/wav2vec2base_rnnt_ta_do0.4.yaml b/egs/librispeech/v0/conf/wav2vec2base_rnnt_ta_do0.4.yaml new file mode 100644 index 00000000..cfab3fb9 --- /dev/null +++ b/egs/librispeech/v0/conf/wav2vec2base_rnnt_ta_do0.4.yaml @@ -0,0 +1,16 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h +transducer: + decoder: + rnnt_loss: torchaudio + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/librispeech/v0/conf/wav2vec2base_transducer_do0.4.yaml b/egs/librispeech/v0/conf/wav2vec2base_transducer_do0.4.yaml new 
file mode 100644 index 00000000..3707672a --- /dev/null +++ b/egs/librispeech/v0/conf/wav2vec2base_transducer_do0.4.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer.yaml b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer.yaml new file mode 100644 index 00000000..a7071b8c --- /dev/null +++ b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer.yaml @@ -0,0 +1,14 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + #embedding_dim: 128 + #num_layers: 1 + #hidden_dim: 64 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do.yaml b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do.yaml new file mode 100644 index 00000000..c7fc2df7 --- /dev/null +++ b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.1 + rnn_dropout_rate: 0.1 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do0.2.yaml b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do0.2.yaml new file mode 100644 index 00000000..1ee4ec72 --- /dev/null +++ b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do0.2.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.2 + rnn_dropout_rate: 0.2 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do0.3.yaml b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do0.3.yaml new file mode 100644 index 00000000..ca7c1995 --- /dev/null +++ b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do0.3.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.3 + rnn_dropout_rate: 0.3 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do0.4.yaml b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do0.4.yaml new file mode 100644 index 00000000..9fed09e7 --- /dev/null +++ b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do0.4.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + joiner: + num_layers: 1 
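All of these model configs fuse the wav2vec2 hidden layers with `feat_fusion_method: weighted-avg`, skipping the earliest layers via `feat_fusion_start: 2`. A minimal sketch of what such a fusion module plausibly looks like, using learnable softmax-normalized weights over the selected layers (hyperion's actual module may slice or normalize differently):

```python
import torch
import torch.nn as nn

class WeightedAvgFusion(nn.Module):
    """Combine per-layer encoder outputs with learnable softmax weights,
    starting at layer index feat_fusion_start (a sketch, not hyperion's code)."""
    def __init__(self, num_layers, feat_fusion_start=2):
        super().__init__()
        self.start = feat_fusion_start
        self.weights = nn.Parameter(torch.zeros(num_layers - feat_fusion_start))

    def forward(self, hidden_states):
        # hidden_states: list of (batch, time, feat) tensors, one per layer
        stacked = torch.stack(hidden_states[self.start:], dim=-1)
        w = torch.softmax(self.weights, dim=0)
        return (stacked * w).sum(dim=-1)
```

The alternative `feat_fusion_method: last` in the `enclast` config simply takes the top layer's output instead of a learned mixture.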
+feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_enclast.yaml b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_enclast.yaml new file mode 100644 index 00000000..1d46c33c --- /dev/null +++ b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_enclast.yaml @@ -0,0 +1,11 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + joiner: + num_layers: 1 +feat_fusion_method: last + diff --git a/egs/librispeech/v0/datapath.sh b/egs/librispeech/v0/datapath.sh new file mode 100644 index 00000000..4c7987ef --- /dev/null +++ b/egs/librispeech/v0/datapath.sh @@ -0,0 +1,22 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + librispeech_root=/export/corpora5/LibriSpeech + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 + # voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 + # voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + # musan_root=/expscratch/dgromero/corpora-open/musan + echo "Put your database paths here" + exit 1 +else + echo "Put your database paths here" + exit 1 +fi + + diff --git a/egs/librispeech/v0/default_config.sh b/egs/librispeech/v0/default_config.sh new file mode 120000 index 00000000..2b6239b6 --- /dev/null +++ b/egs/librispeech/v0/default_config.sh @@ -0,0 +1 @@ +global_conf/config_transducer_v1.sh \ No newline at end of file diff --git a/egs/voxceleb/adv.v1/steps_pyfe b/egs/librispeech/v0/feats similarity index 100% rename from egs/voxceleb/adv.v1/steps_pyfe rename to egs/librispeech/v0/feats diff --git a/egs/librispeech/v0/global_conf/config_transducer_v1.sh b/egs/librispeech/v0/global_conf/config_transducer_v1.sh new file mode 100644 index 00000000..39c4d90f --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_transducer_v1.sh @@ -0,0 +1,39 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean +test_data=test_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/librispeech/v0/global_conf/config_transducer_v2.sh b/egs/librispeech/v0/global_conf/config_transducer_v2.sh new file mode 100644 index 00000000..f663e2dd --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_transducer_v2.sh @@ -0,0 +1,39 @@ +# WavLM base 
trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean +# nnet_data=train_clean_small + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/librispeech/v0/global_conf/config_transducer_v3.1.sh b/egs/librispeech/v0/global_conf/config_transducer_v3.1.sh new file mode 100644 index 00000000..0aa4d949 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_transducer_v3.1.sh @@ -0,0 +1,39 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean +# nnet_data=train_clean_small + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.1 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/librispeech/v0/global_conf/config_transducer_v3.2.sh b/egs/librispeech/v0/global_conf/config_transducer_v3.2.sh new file mode 100644 index 00000000..9185cc3f --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_transducer_v3.2.sh @@ -0,0 +1,39 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean +# nnet_data=train_clean_small + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.2 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0120.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name 
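A pattern worth noting in these `global_conf` scripts: variables such as `nnet_s1` and `nnet_s3` are assigned several times in a row. When the file is sourced, bash keeps only the last value, so the earlier assignments act as a record of checkpoints tried previously. A small hypothetical helper that mimics this last-wins resolution, to show which checkpoint a config currently selects:

```python
import re

def selected_checkpoints(cfg_text):
    """Return the final value of each nnet_sN variable in a sourced config."""
    sel = {}
    for m in re.finditer(r"^(nnet_s\d)=(\S+)", cfg_text, re.M):
        sel[m.group(1)] = m.group(2)  # later assignments overwrite earlier ones
    return sel

cfg = """nnet_s1=$nnet_s1_dir/model_ep0050.pth
nnet_s1=$nnet_s1_dir/model_ep0070.pth
nnet_s1=$nnet_s1_dir/model_ep0110.pth"""
print(selected_checkpoints(cfg))  # {'nnet_s1': '$nnet_s1_dir/model_ep0110.pth'}
```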
+nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/librispeech/v0/global_conf/config_transducer_v3.3.sh b/egs/librispeech/v0/global_conf/config_transducer_v3.3.sh new file mode 100644 index 00000000..490baba7 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_transducer_v3.3.sh @@ -0,0 +1,39 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean +# nnet_data=train_clean_small + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.3 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0120.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/librispeech/v0/global_conf/config_transducer_v3.sh b/egs/librispeech/v0/global_conf/config_transducer_v3.sh new file mode 100644 index 00000000..3871ee55 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_transducer_v3.sh @@ -0,0 +1,39 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean +# nnet_data=train_clean_small + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/librispeech/v0/global_conf/config_transducer_v4.3.sh b/egs/librispeech/v0/global_conf/config_transducer_v4.3.sh new file mode 100644 index 00000000..f51f1213 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_transducer_v4.3.sh @@ -0,0 +1,39 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_all +dev_data=dev_all +# 
nnet_data=train_clean_small + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v4.3 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0030.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/librispeech/v0/global_conf/config_transducer_v4.4.sh b/egs/librispeech/v0/global_conf/config_transducer_v4.4.sh new file mode 100644 index 00000000..d09c197b --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_transducer_v4.4.sh @@ -0,0 +1,41 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_all +dev_data=dev_all +# nnet_data=train_clean_small + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v4.4 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0030.pth +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0075.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/librispeech/v0/global_conf/config_transducer_v5.0.sh b/egs/librispeech/v0/global_conf/config_transducer_v5.0.sh new file mode 100644 index 00000000..2aaeed2b --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_transducer_v5.0.sh @@ -0,0 +1,33 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_transducer_stage1_v5.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v5.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0030.pth +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0075.pth +nnet_s1=$nnet_s1_dir/model_ep0106.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_transducer_v6.1.sh 
b/egs/librispeech/v0/global_conf/config_transducer_v6.1.sh new file mode 100644 index 00000000..f67b0a88 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_transducer_v6.1.sh @@ -0,0 +1,34 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_transducer_stage1_v6.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v6.1 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0030.pth +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0075.pth +nnet_s1=$nnet_s1_dir/model_ep0106.pth +nnet_s1=$nnet_s1_dir/model_ep0646.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_transducer_v7.1.sh b/egs/librispeech/v0/global_conf/config_transducer_v7.1.sh new file mode 100644 index 00000000..48f0d363 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_transducer_v7.1.sh @@ -0,0 +1,33 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_transducer_stage1_v7.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v7.1 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0056.pth +nnet_s1=$nnet_s1_dir/model_ep0068.pth +nnet_s1=$nnet_s1_dir/model_ep0090.pth +nnet_s1=$nnet_s1_dir/model_ep0094.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v1.2.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v1.2.sh new file mode 100644 index 00000000..a0e4f1a9 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v1.2.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v1.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v1.2 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0120.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 
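The `rnnt_k2_pruned` configs refer to k2's pruned RNN-T loss, which first scores a cheap "simple" joiner to locate a narrow band of useful (t, u) positions and then evaluates the full joiner only inside that band. A sketch of the two-pass call sequence, following the usage popularized by icefall; the exact keyword names vary across k2 versions, so treat this as an assumption rather than hyperion's literal code:

```python
import k2

def pruned_rnnt_loss(am, lm, joiner, symbols, boundary, blank_id=0, s_range=5):
    """am: (N, T, C) encoder projection, lm: (N, S+1, C) predictor projection,
    symbols: (N, S) padded targets, boundary: (N, 4) rows [0, 0, S, T]."""
    # Pass 1: cheap loss whose gradients indicate which (t, u) cells matter.
    simple_loss, (px_grad, py_grad) = k2.rnnt_loss_simple(
        lm=lm, am=am, symbols=symbols, termination_symbol=blank_id,
        boundary=boundary, reduction="sum", return_grad=True,
    )
    # Keep only a band of s_range symbols per frame.
    ranges = k2.get_rnnt_prune_ranges(
        px_grad=px_grad, py_grad=py_grad, boundary=boundary, s_range=s_range,
    )
    am_pruned, lm_pruned = k2.do_rnnt_pruning(am=am, lm=lm, ranges=ranges)
    # Pass 2: full joiner evaluated only inside the pruned band.
    logits = joiner(am_pruned, lm_pruned)  # (N, T, s_range, vocab)
    pruned_loss = k2.rnnt_loss_pruned(
        logits=logits, symbols=symbols, ranges=ranges,
        termination_symbol=blank_id, boundary=boundary, reduction="sum",
    )
    return simple_loss, pruned_loss
```

Pruning avoids materializing the full (N, T, S+1, vocab) joint tensor, which is what makes full-joiner RNN-T training memory-hungry.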
+nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.0.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.0.sh new file mode 100644 index 00000000..823f50b1 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.0.sh @@ -0,0 +1,30 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0115.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.2.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.2.sh new file mode 100644 index 00000000..16971bcc --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.2.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.2 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0110.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.3.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.3.sh new file mode 100644 index 00000000..d4b45852 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.3.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.3 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth 
+nnet_s1=$nnet_s1_dir/model_ep0110.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.1.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.1.sh new file mode 100644 index 00000000..3c98fc9b --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.1.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.4.1 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0110.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.2.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.2.sh new file mode 100644 index 00000000..187ad022 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.2.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.4.2 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0100.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.sh new file mode 100644 index 00000000..1538a7d1 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.yaml 
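Every recipe here shares `bpe_model=data/lang_bpe_1000/bpe.model`, a standard SentencePiece model mapping transcripts to the 1000-unit inventory the transducer predicts over. A quick way to inspect it, assuming the model file has already been built by the recipe's data-prep stage:

```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("data/lang_bpe_1000/bpe.model")

print(sp.get_piece_size())                        # vocabulary size, e.g. 1000
ids = sp.encode("THE QUICK BROWN FOX", out_type=int)
print(ids)                                        # token ids fed to the decoder
print(sp.decode(ids))                             # round-trips to the transcript
```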
+nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.4 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0104.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.5.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.5.sh new file mode 100644 index 00000000..0ce9fd99 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.5.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.5.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.5 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0110.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.6.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.6.sh new file mode 100644 index 00000000..81702305 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.6.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.6.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.6 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0110.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.7.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.7.sh new file mode 100644 index 00000000..83f7682d --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.7.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 
+dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.7.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.7 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0110.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.9.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.9.sh new file mode 100644 index 00000000..beb92d39 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.9.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.9.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.9 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0100.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_wav2vec2base_lstm_rnnt_k2_pruned_v1.2.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_lstm_rnnt_k2_pruned_v1.2.sh new file mode 100644 index 00000000..8e15e372 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_wav2vec2base_lstm_rnnt_k2_pruned_v1.2.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_lstm_rnnt_k2_pruned_stage1_v1.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_lstm_rnnt_k2_pruned.v1.2 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0120.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0105.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.0.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.0.sh new file mode 100644 index 00000000..1fd43d23 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.0.sh @@ -0,0 +1,31 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech 
+ 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0081.pth +nnet_s1=$nnet_s1_dir/model_ep0120.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.2.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.2.sh new file mode 100644 index 00000000..7cd22d2d --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.2.sh @@ -0,0 +1,30 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v1.2 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0110.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.3.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.3.sh new file mode 100644 index 00000000..18875086 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.3.sh @@ -0,0 +1,30 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v1.3 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0100.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_v1.0.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_v1.0.sh new file mode 100644 index 00000000..ed274e91 --- /dev/null +++ b/egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_v1.0.sh @@ -0,0 +1,36 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# 
x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2.v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0041.pth +nnet_s1=$nnet_s1_dir/model_ep0048.pth +nnet_s1=$nnet_s1_dir/model_ep0066.pth +nnet_s1=$nnet_s1_dir/model_ep0106.pth +# nnet_s1=$nnet_s1_dir/model_ep0075.pth +# nnet_s1=$nnet_s1_dir/model_ep0106.pth +# nnet_s1=$nnet_s1_dir/model_ep0646.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/voxceleb/adv.v1/hyp_utils b/egs/librispeech/v0/hyp_utils similarity index 100% rename from egs/voxceleb/adv.v1/hyp_utils rename to egs/librispeech/v0/hyp_utils diff --git a/egs/librispeech/v0/local/data_prep.sh b/egs/librispeech/v0/local/data_prep.sh new file mode 100755 index 00000000..cb446a12 --- /dev/null +++ b/egs/librispeech/v0/local/data_prep.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash + +# Copyright 2014 Vassil Panayotov +# 2014 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 <src-dir> <dst-dir>" + echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" + exit 1 +fi + +src=$1 +dst=$2 + +# all utterances are FLAC compressed +if ! which flac >&/dev/null; then + echo "Please install 'flac' on ALL worker nodes!" + exit 1 +fi + +spk_file=$src/../SPEAKERS.TXT + +mkdir -p $dst || exit 1 + +[ ! -d $src ] && echo "$0: no such directory $src" && exit 1 +[ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1 + + +wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp +trans=$dst/text; [[ -f "$trans" ]] && rm $trans +utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk +spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender + +for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do + reader=$(basename $reader_dir) + if ! [ $reader -eq $reader ]; then # not integer. + echo "$0: unexpected subdirectory name $reader" + exit 1 + fi + + reader_gender=$(egrep "^$reader[ ]+\|" $spk_file | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($2)}') + if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then + echo "Unexpected gender: '$reader_gender'" + exit 1 + fi + + for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do + chapter=$(basename $chapter_dir) + if ! [ "$chapter" -eq "$chapter" ]; then + echo "$0: unexpected chapter-subdirectory name $chapter" + exit 1 + fi + + find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ + awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac\n", $0, dir, $0}' >>$wav_scp || exit 1 + + chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt + [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 + cat $chapter_trans >>$trans + + # NOTE: For now we are using per-chapter utt2spk. That is each chapter is considered + # to be a different speaker. This is done for simplicity and because we want + # e.g.
the CMVN to be calculated per-chapter + awk -v "reader=$reader" -v "chapter=$chapter" '{printf "%s %s-%s\n", $1, reader, chapter}' \ + <$chapter_trans >>$utt2spk || exit 1 + + # reader -> gender map (again using per-chapter granularity) + echo "${reader}-${chapter} $reader_gender" >>$spk2gender + done +done + +spk2utt=$dst/spk2utt +utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1 +# utils/data/get_utt2dur.sh $dst +# awk 'sub(/ *$/, "", $0)' $dst/utt2dur > $dst/utt2dur2 +# mv $dst/utt2dur2 $dst/utt2dur + +ntrans=$(wc -l <$trans) +nutt2spk=$(wc -l <$utt2spk) +! [ "$ntrans" -eq "$nutt2spk" ] && \ + echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1 + +utils/validate_data_dir.sh --no-feats $dst || exit 1 + +echo "$0: successfully prepared data in $dst" + +exit 0 diff --git a/egs/librispeech/v0/local/download_lm.py b/egs/librispeech/v0/local/download_lm.py new file mode 100755 index 00000000..030122aa --- /dev/null +++ b/egs/librispeech/v0/local/download_lm.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This file downloads the following LibriSpeech LM files: + + - 3-gram.pruned.1e-7.arpa.gz + - 4-gram.arpa.gz + - librispeech-vocab.txt + - librispeech-lexicon.txt + - librispeech-lm-norm.txt.gz + +from http://www.openslr.org/resources/11 +and save them in the user provided directory. + +Files are not re-downloaded if they already exist. 
+ +Usage: + ./local/download_lm.py --out-dir ./download/lm +""" + +import argparse +import gzip +import logging +import os +import shutil +from pathlib import Path + +from lhotse.utils import urlretrieve_progress +from tqdm.auto import tqdm + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--out-dir", type=str, help="Output directory.") + + args = parser.parse_args() + return args + + +def main(out_dir: str): + url = "http://www.openslr.org/resources/11" + out_dir = Path(out_dir) + + files_to_download = ( + "3-gram.pruned.1e-7.arpa.gz", + "4-gram.arpa.gz", + "librispeech-vocab.txt", + "librispeech-lexicon.txt", + "librispeech-lm-norm.txt.gz", + ) + + for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"): + filename = out_dir / f + if filename.is_file() is False: + urlretrieve_progress( + f"{url}/{f}", + filename=filename, + desc=f"Downloading {filename}", + ) + else: + logging.info(f"{filename} already exists - skipping") + + if ".gz" in str(filename): + unzipped = Path(os.path.splitext(filename)[0]) + if unzipped.is_file() is False: + with gzip.open(filename, "rb") as f_in: + with open(unzipped, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + else: + logging.info(f"{unzipped} already exist - skipping") + + +if __name__ == "__main__": + formatter = ( + "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + ) + + logging.basicConfig(format=formatter, level=logging.INFO) + + args = get_args() + logging.info(f"out_dir: {args.out_dir}") + + main(out_dir=args.out_dir) diff --git a/egs/voxceleb/v1/local/make_musan.py b/egs/librispeech/v0/local/make_musan.py similarity index 100% rename from egs/voxceleb/v1/local/make_musan.py rename to egs/librispeech/v0/local/make_musan.py diff --git a/egs/voxceleb/v1/local/make_musan.sh b/egs/librispeech/v0/local/make_musan.sh similarity index 100% rename from egs/voxceleb/v1/local/make_musan.sh rename to egs/librispeech/v0/local/make_musan.sh diff --git a/egs/voxceleb/v1/local/make_rirs_data.sh b/egs/librispeech/v0/local/make_rirs_data.sh similarity index 100% rename from egs/voxceleb/v1/local/make_rirs_data.sh rename to egs/librispeech/v0/local/make_rirs_data.sh diff --git a/egs/librispeech/v0/local/prepare_lang.py b/egs/librispeech/v0/local/prepare_lang.py new file mode 100755 index 00000000..39d76146 --- /dev/null +++ b/egs/librispeech/v0/local/prepare_lang.py @@ -0,0 +1,410 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script takes as input a lexicon file "data/lang_phone/lexicon.txt" +consisting of words and tokens (i.e., phones) and does the following: + +1. Add disambiguation symbols to the lexicon and generate lexicon_disambig.txt + +2. Generate tokens.txt, the token table mapping a token to a unique integer. + +3. Generate words.txt, the word table mapping a word to a unique integer. + +4. Generate L.pt, in k2 format. 
It can be loaded by + + d = torch.load("L.pt") + lexicon = k2.Fsa.from_dict(d) + +5. Generate L_disambig.pt, in k2 format. +""" +import argparse +import math +from collections import defaultdict +from pathlib import Path +from typing import Any, Dict, List, Tuple + +import k2 +import torch + +from hyperion.utils.lexicon import read_lexicon, write_lexicon + +Lexicon = List[Tuple[str, List[str]]] + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lang-dir", + type=str, + help="""Input and output directory. + It should contain a file lexicon.txt. + Generated files by this script are saved into this directory. + """, + ) + + parser.add_argument( + "--debug", + default=False, + action="store_true", + help="""True for debugging, which will generate + a visualization of the lexicon FST. + + Caution: If your lexicon contains hundreds of thousands + of lines, please set it to False! + """, + ) + + return parser.parse_args() + + +def write_mapping(filename: str, sym2id: Dict[str, int]) -> None: + """Write a symbol to ID mapping to a file. + + Note: + No need to implement `read_mapping` as it can be done + through :func:`k2.SymbolTable.from_file`. + + Args: + filename: + Filename to save the mapping. + sym2id: + A dict mapping symbols to IDs. + Returns: + Return None. + """ + with open(filename, "w", encoding="utf-8") as f: + for sym, i in sym2id.items(): + f.write(f"{sym} {i}\n") + + +def get_tokens(lexicon: Lexicon) -> List[str]: + """Get tokens from a lexicon. + + Args: + lexicon: + It is the return value of :func:`read_lexicon`. + Returns: + Return a list of unique tokens. + """ + ans = set() + for _, tokens in lexicon: + ans.update(tokens) + sorted_ans = sorted(list(ans)) + return sorted_ans + + +def get_words(lexicon: Lexicon) -> List[str]: + """Get words from a lexicon. + + Args: + lexicon: + It is the return value of :func:`read_lexicon`. + Returns: + Return a list of unique words. + """ + ans = set() + for word, _ in lexicon: + ans.add(word) + sorted_ans = sorted(list(ans)) + return sorted_ans + + +def add_disambig_symbols(lexicon: Lexicon) -> Tuple[Lexicon, int]: + """It adds pseudo-token disambiguation symbols #1, #2 and so on + at the ends of tokens to ensure that all pronunciations are different, + and that none is a prefix of another. + + See also add_lex_disambig.pl from kaldi. + + Args: + lexicon: + It is returned by :func:`read_lexicon`. + Returns: + Return a tuple with two elements: + + - The output lexicon with disambiguation symbols + - The ID of the max disambiguation symbol that appears + in the lexicon + """ + + # (1) Work out the count of each token-sequence in the + # lexicon. + count = defaultdict(int) + for _, tokens in lexicon: + count[" ".join(tokens)] += 1 + + # (2) For each left sub-sequence of each token-sequence, note down + # that it exists (for identifying prefixes of longer strings). + issubseq = defaultdict(int) + for _, tokens in lexicon: + tokens = tokens.copy() + tokens.pop() + while tokens: + issubseq[" ".join(tokens)] = 1 + tokens.pop() + + # (3) For each entry in the lexicon: + # if the token sequence is unique and is not a + # prefix of another word, no disambig symbol. + # Else output #1, or #2, #3, ... if the same token-seq + # has already been assigned a disambig symbol. 
+    ans = []
+
+    # We start with #1 since #0 has its own purpose
+    first_allowed_disambig = 1
+    max_disambig = first_allowed_disambig - 1
+    last_used_disambig_symbol_of = defaultdict(int)
+
+    for word, tokens in lexicon:
+        tokenseq = " ".join(tokens)
+        assert tokenseq != ""
+        if issubseq[tokenseq] == 0 and count[tokenseq] == 1:
+            ans.append((word, tokens))
+            continue
+
+        cur_disambig = last_used_disambig_symbol_of[tokenseq]
+        if cur_disambig == 0:
+            cur_disambig = first_allowed_disambig
+        else:
+            cur_disambig += 1
+
+        if cur_disambig > max_disambig:
+            max_disambig = cur_disambig
+        last_used_disambig_symbol_of[tokenseq] = cur_disambig
+        tokenseq += f" #{cur_disambig}"
+        ans.append((word, tokenseq.split()))
+    return ans, max_disambig
+
+
+def generate_id_map(symbols: List[str]) -> Dict[str, int]:
+    """Generate ID maps, i.e., map a symbol to a unique ID.
+
+    Args:
+      symbols:
+        A list of unique symbols.
+    Returns:
+      A dict containing the mapping between symbols and IDs.
+    """
+    return {sym: i for i, sym in enumerate(symbols)}
+
+
+def add_self_loops(arcs: List[List[Any]], disambig_token: int,
+                   disambig_word: int) -> List[List[Any]]:
+    """Adds self-loops to states of an FST to propagate disambiguation symbols
+    through it. They are added on each state with non-epsilon output symbols
+    on at least one arc out of the state.
+
+    See also fstaddselfloops.pl from Kaldi. One difference is that
+    Kaldi uses OpenFst style FSTs and it has multiple final states.
+    This function uses k2 style FSTs and it does not need to add self-loops
+    to the final state.
+
+    The input label of a self-loop is `disambig_token`, while the output
+    label is `disambig_word`.
+
+    Args:
+      arcs:
+        A list-of-list. The sublist contains
+        `[src_state, dest_state, label, aux_label, score]`
+      disambig_token:
+        It is the token ID of the symbol `#0`.
+      disambig_word:
+        It is the word ID of the symbol `#0`.
+
+    Return:
+      Return new `arcs` containing self-loops.
+    """
+    states_needs_self_loops = set()
+    for arc in arcs:
+        src, dst, ilabel, olabel, score = arc
+        if olabel != 0:
+            states_needs_self_loops.add(src)
+
+    ans = []
+    for s in states_needs_self_loops:
+        ans.append([s, s, disambig_token, disambig_word, 0])
+
+    return arcs + ans
+
+
+def lexicon_to_fst(
+    lexicon: Lexicon,
+    token2id: Dict[str, int],
+    word2id: Dict[str, int],
+    sil_token: str = "SIL",
+    sil_prob: float = 0.5,
+    need_self_loops: bool = False,
+) -> k2.Fsa:
+    """Convert a lexicon to an FST (in k2 format) with optional silence at
+    the beginning and end of each word.
+
+    Args:
+      lexicon:
+        The input lexicon. See also :func:`read_lexicon`
+      token2id:
+        A dict mapping tokens to IDs.
+      word2id:
+        A dict mapping words to IDs.
+      sil_token:
+        The silence token.
+      sil_prob:
+        The probability for adding a silence at the beginning and end
+        of the word.
+      need_self_loops:
+        If True, add self-loop to states with non-epsilon output symbols
+        on at least one arc out of the state. The input label for this
+        self loop is `token2id["#0"]` and the output label is `word2id["#0"]`.
+    Returns:
+      Return an instance of `k2.Fsa` representing the given lexicon.
+    """
+    assert sil_prob > 0.0 and sil_prob < 1.0
+    # CAUTION: we use score, i.e., negative cost.
+    sil_score = math.log(sil_prob)
+    no_sil_score = math.log(1.0 - sil_prob)
+
+    start_state = 0
+    loop_state = 1  # words enter and leave from here
+    sil_state = 2   # words terminate here when followed by silence; this state
+                    # has a silence transition to loop_state.
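+    # Topology sketch of the FST built below (labels are token:word pairs;
+    # scores omitted; this comment only summarizes the code that follows):
+    #   start --eps:eps--> loop_state                 [score log(1 - sil_prob)]
+    #   start --eps:eps--> sil_state                  [score log(sil_prob)]
+    #   sil_state --SIL:eps--> loop_state
+    #   per word: loop_state --tok1:word--> ... --tokN:eps--> loop_state/sil_state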
+    next_state = 3  # the next un-allocated state, will be incremented as we go.
+    arcs = []
+
+    assert token2id["<eps>"] == 0
+    assert word2id["<eps>"] == 0
+
+    eps = 0
+
+    sil_token = token2id[sil_token]
+
+    arcs.append([start_state, loop_state, eps, eps, no_sil_score])
+    arcs.append([start_state, sil_state, eps, eps, sil_score])
+    arcs.append([sil_state, loop_state, sil_token, eps, 0])
+
+    for word, tokens in lexicon:
+        assert len(tokens) > 0, f"{word} has no pronunciations"
+        cur_state = loop_state
+
+        word = word2id[word]
+        tokens = [token2id[i] for i in tokens]
+
+        for i in range(len(tokens) - 1):
+            w = word if i == 0 else eps
+            arcs.append([cur_state, next_state, tokens[i], w, 0])
+
+            cur_state = next_state
+            next_state += 1
+
+        # now for the last token of this word
+        # It has two out-going arcs, one to the loop state,
+        # the other one to the sil_state.
+        i = len(tokens) - 1
+        w = word if i == 0 else eps
+        arcs.append([cur_state, loop_state, tokens[i], w, no_sil_score])
+        arcs.append([cur_state, sil_state, tokens[i], w, sil_score])
+
+    if need_self_loops:
+        disambig_token = token2id["#0"]
+        disambig_word = word2id["#0"]
+        arcs = add_self_loops(
+            arcs,
+            disambig_token=disambig_token,
+            disambig_word=disambig_word,
+        )
+
+    final_state = next_state
+    arcs.append([loop_state, final_state, -1, -1, 0])
+    arcs.append([final_state])
+
+    arcs = sorted(arcs, key=lambda arc: arc[0])
+    arcs = [[str(i) for i in arc] for arc in arcs]
+    arcs = [" ".join(arc) for arc in arcs]
+    arcs = "\n".join(arcs)
+
+    fsa = k2.Fsa.from_str(arcs, acceptor=False)
+    return fsa
+
+
+def main():
+    args = get_args()
+    lang_dir = Path(args.lang_dir)
+    lexicon_filename = lang_dir / "lexicon.txt"
+    sil_token = "SIL"
+    sil_prob = 0.5
+
+    lexicon = read_lexicon(lexicon_filename)
+    tokens = get_tokens(lexicon)
+    words = get_words(lexicon)
+
+    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
+
+    for i in range(max_disambig + 1):
+        disambig = f"#{i}"
+        assert disambig not in tokens
+        tokens.append(f"#{i}")
+
+    assert "<eps>" not in tokens
+    tokens = ["<eps>"] + tokens
+
+    assert "<eps>" not in words
+    assert "#0" not in words
+    assert "<s>" not in words
+    assert "</s>" not in words
+
+    words = ["<eps>"] + words + ["#0", "<s>", "</s>"]
+
+    token2id = generate_id_map(tokens)
+    word2id = generate_id_map(words)
+
+    write_mapping(lang_dir / "tokens.txt", token2id)
+    write_mapping(lang_dir / "words.txt", word2id)
+    write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)
+
+    L = lexicon_to_fst(
+        lexicon,
+        token2id=token2id,
+        word2id=word2id,
+        sil_token=sil_token,
+        sil_prob=sil_prob,
+    )
+
+    L_disambig = lexicon_to_fst(
+        lexicon_disambig,
+        token2id=token2id,
+        word2id=word2id,
+        sil_token=sil_token,
+        sil_prob=sil_prob,
+        need_self_loops=True,
+    )
+    torch.save(L.as_dict(), lang_dir / "L.pt")
+    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")
+
+    if args.debug:
+        labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
+        aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")
+
+        L.labels_sym = labels_sym
+        L.aux_labels_sym = aux_labels_sym
+        L.draw(f"{lang_dir / 'L.svg'}", title="L.pt")
+
+        L_disambig.labels_sym = labels_sym
+        L_disambig.aux_labels_sym = aux_labels_sym
+        L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}",
+                        title="L_disambig.pt")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/librispeech/v0/local/prepare_lang_bpe.py b/egs/librispeech/v0/local/prepare_lang_bpe.py
new file mode 100755
index 00000000..7838b6a0
--- /dev/null
+++ b/egs/librispeech/v0/local/prepare_lang_bpe.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
+"""
+
+This script takes as input `lang_dir`, which should contain::
+
+    - lang_dir/bpe.model,
+    - lang_dir/words.txt
+
+and generates the following files in the directory `lang_dir`:
+
+    - lexicon.txt
+    - lexicon_disambig.txt
+    - L.pt
+    - L_disambig.pt
+    - tokens.txt
+"""
+
+import argparse
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import k2
+import sentencepiece as spm
+import torch
+from prepare_lang import (
+    Lexicon,
+    add_disambig_symbols,
+    add_self_loops,
+    write_lexicon,
+    write_mapping,
+)
+
+
+def lexicon_to_fst_no_sil(
+    lexicon: Lexicon,
+    token2id: Dict[str, int],
+    word2id: Dict[str, int],
+    need_self_loops: bool = False,
+) -> k2.Fsa:
+    """Convert a lexicon to an FST (in k2 format).
+
+    Args:
+      lexicon:
+        The input lexicon. See also :func:`read_lexicon`
+      token2id:
+        A dict mapping tokens to IDs.
+      word2id:
+        A dict mapping words to IDs.
+      need_self_loops:
+        If True, add self-loop to states with non-epsilon output symbols
+        on at least one arc out of the state. The input label for this
+        self loop is `token2id["#0"]` and the output label is `word2id["#0"]`.
+    Returns:
+      Return an instance of `k2.Fsa` representing the given lexicon.
+    """
+    loop_state = 0  # words enter and leave from here
+    next_state = 1  # the next un-allocated state, will be incremented as we go
+
+    arcs = []
+
+    # The blank symbol <blk> is defined in local/train_bpe_model.py
+    assert token2id["<blk>"] == 0
+    assert word2id["<eps>"] == 0
+
+    eps = 0
+
+    for word, pieces in lexicon:
+        assert len(pieces) > 0, f"{word} has no pronunciations"
+        cur_state = loop_state
+
+        word = word2id[word]
+        pieces = [token2id[i] for i in pieces]
+
+        for i in range(len(pieces) - 1):
+            w = word if i == 0 else eps
+            arcs.append([cur_state, next_state, pieces[i], w, 0])
+
+            cur_state = next_state
+            next_state += 1
+
+        # now for the last piece of this word
+        i = len(pieces) - 1
+        w = word if i == 0 else eps
+        arcs.append([cur_state, loop_state, pieces[i], w, 0])
+
+    if need_self_loops:
+        disambig_token = token2id["#0"]
+        disambig_word = word2id["#0"]
+        arcs = add_self_loops(
+            arcs,
+            disambig_token=disambig_token,
+            disambig_word=disambig_word,
+        )
+
+    final_state = next_state
+    arcs.append([loop_state, final_state, -1, -1, 0])
+    arcs.append([final_state])
+
+    arcs = sorted(arcs, key=lambda arc: arc[0])
+    arcs = [[str(i) for i in arc] for arc in arcs]
+    arcs = [" ".join(arc) for arc in arcs]
+    arcs = "\n".join(arcs)
+
+    fsa = k2.Fsa.from_str(arcs, acceptor=False)
+    return fsa
+
+
+def generate_lexicon(model_file: str,
+                     words: List[str]) -> Tuple[Lexicon, Dict[str, int]]:
+    """Generate a lexicon from a BPE model.
+
+    Args:
+      model_file:
+        Path to a sentencepiece model.
+      words:
+        A list of strings representing words.
+    Returns:
+      Return a tuple with two elements:
+        - A dict whose keys are words and values are the corresponding
+          word pieces.
+        - A dict representing the token symbol, mapping from tokens to IDs.
+    """
+    sp = spm.SentencePieceProcessor()
+    sp.load(str(model_file))
+
+    # Convert word to word piece IDs instead of word piece strings
+    # to avoid OOV tokens.
+    words_pieces_ids: List[List[int]] = sp.encode(words, out_type=int)
+
+    # Now convert word piece IDs back to word piece strings.
+    words_pieces: List[List[str]] = [
+        sp.id_to_piece(ids) for ids in words_pieces_ids
+    ]
+
+    lexicon = []
+    for word, pieces in zip(words, words_pieces):
+        lexicon.append((word, pieces))
+
+    # The OOV word is <UNK>
+    lexicon.append(("<UNK>", [sp.id_to_piece(sp.unk_id())]))
+
+    token2id: Dict[str, int] = dict()
+    for i in range(sp.vocab_size()):
+        token2id[sp.id_to_piece(i)] = i
+
+    return lexicon, token2id
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        It should contain the bpe.model and words.txt
+        """,
+    )
+
+    parser.add_argument(
+        "--debug",
+        default=False,
+        action="store_true",
+        help="""True for debugging, which will generate
+        a visualization of the lexicon FST.
+
+        Caution: If your lexicon contains hundreds of thousands
+        of lines, please set it to False!
+
+        See "test/test_bpe_lexicon.py" for usage.
+        """,
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    lang_dir = Path(args.lang_dir)
+    model_file = lang_dir / "bpe.model"
+
+    word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
+
+    words = word_sym_table.symbols
+
+    excluded = [
+        "<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>", "#0", "<s>", "</s>"
+    ]
+    for w in excluded:
+        if w in words:
+            words.remove(w)
+
+    lexicon, token_sym_table = generate_lexicon(model_file, words)
+
+    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
+
+    next_token_id = max(token_sym_table.values()) + 1
+    for i in range(max_disambig + 1):
+        disambig = f"#{i}"
+        assert disambig not in token_sym_table
+        token_sym_table[disambig] = next_token_id
+        next_token_id += 1
+
+    word_sym_table.add("#0")
+    word_sym_table.add("<s>")
+    word_sym_table.add("</s>")
+
+    write_mapping(lang_dir / "tokens.txt", token_sym_table)
+
+    write_lexicon(lang_dir / "lexicon.txt", lexicon)
+    write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)
+
+    L = lexicon_to_fst_no_sil(
+        lexicon,
+        token2id=token_sym_table,
+        word2id=word_sym_table,
+    )
+
+    L_disambig = lexicon_to_fst_no_sil(
+        lexicon_disambig,
+        token2id=token_sym_table,
+        word2id=word_sym_table,
+        need_self_loops=True,
+    )
+    torch.save(L.as_dict(), lang_dir / "L.pt")
+    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")
+
+    if args.debug:
+        labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
+        aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")
+
+        L.labels_sym = labels_sym
+        L.aux_labels_sym = aux_labels_sym
+        L.draw(f"{lang_dir / 'L.svg'}", title="L.pt")
+
+        L_disambig.labels_sym = labels_sym
+        L_disambig.aux_labels_sym = aux_labels_sym
+        L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}",
+                        title="L_disambig.pt")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/librispeech/v0/local/train_bpe_model.py b/egs/librispeech/v0/local/train_bpe_model.py
new file mode 100755
index 00000000..42aba957
--- /dev/null
+++ b/egs/librispeech/v0/local/train_bpe_model.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# You can install sentencepiece via:
+#
+#   pip install sentencepiece
+#
+# Due to an issue reported in
+# https://github.com/google/sentencepiece/pull/642#issuecomment-857972030
+#
+# Please install a version >=0.1.96
+
+import argparse
+import shutil
+from pathlib import Path
+
+import sentencepiece as spm
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        The generated bpe.model is saved to this directory.
+        """,
+    )
+
+    parser.add_argument(
+        "--transcript",
+        type=str,
+        help="Training transcript.",
+    )
+
+    parser.add_argument(
+        "--vocab-size",
+        type=int,
+        help="Vocabulary size for BPE training",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    vocab_size = args.vocab_size
+    lang_dir = Path(args.lang_dir)
+
+    model_type = "unigram"
+
+    model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
+    train_text = args.transcript
+    character_coverage = 1.0
+    input_sentence_size = 100000000
+
+    user_defined_symbols = ["<blk>", "<sos/eos>"]
+    unk_id = len(user_defined_symbols)
+    # Note: unk_id is fixed to 2.
+    # If you change it, you should also change other
+    # places that are using it.
+
+    model_file = Path(model_prefix + ".model")
+    if not model_file.is_file():
+        spm.SentencePieceTrainer.train(
+            input=train_text,
+            vocab_size=vocab_size,
+            model_type=model_type,
+            model_prefix=model_prefix,
+            input_sentence_size=input_sentence_size,
+            character_coverage=character_coverage,
+            user_defined_symbols=user_defined_symbols,
+            unk_id=unk_id,
+            bos_id=-1,
+            eos_id=-1,
+        )
+
+    shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/librispeech/v0/local/validate_bpe_lexicon.py b/egs/librispeech/v0/local/validate_bpe_lexicon.py
new file mode 100755
index 00000000..36962933
--- /dev/null
+++ b/egs/librispeech/v0/local/validate_bpe_lexicon.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This script checks that there are no OOV tokens in the BPE-based lexicon.
+
+Usage example:
+
+    python3 ./local/validate_bpe_lexicon.py \
+            --lexicon /path/to/lexicon.txt \
+            --bpe-model /path/to/bpe.model
+"""
+
+import argparse
+from pathlib import Path
+from typing import List, Tuple
+
+import sentencepiece as spm
+
+from hyperion.utils.lexicon import read_lexicon
+
+# Map word to word pieces
+Lexicon = List[Tuple[str, List[str]]]
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--lexicon",
+        required=True,
+        type=Path,
+        help="Path to lexicon.txt",
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        required=True,
+        type=Path,
+        help="Path to bpe.model",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    assert args.lexicon.is_file(), args.lexicon
+    assert args.bpe_model.is_file(), args.bpe_model
+
+    lexicon = read_lexicon(args.lexicon)
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(str(args.bpe_model))
+
+    word_pieces = set(sp.id_to_piece(list(range(sp.vocab_size()))))
+    for word, pieces in lexicon:
+        for p in pieces:
+            if p not in word_pieces:
+                raise ValueError(f"The word {word} contains an OOV token {p}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/voxceleb/vae.v1/path.sh b/egs/librispeech/v0/path.sh
similarity index 100%
rename from egs/voxceleb/vae.v1/path.sh
rename to egs/librispeech/v0/path.sh
diff --git a/egs/librispeech/v0/run_001_prepare_data.sh b/egs/librispeech/v0/run_001_prepare_data.sh
new file mode 100755
index 00000000..0708e667
--- /dev/null
+++ b/egs/librispeech/v0/run_001_prepare_data.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Copyright
+#                2018   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+config_file=default_config.sh
+
+. parse_options.sh || exit 1;
+. ./datapath.sh
+
+
+nj=6
+
+mkdir -p data
+
+
+if [ ${stage} -le 1 ]; then
+    ### Task dependent. You have to do the following data preparation yourself,
+    ### but in most cases you can utilize Kaldi recipes.
+    echo "Stage 1: Data preparation"
+    for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500
+    do
+        # use underscore-separated names in data directories.
+        local/data_prep.sh ${librispeech_root}/${part} data/${part//-/_}
+        steps_xvec/audio_to_duration.sh --cmd "$train_cmd" data/${part//-/_}
+    done
+fi
+
+# if [ $stage -le 1 ]; then
+#   echo "Stage 1: Prepare LibriSpeech manifest"
+#   # We assume that you have downloaded the LibriSpeech corpus
+#   # to $librispeech_root
+#   mkdir -p data/manifests
+#   if [ ! -e data/manifests/.librispeech.done ]; then
+#     lhotse prepare librispeech -j $nj $librispeech_root data/manifests
+#     touch data/manifests/.librispeech.done
+#   fi
+# fi
+
+# if [ $stage -le 2 ]; then
+#   echo "Stage 2: Prepare musan manifest"
+#   # We assume that you have downloaded the musan corpus
+#   # to $musan_root
+#   mkdir -p data/manifests
+#   if [ ! -e data/manifests/.musan.done ]; then
+#     lhotse prepare musan $musan_root data/manifests
+#     touch data/manifests/.musan.done
+#   fi
+# fi
diff --git a/egs/librispeech/v0/run_003_prepare_noises_rirs.sh b/egs/librispeech/v0/run_003_prepare_noises_rirs.sh
new file mode 100755
index 00000000..6bdcb4f2
--- /dev/null
+++ b/egs/librispeech/v0/run_003_prepare_noises_rirs.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+# Copyright
+#                2020   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+config_file=default_config.sh
+. parse_options.sh || exit 1;
+. $config_file
+. datapath.sh
+
+# We prepare the noise files and RIR for online speech augmentation
+
+if [ $stage -le 1 ]; then
+
+    # Prepare the MUSAN corpus, which consists of music, speech, and noise
+    # suitable for augmentation.
+    local/make_musan.sh $musan_root 16 data
+
+    for name in musan_noise musan_music
+    do
+        steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \
+            --storage_name librispeech-v1-$(date +'%m_%d_%H_%M') \
+            data/${name} data/${name}_proc_audio exp/${name}_proc_audio
+        utils/fix_data_dir.sh data/${name}_proc_audio
+    done
+
+fi
+
+if [ $stage -le 2 ]; then
+
+    # Create Babble noise from MUSAN speech files
+    for name in musan_speech
+    do
+        steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \
+            --storage_name librispeech-v1-$(date +'%m_%d_%H_%M') \
+            data/${name} data/${name}_babble exp/${name}_babble
+        # utils/fix_data_dir.sh data/${name}_babble
+    done
+fi
+
+if [ $stage -le 3 ]; then
+  if [ ! -d "RIRS_NOISES" ]; then
+    if [ -d ../../sre19-cmn2/v1/RIRS_NOISES ];then
+      ln -s ../../sre19-cmn2/v1/RIRS_NOISES
+    else
+      # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
+      wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
+      unzip rirs_noises.zip
+    fi
+  fi
+  local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom
+  local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 16 data/rirs_mediumroom
+  local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 16 data/rirs_real
+  for rirs in rirs_smallroom rirs_mediumroom rirs_real
+  do
+    #pack all rirs in h5 files
+    steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs
+  done
+
+fi
+
+
diff --git a/egs/librispeech/v0/run_004_compute_bpe.sh b/egs/librispeech/v0/run_004_compute_bpe.sh
new file mode 100755
index 00000000..0bfeacb9
--- /dev/null
+++ b/egs/librispeech/v0/run_004_compute_bpe.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+# Copyright
+#                2018   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+vocab_sizes=(
+  # 5000
+  2000
+  1000
+  500
+)
+
+dl_dir=$PWD/download
+
+stage=1
+config_file=default_config.sh
+
+. parse_options.sh || exit 1;
+. ./datapath.sh
+. $config_file
+
+
+if [ $stage -le 1 ]; then
+  echo "Stage 1: Download LM"
+  mkdir -p $dl_dir/lm
+  if [ ! -e $dl_dir/lm/.done ]; then
+    ./local/download_lm.py --out-dir=$dl_dir/lm
+    touch $dl_dir/lm/.done
+  fi
+fi
+
+if [ $stage -le 2 ]; then
+  echo "Stage 2: Prepare phone based lang"
+  lang_dir=data/lang_phone
+  mkdir -p $lang_dir
+
+  (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
+    cat - $dl_dir/lm/librispeech-lexicon.txt |
+    sort | uniq > $lang_dir/lexicon.txt
+
+  if [ ! -f $lang_dir/L_disambig.pt ]; then
+    ./local/prepare_lang.py --lang-dir $lang_dir
+  fi
+fi
+
+
+if [ $stage -le 3 ]; then
+  echo "Stage 3: Prepare BPE based lang"
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=data/lang_bpe_${vocab_size}
+    mkdir -p $lang_dir
+    # We reuse words.txt from phone based lexicon
+    # so that the two can share G.pt later.
+    cp data/lang_phone/words.txt $lang_dir
+
+    if [ ! -f $lang_dir/transcript_words.txt ]; then
+      echo "Generate data for BPE training"
+      files=$(
+        find "$librispeech_root/train-clean-100" -name "*.trans.txt"
+        find "$librispeech_root/train-clean-360" -name "*.trans.txt"
+        find "$librispeech_root/train-other-500" -name "*.trans.txt"
+      )
+      for f in ${files[@]}; do
+        cat $f | cut -d " " -f 2-
+      done > $lang_dir/transcript_words.txt
+    fi
+
+    if [ ! -f $lang_dir/bpe.model ]; then
+      ./local/train_bpe_model.py \
+        --lang-dir $lang_dir \
+        --vocab-size $vocab_size \
+        --transcript $lang_dir/transcript_words.txt
+    fi
+
+    if [ ! -f $lang_dir/L_disambig.pt ]; then
+      ./local/prepare_lang_bpe.py --lang-dir $lang_dir
+
+      echo "Validating $lang_dir/lexicon.txt"
+      ./local/validate_bpe_lexicon.py \
+        --lexicon $lang_dir/lexicon.txt \
+        --bpe-model $lang_dir/bpe.model
+    fi
+  done
+fi
diff --git a/egs/librispeech/v0/run_011_train_asr.sh b/egs/librispeech/v0/run_011_train_asr.sh
new file mode 100755
index 00000000..81ebbeae
--- /dev/null
+++ b/egs/librispeech/v0/run_011_train_asr.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+# Copyright
+#                2022   Johns Hopkins University (Author: Yen-Ju Lu)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+ngpu=2
+config_file=default_config.sh
+interactive=false
+num_workers=""
+use_tb=false
+use_wandb=false
+
+. parse_options.sh || exit 1;
+. $config_file
+. datapath.sh
+
+train_dir=data/${nnet_data}/
+val_dir=data/${dev_data}/
+
+#add extra args from the command line arguments
+if [ -n "$num_workers" ];then
+    extra_args="--data.train.data_loader.num-workers $num_workers"
+    extra_args="$extra_args --data.val.data_loader.num-workers $num_workers"
+fi
+if [ "$use_tb" == "true" ];then
+    extra_args="$extra_args --trainer.use-tensorboard"
+fi
+
+if [ "$interactive" == "true" ];then
+    export cuda_cmd=run.pl
+fi
+
+if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)"
+fi
+
+
+# Network Training
+if [ $stage -le 1 ]; then
+
+  mkdir -p $nnet_s1_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s1_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \
+    train_wav2vec2rnn_transducer.py $nnet_type \
+    --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \
+    --data.train.dataset.recordings-file $train_dir/wav.scp \
+    --data.train.dataset.segments-file $train_dir/utt2spk \
+    --data.train.dataset.bpe-model $bpe_model \
+    --data.train.dataset.text-file $train_dir/text \
+    --data.val.dataset.recordings-file $val_dir/wav.scp \
+    --data.val.dataset.segments-file $val_dir/utt2spk \
+    --data.val.dataset.text-file $val_dir/text \
+    --trainer.exp-path $nnet_s1_dir $args \
+    --data.train.dataset.time-durs-file $train_dir/utt2dur \
+    --data.val.dataset.time-durs-file $val_dir/utt2dur \
+    --num-gpus $ngpu
+
+fi
+
+if [ $stage -le 2 ]; then
+
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)"
+  fi
+
+  mkdir -p $nnet_s2_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s2_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    finetune_wav2vec2transducer.py $nnet_type \
+    --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \
+    --data.train.dataset.recordings-file $train_dir/wav.scp \
+    --data.train.dataset.segments-file $train_dir/utt2spk \
+    --data.train.dataset.bpe-model $bpe_model \
+    --data.train.dataset.text-file $train_dir/text \
+    --data.val.dataset.recordings-file $val_dir/wav.scp \
+    --data.val.dataset.segments-file $val_dir/utt2spk \
+    --data.val.dataset.text-file $val_dir/text \
+    --trainer.exp-path $nnet_s2_dir $args \
+    --in-model-file $nnet_s1 \
+    --data.train.dataset.time-durs-file $train_dir/utt2dur \
+    --data.val.dataset.time-durs-file $val_dir/utt2dur \
+    --num-gpus $ngpu
+
+fi
+
+if [ $stage -le 3 ]; then
+
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)"
+  fi
+
+
+  mkdir -p $nnet_s3_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s3_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    finetune_wav2vec2transducer.py $nnet_type \
+    --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \
+    --data.train.dataset.recordings-file $train_dir/wav.scp \
+    --data.train.dataset.segments-file $train_dir/utt2spk \
+    --data.train.dataset.bpe-model $bpe_model \
+    --data.train.dataset.text-file $train_dir/text \
+    --data.val.dataset.recordings-file $val_dir/wav.scp \
+    --data.val.dataset.segments-file $val_dir/utt2spk \
+    --data.val.dataset.text-file $val_dir/text \
+    --trainer.exp-path $nnet_s3_dir $args \
+    --in-model-file $nnet_s2 \
+    --data.train.dataset.time-durs-file $train_dir/utt2dur \
+    --data.val.dataset.time-durs-file $val_dir/utt2dur \
+    --num-gpus $ngpu
+fi
+
diff --git a/egs/librispeech/v0/run_011_train_asr_old.sh b/egs/librispeech/v0/run_011_train_asr_old.sh
new file mode 100755
index 00000000..3c9f4f5b
--- /dev/null
+++ b/egs/librispeech/v0/run_011_train_asr_old.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+# Copyright
+#                2022   Johns Hopkins University (Author: Yen-Ju Lu)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+ngpu=2
+config_file=default_config.sh
+interactive=false
+num_workers=""
+use_tb=false
+use_wandb=false
+
+. parse_options.sh || exit 1;
+. $config_file
+. datapath.sh
+
+train_dir=data/${nnet_data}/
+val_dir=data/${dev_data}/
+
+#add extra args from the command line arguments
+if [ -n "$num_workers" ];then
+    extra_args="--data.train.data_loader.num-workers $num_workers"
+    extra_args="$extra_args --data.val.data_loader.num-workers $num_workers"
+fi
+if [ "$use_tb" == "true" ];then
+    extra_args="$extra_args --trainer.use-tensorboard"
+fi
+
+if [ "$interactive" == "true" ];then
+    export cuda_cmd=run.pl
+fi
+
+if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)"
+fi
+
+
+# Network Training
+if [ $stage -le 1 ]; then
+
+  mkdir -p $nnet_s1_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s1_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \
+    train_wav2vec2transducer.py $nnet_type \
+    --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \
+    --data.train.dataset.recordings-file $train_dir/wav.scp \
+    --data.train.dataset.segments-file $train_dir/utt2spk \
+    --data.train.dataset.bpe-model $bpe_model \
+    --data.train.dataset.text-file $train_dir/text \
+    --data.val.dataset.recordings-file $val_dir/wav.scp \
+    --data.val.dataset.segments-file $val_dir/utt2spk \
+    --data.val.dataset.text-file $val_dir/text \
+    --trainer.exp-path $nnet_s1_dir $args \
+    --data.train.dataset.time-durs-file $train_dir/utt2dur \
+    --data.val.dataset.time-durs-file $val_dir/utt2dur \
+    --num-gpus $ngpu
+
+fi
+
+if [ $stage -le 2 ]; then
+
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)"
+  fi
+
+  mkdir -p $nnet_s2_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s2_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    finetune_wav2vec2transducer.py $nnet_type \
+    --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \
+    --data.train.dataset.recordings-file $train_dir/wav.scp \
+    --data.train.dataset.segments-file $train_dir/utt2spk \
+    --data.train.dataset.bpe-model $bpe_model \
+    --data.train.dataset.text-file $train_dir/text \
+    --data.val.dataset.recordings-file $val_dir/wav.scp \
+    --data.val.dataset.segments-file $val_dir/utt2spk \
+    --data.val.dataset.text-file $val_dir/text \
+    --trainer.exp-path $nnet_s2_dir $args \
+    --in-model-file $nnet_s1 \
+    --data.train.dataset.time-durs-file $train_dir/utt2dur \
+    --data.val.dataset.time-durs-file $val_dir/utt2dur \
+    --num-gpus $ngpu
+
+fi
+
+if [ $stage -le 3 ]; then
+
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)"
+  fi
+
+
+  mkdir -p $nnet_s3_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s3_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    finetune_wav2vec2transducer.py $nnet_type \
+    --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \
+    --data.train.dataset.recordings-file $train_dir/wav.scp \
+    --data.train.dataset.segments-file $train_dir/utt2spk \
+    --data.train.dataset.bpe-model $bpe_model \
+    --data.train.dataset.text-file $train_dir/text \
+    --data.val.dataset.recordings-file $val_dir/wav.scp \
+    --data.val.dataset.segments-file $val_dir/utt2spk \
+    --data.val.dataset.text-file $val_dir/text \
+    --trainer.exp-path $nnet_s3_dir $args \
+    --in-model-file $nnet_s2 \
+    --data.train.dataset.time-durs-file $train_dir/utt2dur \
+    --data.val.dataset.time-durs-file $val_dir/utt2dur \
+    --num-gpus $ngpu
+fi
+
diff --git a/egs/librispeech/v0/run_030_inference.sh b/egs/librispeech/v0/run_030_inference.sh
new file mode 100755
index 00000000..7ed9567a
--- /dev/null
+++ b/egs/librispeech/v0/run_030_inference.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# Copyright
+#                2022   Johns Hopkins University (Author: Yen-Ju Lu)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+config_file=default_config.sh
+use_gpu=false
+nnet_stage=1
+. parse_options.sh || exit 1;
+. $config_file
+
+if [ "$use_gpu" == "true" ];then
+    transducer_args="--use-gpu true"
+    transducer_cmd="$cuda_eval_cmd --mem 6G"
+else
+    transducer_cmd="$train_cmd --mem 12G"
+fi
+
+if [ $nnet_stage -eq 1 ];then
+    nnet=$nnet_s1
+    nnet_name=$nnet_s1_name
+elif [ $nnet_stage -eq 2 ];then
+    nnet=$nnet_s2
+    nnet_name=$nnet_s2_name
+elif [ $nnet_stage -eq 3 ];then
+    nnet=$nnet_s3
+    nnet_name=$nnet_s3_name
+fi
+
+transducer_dir=exp/transducer/$nnet_name
+
+
+test_data=test_clean
+
+
+# Decode the dev and test sets with the transducer
+for name in dev_clean dev_other test_clean test_other
+do
+    nj=40
+    steps_transducer/decode_wav2vec2rnn_transducer.sh \
+        --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \
+        $nnet data/$name \
+        $transducer_dir/$name $bpe_model
+done
+
diff --git a/egs/librispeech/v0/run_030_inference_old.sh b/egs/librispeech/v0/run_030_inference_old.sh
new file mode 100755
index 00000000..02b97001
--- /dev/null
+++ b/egs/librispeech/v0/run_030_inference_old.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# Copyright
+#                2022   Johns Hopkins University (Author: Yen-Ju Lu)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+config_file=default_config.sh
+use_gpu=false
+nnet_stage=1
+. parse_options.sh || exit 1;
+. $config_file
+
+if [ "$use_gpu" == "true" ];then
+    transducer_args="--use-gpu true"
+    transducer_cmd="$cuda_eval_cmd --mem 6G"
+else
+    transducer_cmd="$train_cmd --mem 12G"
+fi
+
+if [ $nnet_stage -eq 1 ];then
+    nnet=$nnet_s1
+    nnet_name=$nnet_s1_name
+elif [ $nnet_stage -eq 2 ];then
+    nnet=$nnet_s2
+    nnet_name=$nnet_s2_name
+elif [ $nnet_stage -eq 3 ];then
+    nnet=$nnet_s3
+    nnet_name=$nnet_s3_name
+fi
+
+transducer_dir=exp/transducer/$nnet_name
+
+
+test_data=test_clean
+
+
+# Decode the dev and test sets with the transducer
+for name in dev_clean dev_other test_clean test_other #$test_data
+do
+    nj=40
+    steps_transducer/decode_wav2vec2transducer.sh \
+        --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \
+        $nnet data/$name \
+        $transducer_dir/$name $bpe_model
+done
+
diff --git a/egs/librispeech/v0/run_040_eval_wer.sh b/egs/librispeech/v0/run_040_eval_wer.sh
new file mode 100755
index 00000000..ac561344
--- /dev/null
+++ b/egs/librispeech/v0/run_040_eval_wer.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+#
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+# By default we evaluate the nnet after finetuning stage 3 and only with cosine scoring
+stage=3
+config_file=default_config.sh
+nnet_stage=3
+
+. parse_options.sh || exit 1;
+. $config_file
+. datapath.sh
+
+if [ $nnet_stage -eq 1 ];then
+    nnet=$nnet_s1
+    nnet_name=$nnet_s1_name
+elif [ $nnet_stage -eq 2 ];then
+    nnet=$nnet_s2
+    nnet_name=$nnet_s2_name
+elif [ $nnet_stage -eq 3 ];then
+    nnet=$nnet_s3
+    nnet_name=$nnet_s3_name
+fi
+
+plda_label=${plda_type}y${plda_y_dim}_v1
+be_name=lda${lda_dim}_${plda_label}_${plda_data}
+
+xvector_dir=exp/xvectors/$nnet_name
+be_dir=exp/be/$nnet_name/$be_name
+score_dir=exp/scores/$nnet_name/${be_name}
+score_plda_dir=$score_dir/plda
+score_cosine_dir=exp/scores/$nnet_name/cosine
+
+if [ $stage -le 1 ]; then
+    echo "Train PLDA on Voxceleb2"
+    steps_be/train_be_v1.sh \
+        --cmd "$train_cmd" \
+        --lda_dim $lda_dim \
+        --plda_type $plda_type \
+        --y_dim $plda_y_dim --z_dim $plda_z_dim \
+        $xvector_dir/$plda_data/xvector.scp \
+        data/$plda_data \
+        $be_dir &
+
+    wait
+fi
+
+
+if [ $stage -le 2 ];then
+
+    echo "Eval Voxceleb 1 with LDA+CentWhiten+LNorm+PLDA"
+    steps_be/eval_be_v1.sh \
+        --cmd "$train_cmd" --plda_type $plda_type \
+        data/voxceleb1_test/trials \
+        data/voxceleb1_test/utt2model \
+        $xvector_dir/voxceleb1_test/xvector.scp \
+        $be_dir/lda_lnorm.h5 \
+        $be_dir/plda.h5 \
+        $score_plda_dir/voxceleb1_scores
+
+    $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \
+        local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir
+
+    for f in $(ls $score_plda_dir/*_results);
+    do
+        echo $f
+        cat $f
+        echo ""
+    done
+
+fi
+
+score_plda_dir=$score_cosine_dir
+
+if [ $stage -le 3 ];then
+
+    echo "Eval Voxceleb 1 with Cosine scoring"
+    steps_be/eval_be_cos.sh --cmd "$train_cmd" \
+        data/voxceleb1_test/trials \
+        data/voxceleb1_test/utt2model \
+        $xvector_dir/voxceleb1_test/xvector.scp \
+        $score_plda_dir/voxceleb1_scores
+
+    $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \
+        local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir
+
+    for f in $(ls $score_plda_dir/*_results);
+    do
+        echo $f
+        cat $f
+        echo ""
+    done
+
+fi
+
+
+exit
+
diff --git a/egs/voxceleb/adv.v1/steps b/egs/librispeech/v0/steps
similarity index 100%
rename from egs/voxceleb/adv.v1/steps
rename to egs/librispeech/v0/steps
diff --git a/egs/voxceleb/adv.v1/steps_be b/egs/librispeech/v0/steps_be
similarity index 100%
rename from egs/voxceleb/adv.v1/steps_be
rename to egs/librispeech/v0/steps_be
diff --git a/egs/voxceleb/vae.v1/steps_pyfe b/egs/librispeech/v0/steps_pyfe
similarity index 100%
rename from egs/voxceleb/vae.v1/steps_pyfe
rename to egs/librispeech/v0/steps_pyfe
diff --git a/egs/librispeech/v0/steps_transducer/decode_wav2vec2rnn_transducer.sh b/egs/librispeech/v0/steps_transducer/decode_wav2vec2rnn_transducer.sh
new file mode 100755
index 00000000..470b92b1
--- /dev/null
+++ b/egs/librispeech/v0/steps_transducer/decode_wav2vec2rnn_transducer.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+#               2022   Johns Hopkins University (Author: Yen-Ju Lu)
+# Apache 2.0.
+nj=30
+cmd="run.pl"
+set -e
+use_gpu=false
+#write_utt2num_frames=true # If true writes utt2num_frames.
+stage=0
+extra_args=""
+infer_cfg=conf/infer.yaml
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 3 ] && [ $# != 4 ]; then
+  echo "Usage: $0 [options] <nnet-model> <data-dir> <output-dir> [<bpe-model>]"
+  echo " e.g.: $0 --infer-cfg conf/infer.yaml exp/transducer_nnet/model.pt data/test_clean exp/transducer/test_clean data/lang_bpe/bpe.model"
+  echo "main options (for others, see top of script file)"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --infer-cfg <cfg|conf/infer.yaml>                # decoding configuration"
+  echo "  --use-gpu <bool|false>                           # If true, use GPU."
+  echo "  --nj <n|30>                                      # Number of jobs"
+  echo "  --stage <stage|0>                                # To control partial reruns"
+  exit 1
+fi
+
+nnet_file=$1
+data_dir=$2
+output_dir=$3
+bpe_model=$4
+
+for f in $data_dir/wav.scp ; do
+  [ ! -f $f ] && echo "No such file $f" && exit 1;
+done
+
+log_dir=$output_dir/log
+mkdir -p $log_dir
+
+num_gpus=0
+if [ "$use_gpu" == "true" ];then
+    cmd="$cmd --gpu 1"
+    num_gpus=1
+    extra_args="${extra_args} --use-gpu"
+fi
+
+# if [ "$write_utt2num_frames" == "true" ];then
+#     write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB"
+# fi
+
+if [ $stage -le 0 ];then
+    $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \
+        hyp_utils/conda_env.sh --num-gpus $num_gpus \
+        decode_wav2vec2rnn_transducer.py \
+        --infer-args $infer_cfg \
+        --part-idx JOB --num-parts $nj \
+        --input $data_dir/wav.scp \
+        --model-path $nnet_file \
+        --bpe-model $bpe_model \
+        --output $output_dir/transducer.JOB.text $extra_args
+fi
+
+if [ $stage -le 1 ];then
+    echo "compute wer"
+    cat $output_dir/transducer.*.text > $output_dir/transducer.text
+    compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text
+fi
diff --git a/egs/librispeech/v0/steps_transducer/decode_wav2vec2transducer.sh b/egs/librispeech/v0/steps_transducer/decode_wav2vec2transducer.sh
new file mode 100755
index 00000000..67fc7081
--- /dev/null
+++ b/egs/librispeech/v0/steps_transducer/decode_wav2vec2transducer.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+#               2022   Johns Hopkins University (Author: Yen-Ju Lu)
+# Apache 2.0.
+set -e
+nj=30
+cmd="run.pl"
+
+use_gpu=false
+write_utt2num_frames=true # If true writes utt2num_frames.
+stage=0
+num_augs=0
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 3 ] && [ $# != 4 ]; then
+  echo "Usage: $0 [options] <nnet-model> <data-dir> <output-dir> [<bpe-model>]"
+  echo " e.g.: $0 exp/transducer_nnet/model.pt data/test_clean exp/transducer/test_clean data/lang_bpe/bpe.model"
+  echo "main options (for others, see top of script file)"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --use-gpu <bool|false>                           # If true, use GPU."
+  echo "  --nj <n|30>                                      # Number of jobs"
+  echo "  --stage <stage|0>                                # To control partial reruns"
+  echo "  --write-utt2num-frames <bool|true>               # If true, write utt2num_frames file."
+  exit 1
+fi
+
+nnet_file=$1
+data_dir=$2
+output_dir=$3
+bpe_model=$4
+
+for f in $data_dir/wav.scp ; do
+  [ ! -f $f ] && echo "No such file $f" && exit 1;
+done
+
+log_dir=$output_dir/log
+mkdir -p $log_dir
+
+num_gpus=0
+args=""
+if [ "$use_gpu" == "true" ];then
+    cmd="$cmd --gpu 1"
+    num_gpus=1
+    args="--use-gpu"
+fi
+
+if [ "$write_utt2num_frames" == "true" ];then
+    write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB"
+fi
+
+if [ $stage -le 0 ];then
+    #set +e
+    $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \
+        hyp_utils/conda_env.sh --num-gpus $num_gpus \
+        decode_wav2transducer.py \
+        --part-idx JOB --num-parts $nj \
+        --input $data_dir/wav.scp \
+        --model-path $nnet_file \
+        --bpe-model $bpe_model \
+        --output $output_dir/transducer.JOB.text
+    # set -e
+fi
+
+if [ $stage -le 1 ];then
+    echo "compute wer"
+    cat $output_dir/transducer.*.text > $output_dir/transducer.text
+    compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text
+fi
diff --git a/egs/librispeech/v0/steps_xvec b/egs/librispeech/v0/steps_xvec
new file mode 120000
index 00000000..289276b7
--- /dev/null
+++ b/egs/librispeech/v0/steps_xvec
@@ -0,0 +1 @@
+hyp_utils/xvectors/
\ No newline at end of file
diff --git a/egs/voxceleb/adv.v1/utils b/egs/librispeech/v0/utils
similarity index 100%
rename from egs/voxceleb/adv.v1/utils
rename to egs/librispeech/v0/utils
diff --git a/egs/voxceleb/adv.v1/steps_xvec b/egs/librispeech/v0/xvectors
similarity index 100%
rename from egs/voxceleb/adv.v1/steps_xvec
rename to egs/librispeech/v0/xvectors
diff --git a/egs/librispeech/v1/cmd.sh b/egs/librispeech/v1/cmd.sh
new file mode 100755
index 00000000..040f458b
--- /dev/null
+++ b/egs/librispeech/v1/cmd.sh
@@ -0,0 +1,28 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
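+#
+# A minimal conf/queue.conf sketch for a GridEngine grid (illustrative only;
+# the option lines mirror the conf/*.conf files added in this recipe, so
+# adapt the queue names and resource flags to your own cluster):
+#
+#   command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V
+#   option mem=* -l mem_free=$0,ram_free=$0
+#   option num_threads=* -pe smp $0
+#   default gpu=0
+#   option gpu=* -l gpu=$0 -q g.q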
+ +if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/librispeech/v1/conf/clsp.conf b/egs/librispeech/v1/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/librispeech/v1/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/librispeech/v1/conf/coe_gpu_bigmem.conf b/egs/librispeech/v1/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/librispeech/v1/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/librispeech/v1/conf/coe_gpu_long.conf b/egs/librispeech/v1/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/librispeech/v1/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/librispeech/v1/conf/coe_gpu_rtx.conf b/egs/librispeech/v1/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/librispeech/v1/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/librispeech/v1/conf/coe_gpu_short.conf b/egs/librispeech/v1/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ 
b/egs/librispeech/v1/conf/coe_gpu_short.conf
@@ -0,0 +1,11 @@
+
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V
+option mem=* -l mem_free=$0
+option mem=0   # Do not add anything to qsub_opts
+option num_threads=* -l num_proc=$0
+option num_threads=1  # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]*
+option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]*
diff --git a/egs/librispeech/v1/conf/coe_gpu_v100.conf b/egs/librispeech/v1/conf/coe_gpu_v100.conf
new file mode 100644
index 00000000..69326b82
--- /dev/null
+++ b/egs/librispeech/v1/conf/coe_gpu_v100.conf
@@ -0,0 +1,11 @@
+
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V
+option mem=* -l mem_free=$0
+option mem=0   # Do not add anything to qsub_opts
+option num_threads=* -l num_proc=$0
+option num_threads=1  # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0 -q all.q -l h_rt=100:00:00
+option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100
diff --git a/egs/librispeech/v1/conf/fbank80_specaug1_mn_16k.yaml b/egs/librispeech/v1/conf/fbank80_specaug1_mn_16k.yaml
new file mode 100644
index 00000000..e6def26c
--- /dev/null
+++ b/egs/librispeech/v1/conf/fbank80_specaug1_mn_16k.yaml
@@ -0,0 +1,28 @@
+audio_feats:
+  audio_feat: logfb
+  sample_frequency: 16000
+  frame_length: 25
+  low_freq: 20
+  high_freq: 7600
+  num_filters: 80
+  snip_edges: false
+  use_energy: false
+spec_augment:
+  time_warp_prob: 0.66
+  time_warp_window: 5
+  time_mask_prob: 1.
+  time_mask_min_width: 0
+  time_mask_max_width: 40
+  time_mask_min_num_masks: 1
+  time_mask_max_num_masks: 2
+  freq_mask_prob: 1.
+  freq_mask_min_width: 0
+  freq_mask_max_width: 30
+  freq_mask_min_num_masks: 1
+  freq_mask_max_num_masks: 2
+  mask_method: mean
+mvn:
+  norm_var: false
+  left_context: 0
+  right_context: 0
+
diff --git a/egs/librispeech/v1/conf/sp_unigram_1000.yaml b/egs/librispeech/v1/conf/sp_unigram_1000.yaml
new file mode 100644
index 00000000..2a9b1b1e
--- /dev/null
+++ b/egs/librispeech/v1/conf/sp_unigram_1000.yaml
@@ -0,0 +1,9 @@
+vocab_size: 1000
+model_type: unigram
+char_coverage: 1.0
+unk_id: 2
+user_defined_symbols:
+- <blk>
+- <sos/eos>
+uppercase_text: true
+
\ No newline at end of file
diff --git a/egs/librispeech/v1/conf/sp_unigram_512.yaml b/egs/librispeech/v1/conf/sp_unigram_512.yaml
new file mode 100644
index 00000000..116e6d22
--- /dev/null
+++ b/egs/librispeech/v1/conf/sp_unigram_512.yaml
@@ -0,0 +1,9 @@
+vocab_size: 512
+model_type: unigram
+char_coverage: 1.0
+unk_id: 2
+user_defined_symbols:
+- <blk>
+- <sos/eos>
+uppercase_text: true
+
\ No newline at end of file
diff --git a/egs/librispeech/v1/conf/speed_reverb_noise10-20dB_aug.yaml b/egs/librispeech/v1/conf/speed_reverb_noise10-20dB_aug.yaml
new file mode 100644
index 00000000..f9ecdd33
--- /dev/null
+++ b/egs/librispeech/v1/conf/speed_reverb_noise10-20dB_aug.yaml
@@ -0,0 +1,39 @@
+speed_aug:
+  speed_prob: 0.5
+  speed_ratios:
+  - 0.9
+  - 1.1
+reverb_aug:
+  reverb_prob: 0.45
+  max_reverb_context: 0.5
+  rir_types:
+    smallroom:
+      weight: 1
+      rir_path: csv:data/rirs_smallroom/rirs.csv
+      rir_norm: max
+    mediumroom:
+      weight: 1
+      rir_path: csv:data/rirs_mediumroom/rirs.csv
+      rir_norm: max
+    realroom:
+      weight: 1
+      rir_path: csv:data/rirs_real/rirs.csv
+      rir_norm: max
+noise_aug:
+  noise_prob: 0.7
+  noise_types:
+    noise:
+      weight: 1
+      noise_path: data/musan_noise_proc_audio/recordings.csv
+      min_snr: 10
+      max_snr: 20
+    music:
+ weight: 1 + noise_path: data/musan_music_proc_audio/recordings.csv + min_snr: 10 + max_snr: 20 + babble: + weight: 1 + noise_path: data/musan_speech_babble/recordings.csv + min_snr: 10 + max_snr: 20 diff --git a/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml b/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml new file mode 100644 index 00000000..b0def8fc --- /dev/null +++ b/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/speed_reverb_noise10-20dB_aug.yaml + tokenizer_mappings: + - text->text + tokenizer_files: + - data/token_librispeech_train-960_unigram_512/tokenizer.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 1500. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 8 + val: + dataset: + wav_scale: 1 + tokenizer_mappings: + - text->text + tokenizer_files: + - data/token_librispeech_train-960_unigram_512/tokenizer.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 1500. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_mn_16k.yaml + transducer: + encoder: + att_type: local-scaled-dot-prod-v1 + att_context: 32 + d_model: 144 + num_heads: 4 + num_blocks: 16 + d_ff: 576 + in_layer_type: conv2d-sub + rnnt_decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 320 + embed_dropout_rate: 0.1 + rnn_dropout_rate: 0.1 + rnn_type: lstm + joiner: + hid_feats: 320 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.98 + weight_decay: 1e-6 + lrsched: + lrsch_type: noam_lr + d_model: 144 + lr_factor: 8.0 + min_lr: 1e-6 + warmup_steps: 25000 + update_lr_on_opt_step: true + # grad_clip: 100 + grad_clip: 20 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: full diff --git a/egs/librispeech/v1/datapath.sh b/egs/librispeech/v1/datapath.sh new file mode 100644 index 00000000..3e8de307 --- /dev/null +++ b/egs/librispeech/v1/datapath.sh @@ -0,0 +1,18 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + librispeech_root=/export/corpora5/LibriSpeech + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + librispeech_root=/export/common/data/corpora/ASR/openslr/SLR12/LibriSpeech + musan_root=/export/common/data/corpora/MUSAN/musan +else + echo "Put your database paths here" + exit 1 +fi + + diff --git a/egs/librispeech/v1/global_conf/config_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.sh b/egs/librispeech/v1/global_conf/config_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.sh new file mode 100644 index 00000000..62817852 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.sh @@ -0,0 +1,20 @@ +# Conformer + RNN-T + +# training data +nnet_train_data=librispeech_train-960 +nnet_val_data=librispeech_dev + +# tokenizer +token_train_data=librispeech_train-960 +token_cfg=conf/sp_unigram_512.yaml +token_dir=data/token_${token_train_data}_unigram_512 +token_model=$token_dir/tokenizer.model + +# rnn-t cfg +nnet_type=conformer_v1_rnn_transducer +nnet_name=fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p 
+nnet_s1_cfg=conf/train_${nnet_name}.s1.yaml +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/asr_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0115.pth diff --git a/egs/voxceleb/vae.v1/hyp_utils b/egs/librispeech/v1/hyp_utils similarity index 100% rename from egs/voxceleb/vae.v1/hyp_utils rename to egs/librispeech/v1/hyp_utils diff --git a/egs/librispeech/v1/path.sh b/egs/librispeech/v1/path.sh new file mode 100755 index 00000000..6994fdab --- /dev/null +++ b/egs/librispeech/v1/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. $TOOLS_ROOT/path.sh diff --git a/egs/librispeech/v1/run_001_prepare_data.sh b/egs/librispeech/v1/run_001_prepare_data.sh new file mode 100755 index 00000000..1ca8b585 --- /dev/null +++ b/egs/librispeech/v1/run_001_prepare_data.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. ./datapath.sh + + +nj=6 + +mkdir -p data + + +# if [ ${stage} -le 1 ]; then +# ### Task dependent. You have to make data the following preparation part by yourself. +# ### But you can utilize Kaldi recipes in most cases +# echo "stage 0: Data preparation" +# for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500 +# do +# # use underscore-separated names in data directories. +# local/data_prep.sh ${librispeech_root}/${part} data/${part//-/_} +# steps_xvec/audio_to_duration.sh --cmd "$train_cmd" data/${part//-/_} +# done +# fi + +if [ $stage -le 1 ]; then + echo "Stage 1: Prepare lhotse LibriSpeech manifest" + # We assume that you have downloaded the LibriSpeech corpus + # to $librispeech_root + mkdir -p data/lhotse_librispeech + if [ ! -e data/lhotse_librispeech/.librispeech.done ]; then + lhotse prepare librispeech -j $nj $librispeech_root data/lhotse_librispeech + touch data/lhotse_librispeech/.librispeech.done + fi +fi + +if [ $stage -le 2 ];then + echo "Stage 2: Convert Manifest to Hyperion Datasets" + for data in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other + do + hyperion-dataset from_lhotse \ + --recordings-file data/lhotse_librispeech/librispeech_recordings_${data}.jsonl.gz \ + --supervisions-file data/lhotse_librispeech/librispeech_supervisions_${data}.jsonl.gz \ + --dataset data/librispeech_${data} + done + +fi + +if [ $stage -le 3 ];then + echo "Stage 3: Merge Librispeech train sets" + hyperion-dataset merge \ + --input-datasets data/librispeech_train-{clean-100,clean-360,other-500} \ + --dataset data/librispeech_train-960 + + echo "Stage 3: Merge Librispeech dev sets" + hyperion-dataset merge \ + --input-datasets data/librispeech_dev-{clean,other} \ + --dataset data/librispeech_dev + +fi diff --git a/egs/librispeech/v1/run_002_prepare_noises_rirs.sh b/egs/librispeech/v1/run_002_prepare_noises_rirs.sh new file mode 100755 index 00000000..73c7ed82 --- /dev/null +++ b/egs/librispeech/v1/run_002_prepare_noises_rirs.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nj=10 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +# We prepare the noise files and RIR for online speech augmentation +if [ $stage -le 1 ]; then + for name in noise music speech + do + hyperion-prepare-data musan \ + --corpus-dir $musan_root \ + --subset $name \ + --output-dir data/musan_$name + done +fi + +if [ $stage -le 2 ]; then + # # Prepare to distribute data over multiple machines + # # This only does something at CLSP grid + # hyp_utils/create_data_split_dirs.sh $vad_dir $USER/hyp-data/voxceleb/v1.2/vad $nodes + + for name in musan_noise musan_music + do + input_data_dir=data/$name + output_data_dir=data/${name}_proc_audio + output_dir=exp/proc_audio/$name + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-preprocess-audio-files \ + --audio-format flac \ + --part-idx JOB --num-parts $nj \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + hyperion-dataset set_recordings \ + --dataset $input_data_dir \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset $output_data_dir + + + done +fi + +if [ $stage -le 3 ]; then + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + input_data_dir=data/$name + output_data_dir=data/${name}_babble + output_dir=exp/proc_audio/${name}_babble + $train_cmd $output_dir/log/make_babble_noise_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-make-babble-noise-audio-files \ + --audio-format flac \ + --min-spks 3 --max-spks 10 --num-reuses 5 \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_data_dir/recordings.csv + hyperion-dataset make_from_recordings \ + --dataset $output_data_dir \ + --recordings-file $output_data_dir/recordings.csv + done +fi + +if [ $stage -le 4 ]; then + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + output_dir=exp/rirs/$rirs + data_dir=data/$rirs + $train_cmd $output_dir/log/pack_rirs_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-pack-wav-rirs ${args} --input $data_dir/recordings.csv \ + --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1; + hyperion-dataset add_features --dataset $data_dir \ + --features-name rirs --features-file $output_dir/rirs.csv + + done +fi + diff --git a/egs/librispeech/v1/run_003_train_tokenizers.sh b/egs/librispeech/v1/run_003_train_tokenizers.sh new file mode 100755 index 00000000..35ae7da2 --- /dev/null +++ b/egs/librispeech/v1/run_003_train_tokenizers.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nj=10 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. 
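run_003, whose body follows, trains the 512-piece unigram tokenizer through `hyperion-train-tokenizer sentencepiece`. A stand-alone sketch of what that wraps, calling the sentencepiece package directly (file names here are illustrative, not the recipe's actual paths):

```python
import sentencepiece as spm

# Train a 512-piece unigram tokenizer on one-transcript-per-line text.
spm.SentencePieceTrainer.train(
    input="transcripts.txt",
    model_prefix="tokenizer",
    vocab_size=512,
    model_type="unigram",
)

# Tokenize a transcript with the resulting model.
sp = spm.SentencePieceProcessor(model_file="tokenizer.model")
print(sp.encode("HELLO WORLD", out_type=str))
```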
datapath.sh
+
+if [ $stage -le 1 ];then
+  $train_cmd \
+    $token_dir/train_sp.log \
+    hyperion-train-tokenizer sentencepiece \
+    --cfg $token_cfg \
+    --segments-file data/$token_train_data/segments.csv \
+    --tokenizer-path $token_dir
+
+fi
diff --git a/egs/librispeech/v1/run_004_train_asr.sh b/egs/librispeech/v1/run_004_train_asr.sh
new file mode 100755
index 00000000..33b68ed2
--- /dev/null
+++ b/egs/librispeech/v1/run_004_train_asr.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# Copyright
+#       2022   Johns Hopkins University (Author: Yen-Ju Lu)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+ngpu=2
+config_file=default_config.sh
+interactive=false
+num_workers=""
+use_tb=false
+use_wandb=false
+
+. parse_options.sh || exit 1;
+. $config_file
+. datapath.sh
+
+train_dir=data/${nnet_train_data}
+val_dir=data/${nnet_val_data}
+
+if [ "$interactive" == "true" ];then
+  export cuda_cmd=run.pl
+fi
+
+# Network Training
+if [ $stage -le 1 ]; then
+
+  mkdir -p $nnet_s1_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s1_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    hyperion-train-wav2rnn-transducer $nnet_type \
+    --cfg $nnet_s1_cfg \
+    --data.train.dataset.recordings-file $train_dir/recordings.csv \
+    --data.train.dataset.segments-file $train_dir/segments.csv \
+    --data.val.dataset.recordings-file $val_dir/recordings.csv \
+    --data.val.dataset.segments-file $val_dir/segments.csv \
+    --trainer.exp-path $nnet_s1_dir $args \
+    --num-gpus $ngpu
+  #--data.train.dataset.bpe-model $token_model \
+fi
+
diff --git a/egs/lre22/fixed.v1.8k/README.md b/egs/lre22/fixed.v1.8k/README.md
new file mode 100644
index 00000000..877f99ca
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/README.md
@@ -0,0 +1,43 @@
+# LRE22 Fixed Condition V1
+
+Recipe for the NIST LRE22 fixed condition based on the JHU-MIT submission.
+
+## Citing
+```
+@inproceedings{villalba23_interspeech,
+  author={Jesús Villalba and Jonas Borgstrom and Maliha Jahan and Saurabh Kataria and Leibny Paola Garcia and Pedro Torres-Carrasquillo and Najim Dehak},
+  title={{Advances in Language Recognition in Low Resource African Languages: The JHU-MIT Submission for NIST LRE22}},
+  year=2023,
+  booktitle={Proc. INTERSPEECH 2023},
+  pages={521--525},
+  doi={10.21437/Interspeech.2023-1094}
+}
+```
+
+## Training Data
+
+  - x-Vector networks trained on:
+    - VoxLingua107
+    - NIST LRE17 Train + Dev + Eval / CTS + AfV
+  - Gaussian back-end trained on:
+    - NIST LRE22 dev with 2-fold cross-val + x10 augmentations
+
+## Usage
+
+  - Run the run_0*.sh scripts in sequence
+  - By default it uses an ECAPA-TDNN with 4 layers of 2048 channels
+  - To change the default network, run the scripts with the --config-file argument:
+```bash
+run_011_train_xvector.sh --config-file global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh
+run_030_extract_xvectors.sh --config-file global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh --use-gpu true
+run_040_be_final.sh --config-file global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh
+```
+
+## Results
+
+| Config | Model Type | Model Details | Back-end | Dev MinCp | Dev ActCp | Eval MinCp | Eval ActCp |
+| ------ | ---------- | ------------- | -------- | :-------: | :-------: | :--------: | :--------: |
+| config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh | ECAPA-TDNN 2048x4 | Stage-2 | GBE | 0.207 | 0.209 | 0.198 | 0.199 |
+| config_fbank64_stmn_fwseres2net50s8_v1.0.sh | fw-SE Res2Net50 scale=8 | Stage-2 | GBE | 0.227 | 0.229 | 0.213 | 0.215 |
+| Fusion ECAPA-TDNN + fw-SE Res2Net50 | | | FoCal | 0.182 | 0.183 | 0.180 | 0.181 |
+
diff --git a/egs/lre22/fixed.v1.8k/cmd.sh b/egs/lre22/fixed.v1.8k/cmd.sh
new file mode 100755
index 00000000..4b4e8ae7
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/cmd.sh
@@ -0,0 +1,25 @@
+# You can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
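The fusion row in the results table above combines the two systems with FoCal Multiclass (a MATLAB tool, downloaded later by local/download_focal.sh). Conceptually it is an affine combination of per-segment language log-likelihood matrices; a sketch of applying such a fusion, with the cross-entropy training of the weights omitted (this is not FoCal's actual code, and the shapes are examples only):

```python
import numpy as np

def fuse_scores(score_mats, alphas, beta):
    # score_mats: one (n_segments, n_languages) log-likelihood matrix per
    # system; alphas (per-system scales) and beta (per-language offsets)
    # would come from FoCal training on dev scores.
    return sum(a * s for a, s in zip(alphas, score_mats)) + beta

# e.g. two systems, 4 segments, 14 target languages
fused = fuse_scores([np.zeros((4, 14)), np.zeros((4, 14))],
                    alphas=[0.7, 0.5], beta=np.zeros(14))
```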
+ +if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 40G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi diff --git a/egs/lre22/fixed.v1.8k/conf/clsp.conf b/egs/lre22/fixed.v1.8k/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/lre22/fixed.v1.8k/conf/coe_gpu_bigmem.conf b/egs/lre22/fixed.v1.8k/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/lre22/fixed.v1.8k/conf/coe_gpu_long.conf b/egs/lre22/fixed.v1.8k/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/lre22/fixed.v1.8k/conf/coe_gpu_rtx.conf b/egs/lre22/fixed.v1.8k/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/lre22/fixed.v1.8k/conf/coe_gpu_short.conf b/egs/lre22/fixed.v1.8k/conf/coe_gpu_short.conf new file mode 100644 index 
00000000..81de5cb7 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/lre22/fixed.v1.8k/conf/coe_gpu_v100.conf b/egs/lre22/fixed.v1.8k/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/lre22/fixed.v1.8k/conf/fbank64_specaug1_stmn_8k.yaml b/egs/lre22/fixed.v1.8k/conf/fbank64_specaug1_stmn_8k.yaml new file mode 100644 index 00000000..fce3804a --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/fbank64_specaug1_stmn_8k.yaml @@ -0,0 +1,24 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 8000 + frame_length: 25 + low_freq: 20 + high_freq: 3700 + num_filters: 64 + snip_edges: false + use_energy: false +spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. 
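The spec_augment block of fbank64_specaug1_stmn_8k.yaml (starting here and continuing just below) always applies one time mask of up to 5 frames and one frequency mask of up to 8 mel bins, filling masked cells with the mean (`mask_method: mean`). A minimal sketch of that masking, assuming a (frames, filters) feature matrix; not hyperion's implementation:

```python
import numpy as np

def spec_augment(X, rng, max_t=5, max_f=8):
    """One time mask (<= max_t frames) and one frequency mask (<= max_f
    bins), filled with the utterance mean, per the config above."""
    X = X.copy()                    # X: (num_frames, num_filters)
    T, F = X.shape
    m = X.mean()
    t = rng.integers(0, max_t + 1)  # time_mask_min_width: 0
    t0 = rng.integers(0, max(T - t, 1))
    X[t0:t0 + t, :] = m
    f = rng.integers(0, max_f + 1)  # freq_mask_min_width: 0
    f0 = rng.integers(0, max(F - f, 1))
    X[:, f0:f0 + f] = m
    return X

X_aug = spec_augment(np.random.randn(300, 64), np.random.default_rng(0))
```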
+ freq_mask_min_width: 0 + freq_mask_max_width: 8 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +mvn: + context: 150 + norm_var: false diff --git a/egs/lre22/fixed.v1.8k/conf/fbank64_stmn_8k.yaml b/egs/lre22/fixed.v1.8k/conf/fbank64_stmn_8k.yaml new file mode 100644 index 00000000..dfd0d3e5 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/fbank64_stmn_8k.yaml @@ -0,0 +1,12 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 8000 + frame_length: 25 + low_freq: 20 + high_freq: 3700 + num_filters: 64 + snip_edges: false + use_energy: false +mvn: + context: 150 + norm_var: false diff --git a/egs/lre22/fixed.v1.8k/conf/reverb_noise_aug.yaml b/egs/lre22/fixed.v1.8k/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/lre22/fixed.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/lre22/fixed.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..22620f03 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -0,0 +1,101 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_specaug1_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + dropout_rate: 0.2 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.0 + intertop_margin: 0.0 + margin_warmup_epochs: 3.0 + dropout_rate: 0.2 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + warmup_steps: 15000 + min_lr: 1.0e-06 + update_lr_on_opt_step: true + use_amp: true + swa_start: 12 + swa_lr: 1e-5 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 15 + 
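reverb_noise_aug.yaml above draws a noise type and an SNR in [min_snr, max_snr] with probability noise_prob, then mixes the noise into the training chunk. A sketch of the core mixing step, assuming 1-D float waveforms (hyperion's augmenter additionally handles type/SNR sampling and file reading):

```python
import numpy as np

def add_noise_at_snr(speech, noise, snr_db, rng):
    """Scale a noise clip so the mix hits snr_db, then add it to speech."""
    if len(noise) < len(speech):  # tile short noises, then crop at random
        noise = np.tile(noise, int(np.ceil(len(speech) / len(noise))))
    start = rng.integers(0, len(noise) - len(speech) + 1)
    noise = noise[start:start + len(speech)]
    p_speech = np.mean(speech ** 2) + 1e-10
    p_noise = np.mean(noise ** 2) + 1e-10
    scale = np.sqrt(p_speech / (p_noise * 10 ** (snr_db / 10)))
    return speech + scale * noise
```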
#eff_batch_size: 512 + eff_batch_size: 256 diff --git a/egs/lre22/fixed.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/lre22/fixed.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..25e7b213 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + weight_exponent: 0.3 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + weight_exponent: 0.3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_specaug1_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + margin_warmup_epochs: 0 + intertop_margin: 0. +trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 10000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 10000 + use_amp: true + swa_start: 14 + swa_lr: 1e-5 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 18 + eff_batch_size: 256 diff --git a/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml b/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..d900ec9b --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml @@ -0,0 +1,78 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_specaug1_stmn_8k.yaml +model: + resnet_type: fwseres2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + se_r: 4 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.0 + intertop_margin: 0.0 + margin_warmup_epochs: 3.0 + dropout_rate: 0.05 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + warmup_steps: 15000 + min_lr: 1.0e-06 + update_lr_on_opt_step: true + use_amp: true + swa_start: 12 + swa_lr: 1e-5 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 8 + eff_batch_size: 256 diff --git 
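Both training stages select `loss_type: subcenter-arc-softmax` with `num_subcenters: 2` and `cos_scale: 30`. A sketch of how such logits are formed, in PyTorch; the additive angular margin (`margin`/`margin_warmup_epochs` in the configs) is applied to the target-class cosine during training and is omitted here, and this is a generic implementation rather than hyperion's:

```python
import torch
import torch.nn.functional as F

def subcenter_arc_logits(x, W, num_classes, num_subcenters=2, cos_scale=30.0):
    # x: (batch, embed_dim); W: (num_classes * num_subcenters, embed_dim).
    # Cosine against every subcenter, max over subcenters, then scale.
    cos = F.linear(F.normalize(x, dim=-1), F.normalize(W, dim=-1))
    cos = cos.view(-1, num_classes, num_subcenters).amax(dim=-1)
    return cos_scale * cos

logits = subcenter_arc_logits(torch.randn(8, 192), torch.randn(28, 192),
                              num_classes=14)
```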
a/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage2_v1.0.yaml b/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..2e6d3a6c --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage2_v1.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + weight_exponent: 0.3 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + weight_exponent: 0.3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_specaug1_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + margin_warmup_epochs: 0 + intertop_margin: 0. +trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 10000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 10000 + use_amp: true + swa_start: 14 + swa_lr: 1e-5 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 7 + eff_batch_size: 256 diff --git a/egs/lre22/fixed.v1.8k/conf/vad_8k.yaml b/egs/lre22/fixed.v1.8k/conf/vad_8k.yaml new file mode 100644 index 00000000..1cfe34b0 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/vad_8k.yaml @@ -0,0 +1,9 @@ +sample_frequency: 8000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: -4.89 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 +wav_scale: 1 diff --git a/egs/lre22/fixed.v1.8k/datapath.sh b/egs/lre22/fixed.v1.8k/datapath.sh new file mode 100644 index 00000000..d6a81520 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/datapath.sh @@ -0,0 +1,46 @@ +# Copyright +# 2021 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + +#paths to databases + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + ldc_root3=/export/fs02/corpora3/LDC + ldc_root5=/export/corpora5/LDC + ldc_root=/export/corpora6/LDC + #voxceleb1_root=/export/corpora5/VoxCeleb1_v2 + #voxceleb2_root=/export/corpora5/VoxCeleb2 + sre16_eval_root=$ldc_root5/LDC2018E30/data/eval/R149_0_1 + #janus_root=$ldc_root/LDC2019E55/Janus_Multimedia_Dataset + #sre_superset_root=$ldc_root/LDC2021E08 + #sre21_dev_root=$ldc_root/LDC2021E09 + #sre21_eval_root=$ldc_root/LDC2021E10 + lre17_train_root=$ldc_root/LDC2022E16_2017_NIST_Language_Recognition_Evaluation_Training_and_Development_Sets + lre17_eval_root=$ldc_root/LDC2022E17_2017_NIST_Language_Recognition_Evaluation_Test_Set + lre22_dev_root=$ldc_root/LDC2022E14_2022_NIST_Language_Recognition_Evaluation_Development_Data + lre22_eval_root=/export/corpora6/lre22_test_data_v2 + voxlingua_root=/export/corpora6/voxlingua107 + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + ldc_root=/export/common/data/corpora/LDC + sre_root=/export/common/data/corpora/NIST/SRE + my_root=/exp/jvillalba/corpora 
+ #voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 + #voxceleb2_root=/expscratch/dgromero/corpora/vox2 + sre16_dev_root=/exp/jvillalba/corpora/LDC2019S20/data/dev/R148_0_0 + sre16_eval_root=/exp/jvillalba/corpora/LDC2019S20/data/eval/R149_0_1 + #janus_root=$sre_root/SRE19/LDC2019E55_Janus_Multimedia_Dataset + sre_superset_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E08 + sre21_dev_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E09 + sre21_eval_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E10 + lre17_train_root=$my_root/LDC2022E16_2017_NIST_Language_Recognition_Evaluation_Training_and_Development_Sets + lre17_eval_root=$my_root/LDC2022E17_2017_NIST_Language_Recognition_Evaluation_Test_Set + lre22_dev_root=$my_root/LDC2022E14_2022_NIST_Language_Recognition_Evaluation_Development_Data + lre22_eval_root=$my_root/lre22_test_data_v2 + voxlingua_root=$my_root/voxlingua107 + musan_root=/expscratch/dgromero/corpora/musan +else + echo "Put your database paths here" + exit 1 +fi diff --git a/egs/lre22/fixed.v1.8k/default_config.sh b/egs/lre22/fixed.v1.8k/default_config.sh new file mode 120000 index 00000000..506bebe6 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/default_config.sh @@ -0,0 +1 @@ +global_conf/config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh \ No newline at end of file diff --git a/egs/lre22/fixed.v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh b/egs/lre22/fixed.v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh new file mode 100644 index 00000000..b9cd45a5 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh @@ -0,0 +1,24 @@ +# acoustic features +feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=voxlingua107_lre17_noary + +# x-vector cfg +nnet_type=resnet1d +nnet_stages=2 +nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml + +nnet_name=${feat_type}_ecapatdnn2048x4_v1.0 +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/swa_model_ep0016.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0007.pth diff --git a/egs/lre22/fixed.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh b/egs/lre22/fixed.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh new file mode 100644 index 00000000..afac4198 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh @@ -0,0 +1,28 @@ +# acoustic features +feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=voxlingua107_lre17_noary + +# x-vector cfg + +nnet_type=resnet +nnet_stages=2 +nnet_s1_base_cfg=conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml + +nnet_name=${feat_type}_fwseres2net50s8_v1.0 +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth + + +nnet_s2_base_cfg=conf/train_fwseres2net50s8_xvec_stage2_v1.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0007.pth + + diff --git a/egs/lre22/fixed.v1.8k/hyp_utils b/egs/lre22/fixed.v1.8k/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/lre22/fixed.v1.8k/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git 
a/egs/lre22/fixed.v1.8k/local/apply_tel_codecs_to_kaldi_datadir.py b/egs/lre22/fixed.v1.8k/local/apply_tel_codecs_to_kaldi_datadir.py new file mode 100755 index 00000000..c0e2b9d3 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/apply_tel_codecs_to_kaldi_datadir.py @@ -0,0 +1,215 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import shutil +from tqdm import tqdm +import time +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger +from hyperion.utils import RecordingSet, SegmentSet + +valid_codecs = ["gsm", "g711mu", "g711a", "g722", "g723_1", "g726", "opus"] + +sox_options = {"gsm": "-r 8000 -e gsm-full-rate -t gsm"} +ffmpeg_options = { + "g711a": "-ar 8000 -acodec pcm_alaw -f wav", + "g711mu": "-ar 8000 -acodec pcm_mulaw -f wav", + "g722": "-ar 8000 -acodec g722 -f wav", + "g723_1": "-ar 8000 -acodec g723_1 -b:a 6300 -f wav", + "g726": "-ar 8000 -acodec g726 -f wav", + "opus": "-ar 8000 -acodec libopus -application voip -f opus", +} + + +def apply_sox_codec(storage_path, codec): + + option = sox_options[codec] + storage_path = storage_path.rstrip() + if storage_path[-1] == "|": + storage_path = f"{storage_path} sox -t wav - {option} - |" + else: + storage_path = f"sox {storage_path} {option} - |" + + storage_path = f"{storage_path} sox {option} - -t wav -e signed-integer -b 16 - |" + return storage_path + + +def apply_ffmpeg_codec(storage_path, codec, g726_css, opus_brs, rng): + + option = ffmpeg_options[codec] + if codec == "g726": + code_size = rng.choice(g726_css) + option = f"{option} -code_size {code_size}" + elif codec == "opus": + br = rng.choice(opus_brs) + option = f"{option} -b:a {br}" + + storage_path = storage_path.rstrip() + if storage_path[-1] == "|": + storage_path = f"{storage_path} ffmpeg -i - {option} - |" + else: + storage_path = f"ffmpeg -i {storage_path} {option} - |" + + storage_path = f"{storage_path} ffmpeg -i - -ar 8000 -c:a pcm_s16le -f wav - |" + return storage_path + + +def apply_codec(storage_path, codec, g726_css, opus_brs, rng): + + if codec in ["gsm"]: + storage_path = apply_sox_codec(storage_path, codec) + else: + storage_path = apply_ffmpeg_codec(storage_path, codec, g726_css, + opus_brs, rng) + + return storage_path + + +def apply_codecs( + input_dir, + output_dir, + codecs, + keep_orig, + g726_min_code_size, + opus_brs, + seed, + verbose, +): + config_logger(verbose) + logging.info("Applying codecs %s -> %s", input_dir, output_dir) + rng = np.random.RandomState(seed=seed) + input_dir = Path(input_dir) + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + + g726_css = list(range(g726_min_code_size, 6)) + logging.info("making wav.scp") + recs = RecordingSet.load(input_dir / "wav.scp") + recs["orig_id"] = recs["id"] + if keep_orig: + recs_orig = recs.clone() + + codec_idx = 0 + ids = [] + s_paths = [] + for i in tqdm(range(len(recs))): + t1 = time.time() + row = recs.iloc[i] + t2 = time.time() + codec_i = codecs[codec_idx % len(codecs)] + codec_idx += 1 + t3 = time.time() + # recs.loc[row.id, "id"] = f"{row.id}-{codec_i}" + ids.append(f"{row.id}-{codec_i}") + t4 = time.time() + sp = apply_codec(row["storage_path"], codec_i, g726_css, opus_brs, rng) + + t5 = time.time() + # recs.loc[row.id, "storage_path"] = sp + s_paths.append(sp) + t6 = time.time() + + 
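+    # Note: the loop above cycles through the requested codecs round-robin
+    # (codec_idx % len(codecs)), renames every recording to "<id>-<codec>",
+    # and rewrites its storage_path as a sox/ffmpeg pipe; the t1..t6
+    # time.time() calls are leftover profiling and are never used.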
recs["id"] = ids + recs["storage_path"] = s_paths + + mapping = recs[["orig_id", "id"]] + mapping.set_index("orig_id", inplace=True, drop=False) + if keep_orig: + recs = RecordingSet.merge(recs_orig, recs) + recs.sort() + + logging.info("making utt2orig_utt") + recs[["id", "orig_id"]].to_csv(output_dir / "utt2orig_utt", + sep=" ", + header=False, + index=False) + + recs.save(output_dir / "wav.scp") + u2x_files = [] + for pattern in ["utt2*", "vad.scp", "feats.scp"]: + files_p = glob.glob(str(input_dir / pattern)) + u2x_files.extend(files_p) + + for f in u2x_files: + logging.info("making %s", Path(f).name) + u2x = SegmentSet.load(f) + if keep_orig: + u2x_orig = u2x.clone() + + u2x["id"] = mapping.loc[u2x["id"], "id"] + if keep_orig: + u2x = SegmentSet.merge(u2x_orig, u2x) + u2x.sort() + + output_file = output_dir / Path(f).name + u2x.save(output_file) + + spk_files = glob.glob(str(input_dir / "spk2gender")) + for f in spk_files: + logging.info("making %s", Path(f).name) + output_file = output_dir / Path(f).name + shutil.copy2(f, output_file) + + logging.info("making utt2spk") + u2s = SegmentSet.load(output_dir / "utt2spk") + spks = u2s["class_id"].unique() + df_spk = u2s.df.sort_values(by="class_id") + df_spk.set_index("class_id", inplace=True) + + with open(output_dir / "spk2utt", "w") as f: + for spk in spks: + seg_ids = df_spk.loc[spk, "id"] + if isinstance(seg_ids, list): + seg_ids = " ".join(seg_ids) + f.write(f"{spk} {seg_ids}\n") + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Apply telephone codecs to kaldi data dir") + parser.add_argument("--input-dir", + required=True, + help="Path to the original kaldi dataset") + + parser.add_argument("--output-dir", + required=True, + help="Codec augmented directory") + parser.add_argument( + "--codecs", + default=valid_codecs, + nargs="+", + choices=valid_codecs, + help="List of codecs to apply", + ) + parser.add_argument( + "--g726-min-code-size", + default=2, + choices=[2, 3, 4, 5], + help="minimum code-size for g726", + ) + parser.add_argument( + "--opus-brs", + default=[4500, 5500, 7700, 9500, 12500, 16000, 32000], + nargs="+", + help="opus codec bit rates", + ) + parser.add_argument("--keep-orig", default=False, action=ActionYesNo) + parser.add_argument("--seed", default=1234, help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + args = parser.parse_args() + apply_codecs(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/download_focal.sh b/egs/lre22/fixed.v1.8k/local/download_focal.sh new file mode 100755 index 00000000..13b86e57 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/download_focal.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright 2022 Johns Hopkins University (Jesus Villalba) +# Apache 2.0 +# +# Downloads Niko Brummer's FoCal Multiclass + +set -e +tool=FoCal_MultiClass_V1 +s_dir=focal_multiclass + +# shareable link: +# https://drive.google.com/file/d/13rPUqS68NdEF5NB0vsL7bDEju5dhmmDZ/view?usp=sharing + + +wget --no-check-certificate "https://drive.google.com/uc?export=download&id=13rPUqS68NdEF5NB0vsL7bDEju5dhmmDZ" -O $tool.zip +unzip $tool.zip -d $s_dir + +if [ ! 
-f $s_dir/v1.0/readme.txt ];then
+  echo "the focal tool wasn't downloaded correctly, download it manually"
+  exit 1
+fi
+
+rm -f $tool.zip
+
+
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/download_focal.sh~ b/egs/lre22/fixed.v1.8k/local/download_focal.sh~
new file mode 100755
index 00000000..b871348f
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/download_focal.sh~
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright 2022 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+#
+# Downloads Niko Brummer's FoCal Multiclass
+
+set -e
+tool=FoCal_MultiClass_V1
+s_dir=focal_multiclass_v1.0
+
+# shareable link:
+# https://drive.google.com/file/d/13rPUqS68NdEF5NB0vsL7bDEju5dhmmDZ/view?usp=sharing
+
+
+wget --no-check-certificate "https://drive.google.com/uc?export=download&id=13rPUqS68NdEF5NB0vsL7bDEju5dhmmDZ" -O $tool.zip
+unzip $tool.zip
+
+if [ ! -f $s_dir/readme.txt ];then
+  echo "the focal tool wasn't downloaded correctly, download it manually"
+  exit 1
+fi
+
+rm -f $tool.zip
+
+
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/download_lre22_scorer.sh b/egs/lre22/fixed.v1.8k/local/download_lre22_scorer.sh
new file mode 100755
index 00000000..344a6a34
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/download_lre22_scorer.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Copyright 2022 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+#
+# Downloads NIST scoring tools for LRE22
+
+set -e
+tool=lre-scorer
+s_dir=lre-scorer
+
+# shareable link:
+# https://drive.google.com/file/d/13pvUhFPGLgqId5yB8i25X__LFXKIU-ju/view?usp=sharing
+
+wget --no-check-certificate "https://drive.google.com/uc?export=download&id=13pvUhFPGLgqId5yB8i25X__LFXKIU-ju" -O $tool.tar.gz
+tar xzvf $tool.tar.gz
+
+if [ ! -f $s_dir/scorerLRE22.py ];then
+  echo "the scoring tool wasn't downloaded correctly, download it manually"
+  exit 1
+fi
+
+rm -f $tool.tar.gz
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/download_lre22_scorer.sh~ b/egs/lre22/fixed.v1.8k/local/download_lre22_scorer.sh~
new file mode 100755
index 00000000..4201eecf
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/download_lre22_scorer.sh~
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Copyright 2022 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+#
+# Downloads NIST scoring tools for LRE22
+
+set -e
+tool=lre-scorer
+s_dir=lre-scorer
+
+# shareable link:
+# https://drive.google.com/file/d/13pvUhFPGLgqId5yB8i25X__LFXKIU-ju/view?usp=sharing
+
+wget --no-check-certificate "https://drive.google.com/uc?export=download&id=13pvUhFPGLgqId5yB8i25X__LFXKIU-ju" -O $tool.tar.gz
+tar xzvf $tool.tar.gz
+
+if [ ! -f $s_dir/scorerLRE22.py ];then
+  echo "the scoring tool wasn't downloaded correctly, download it manually"
+  exit 1
+fi
+
+rm -f $tool.tar.gz
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/eval_calibration_lre22.sh b/egs/lre22/fixed.v1.8k/local/eval_calibration_lre22.sh
new file mode 100755
index 00000000..2c28e70e
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/eval_calibration_lre22.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+. path.sh
+
+if [ $# -ne 2 ];then
+  echo "Usage: $0 <score-dir> <model-file>"
+  exit 1
+fi
+
+score_dir=$1
+model_file=$2
+nocal_dir=$score_dir/nocal
+cal_dir=$score_dir/cal_v1
+
+dev_file=$nocal_dir/lre22_dev_scores.tsv
+dev_cal_file=$cal_dir/lre22_dev_scores.tsv
+eval_file=$nocal_dir/lre22_eval_scores.tsv
+eval_cal_file=$cal_dir/lre22_eval_scores.tsv
+mkdir -p $cal_dir
+
+
+if [ "$(hostname --domain)" == "cm.gemini" ];then
+  module load matlab
+fi
+
+if [ -f $dev_file ];then
+  echo "
+addpath('./steps_be');
+addpath(genpath('$PWD/focal_multiclass/v1.0'));
+eval_fusion({'$dev_file'}, '$dev_cal_file', '$model_file');
+" | matlab -nodisplay -nosplash > $cal_dir/eval_lre22_dev.log
+fi
+
+if [ -f $eval_file ];then
+  echo "
+addpath('./steps_be');
+addpath(genpath('$PWD/focal_multiclass/v1.0'));
+eval_fusion({'$eval_file'}, '$eval_cal_file', '$model_file');
+" | matlab -nodisplay -nosplash > $cal_dir/eval_lre22_eval.log
+fi
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/eval_fusion_lre22.sh b/egs/lre22/fixed.v1.8k/local/eval_fusion_lre22.sh
new file mode 100755
index 00000000..284cac7e
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/eval_fusion_lre22.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+. path.sh
+
+if [ $# -ne 3 ];then
+  echo "Usage: $0 <score-dirs> <model-file> <output-dir>"
+  exit 1
+fi
+
+score_dirs="$1"
+model_file=$2
+output_dir=$3
+mkdir -p $output_dir
+
+dev_files=""
+eval_files=""
+for d in $score_dirs
+do
+  dev_files="$dev_files'$d/lre22_dev_scores.tsv',"
+  eval_files="$eval_files'$d/lre22_eval_scores.tsv',"
+done
+dev_files=${dev_files%,}
+eval_files=${eval_files%,}
+dev_file_1=$(echo $dev_files | awk -F "'" '{ print $2 }')
+eval_file_1=$(echo $eval_files | awk -F "'" '{ print $2 }')
+dev_fus_file=$output_dir/lre22_dev_scores.tsv
+eval_fus_file=$output_dir/lre22_eval_scores.tsv
+
+if [ "$(hostname --domain)" == "cm.gemini" ];then
+  module load matlab
+fi
+
+if [ -f $dev_file_1 ];then
+  echo "
+addpath('./steps_be');
+addpath(genpath('$PWD/focal_multiclass/v1.0'));
+eval_fusion({$dev_files}, '$dev_fus_file', '$model_file');
+" | matlab -nodisplay -nosplash > $output_dir/eval_lre22_dev.log
+fi
+
+if [ -f $eval_file_1 ];then
+  echo "
+addpath('./steps_be');
+addpath(genpath('$PWD/focal_multiclass/v1.0'));
+eval_fusion({$eval_files}, '$eval_fus_file', '$model_file');
+" | matlab -nodisplay -nosplash > $output_dir/eval_lre22_eval.log
+fi
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/make_musan.py b/egs/lre22/fixed.v1.8k/local/make_musan.py
new file mode 100755
index 00000000..b0ae6846
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/make_musan.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+# Copyright 2015   David Snyder
+# Copyright 2019   Johns Hopkins University (Jesus Villalba) (added fs support)
+# Apache 2.0.
+#
+# This file is meant to be invoked by make_musan.sh.
+
+import os, sys
+
+
+def process_music_annotations(path):
+    utt2spk = {}
+    utt2vocals = {}
+    lines = open(path, "r").readlines()
+    for line in lines:
+        utt, genres, vocals, musician = line.rstrip().split()[:4]
+        # For this application, the musician ID isn't important
+        utt2spk[utt] = utt
+        utt2vocals[utt] = vocals == "Y"
+    return utt2spk, utt2vocals
+
+
+def prepare_music(root_dir, fs, use_vocals):
+    utt2vocals = {}
+    utt2spk = {}
+    utt2wav = {}
+    num_good_files = 0
+    num_bad_files = 0
+    music_dir = os.path.join(root_dir, "music")
+    for root, dirs, files in os.walk(music_dir):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if file.endswith(".wav"):
+                utt = str(file).replace(".wav", "")
+                utt2wav[utt] = file_path
+            elif str(file) == "ANNOTATIONS":
+                utt2spk_part, utt2vocals_part = process_music_annotations(file_path)
+                utt2spk.update(utt2spk_part)
+                utt2vocals.update(utt2vocals_part)
+    utt2spk_str = ""
+    utt2wav_str = ""
+    for utt in utt2vocals:
+        if utt in utt2wav:
+            if use_vocals or not utt2vocals[utt]:
+                utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+                if fs == 8:
+                    utt2wav_str = (
+                        utt2wav_str
+                        + utt
+                        + " sox -t wav "
+                        + utt2wav[utt]
+                        + " -r 8k -t wav - |\n"
+                    )
+                else:
+                    utt2wav_str = (
+                        utt2wav_str
+                        + utt
+                        + " sox -t wav "
+                        + utt2wav[utt]
+                        + " -r 16k -t wav - |\n"
+                    )
+                num_good_files += 1
+        else:
+            print("Missing file", utt)
+            num_bad_files += 1
+    print(
+        "In music directory, processed",
+        num_good_files,
+        "files;",
+        num_bad_files,
+        "had missing wav data",
+    )
+    return utt2spk_str, utt2wav_str
+
+
+def prepare_speech(root_dir, fs):
+    utt2spk = {}
+    utt2wav = {}
+    num_good_files = 0
+    num_bad_files = 0
+    speech_dir = os.path.join(root_dir, "speech")
+    for root, dirs, files in os.walk(speech_dir):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if file.endswith(".wav"):
+                utt = str(file).replace(".wav", "")
+                utt2wav[utt] = file_path
+                utt2spk[utt] = utt
+    utt2spk_str = ""
+    utt2wav_str = ""
+    for utt in utt2spk:
+        if utt in utt2wav:
+            utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+            if fs == 8:
+                utt2wav_str = (
+                    utt2wav_str
+                    + utt
+                    + " sox -t wav "
+                    + utt2wav[utt]
+                    + " -r 8k -t wav - |\n"
+                )
+            else:
+                utt2wav_str = (
+                    utt2wav_str
+                    + utt
+                    + " sox -t wav "
+                    + utt2wav[utt]
+                    + " -r 16k -t wav - |\n"
+                )
+            num_good_files += 1
+        else:
+            print("Missing file", utt)
+            num_bad_files += 1
+    print(
+        "In speech directory, processed",
+        num_good_files,
+        "files;",
+        num_bad_files,
+        "had missing wav data",
+    )
+    return utt2spk_str, utt2wav_str
+
+
+def prepare_noise(root_dir, fs):
+    utt2spk = {}
+    utt2wav = {}
+    num_good_files = 0
+    num_bad_files = 0
+    noise_dir = os.path.join(root_dir, "noise")
+    for root, dirs, files in os.walk(noise_dir):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if file.endswith(".wav"):
+                utt = str(file).replace(".wav", "")
+                utt2wav[utt] = file_path
+                utt2spk[utt] = utt
+    utt2spk_str = ""
+    utt2wav_str = ""
+    for utt in utt2spk:
+        if utt in utt2wav:
+            utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+            if fs == 8:
+                utt2wav_str = (
+                    utt2wav_str
+                    + utt
+                    + " sox -t wav "
+                    + utt2wav[utt]
+                    + " -r 8k -t wav - |\n"
+                )
+            else:
+                utt2wav_str = (
+                    utt2wav_str
+                    + utt
+                    + " sox -t wav "
+                    + utt2wav[utt]
+                    + " -r 16k -t wav - |\n"
+                )
+            num_good_files += 1
+        else:
+            print("Missing file", utt)
+            num_bad_files += 1
+    print(
+        "In noise directory, processed",
+        num_good_files,
+        "files;",
+        num_bad_files,
+        "had missing wav data",
+    )
+    return utt2spk_str, utt2wav_str
+
+
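+# Each prepare_* function above returns two newline-joined string blobs:
+# utt2spk lines ("<utt> <utt>", every file is its own speaker) and wav.scp
+# lines whose entries are sox pipes resampling to 8 or 16 kHz; main() below
+# concatenates the speech, music and noise sections and writes both files.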
+def main():
+    in_dir = sys.argv[1]
+    fs = int(sys.argv[2])
+    out_dir = sys.argv[3]
+    use_vocals = sys.argv[4] == "Y"
+    utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals)
+    utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs)
+    utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs)
+    utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise
+    utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise
+    wav_fi = open(os.path.join(out_dir, "wav.scp"), "w")
+    wav_fi.write(utt2wav)
+    utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), "w")
+    utt2spk_fi.write(utt2spk)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/lre22/fixed.v1.8k/local/make_musan.sh b/egs/lre22/fixed.v1.8k/local/make_musan.sh
new file mode 100755
index 00000000..4a6d30f9
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/make_musan.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Copyright 2015   David Snyder
+# Copyright 2019   Johns Hopkins University (Jesus Villalba) (added fs support)
+# Apache 2.0.
+#
+# This script, called by ../run.sh, creates the MUSAN
+# data directory. The required dataset is freely available at
+#   http://www.openslr.org/17/
+
+set -e
+use_vocals='Y'
+
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ];then
+  echo "Usage: $0 [options] <musan-root> <fs 8/16> <data-dir>";
+  echo "e.g.: $0 /export/corpora/JHU/musan 8 data"
+  exit 1;
+fi
+
+in_dir=$1
+fs=$2
+data_dir=$3
+
+mkdir -p $data_dir/musan.tmp
+
+echo "Preparing ${data_dir}/musan..."
+mkdir -p ${data_dir}/musan
+local/make_musan.py ${in_dir} $fs ${data_dir}/musan ${use_vocals}
+
+utils/fix_data_dir.sh ${data_dir}/musan
+
+grep "music" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_music
+grep "speech" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_speech
+grep "noise" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_noise
+utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_music \
+  ${data_dir}/musan ${data_dir}/musan_music
+utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_speech \
+  ${data_dir}/musan ${data_dir}/musan_speech
+utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_noise \
+  ${data_dir}/musan ${data_dir}/musan_noise
+
+utils/fix_data_dir.sh ${data_dir}/musan_music
+utils/fix_data_dir.sh ${data_dir}/musan_speech
+utils/fix_data_dir.sh ${data_dir}/musan_noise
+
+rm -rf $data_dir/musan.tmp
+
diff --git a/egs/lre22/fixed.v1.8k/local/make_rirs_data.sh b/egs/lre22/fixed.v1.8k/local/make_rirs_data.sh
new file mode 100755
index 00000000..c6652eda
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/make_rirs_data.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+#
+# Copyright 2020 Johns Hopkins University (Jesus Villalba)
+#
+# Apache 2.0.
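make_rirs_data.sh below lists the room impulse responses that reverb_aug later convolves with the training speech (with `rir_norm: max`). A minimal sketch of that reverberation step, assuming a 1-D float RIR array; hyperion's augmenter additionally compensates delay and limits the reverb context:

```python
import numpy as np
from scipy.signal import fftconvolve

def reverberate(speech, rir):
    """Convolve speech with a max-normalized RIR and crop to input length."""
    rir = rir / (np.max(np.abs(rir)) + 1e-10)  # rir_norm: max
    return fftconvolve(speech, rir, mode="full")[: len(speech)]
```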
+set -e
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <rir-dir> <fs 8/16> <data-dir>"
+  echo "e.g.: $0 RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom"
+  exit 1
+fi
+
+rir_dir=$1
+fs=$2
+data_dir=$3
+
+mkdir -p $data_dir
+
+rir_list=$rir_dir/rir_list
+if [ "$fs" -eq 16 ];then
+  awk '{ key=$5; sub(/.*\//,"",key); print key,$5 }' $rir_list > $data_dir/wav.scp
+else
+  awk '{
+key=$5; sub(/.*\//,"",key);
+print key,"sox "$5" -r 8000 -t wav -b 16 -e signed-integer - |" }' \
+    $rir_list > $data_dir/wav.scp
+fi
+awk '{ key=$5; sub(/.*\//,"",key); print key,$4 }' $rir_list > $data_dir/rir2room
+
diff --git a/egs/lre22/fixed.v1.8k/local/make_sre16_train_dev.sh b/egs/lre22/fixed.v1.8k/local/make_sre16_train_dev.sh
new file mode 100755
index 00000000..f861a8f4
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/make_sre16_train_dev.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+# Copyright 2020 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <input-path> <fs 8/16> <output-path>"
+  exit 1
+fi
+input_path=$1
+fs=$2
+output_path=$3
+
+docs=$input_path/docs
+meta=$input_path/metadata
+call2lang=$meta/calls.tsv
+call2spk=$meta/call_sides.tsv
+spk2gender=$meta/subjects.tsv
+segm_file=$docs/sre16_dev_segment_key.tsv
+
+tel_up=""
+if [ $fs -eq 16 ];then
+  tel_up=" sox -t wav - -t wav -r 16k - |"
+fi
+
+#Dev: Mandarin and Cebuano
+for lang in cmn ceb
+do
+  output_dir=$output_path/sre16_train_dev_$lang
+  mkdir -p $output_dir
+  awk -v c2l=$call2lang -v c2s=$call2spk -v s2g=$spk2gender -v l=$lang -F "\t" 'BEGIN{
+while(getline < c2l)
+{
+  if($2 == l){ calls[$1]=1 }
+}
+while(getline < c2s) { spk[$1]=$3 }
+while(getline < s2g) { gender[$1]=tolower($2) }
+}
+{ if($2 in calls) { s=spk[$2]; print $1, s, gender[s] }}' $segm_file > $output_dir/table
+
+  awk '{ print $2"-"$1,$2}' $output_dir/table | sort -k1,1 > $output_dir/utt2spk
+  utils/utt2spk_to_spk2utt.pl $output_dir/utt2spk > $output_dir/spk2utt
+  awk '{ print $2,$3}' $output_dir/table | sort -k1,1 -u > $output_dir/spk2gender
+  awk -v lang=$lang 'BEGIN{if(lang=="cmn"){lang_ldc="zho-cmn"} else { lang_ldc="ceb-ceb" }} { print $1,lang_ldc}' $output_dir/utt2spk > $output_dir/utt2lang
+
+  find -L $input_path -name "*.sph" > $output_dir/wav.scp.tmp
+
+  awk -v fwav=$output_dir/wav.scp.tmp 'BEGIN{
+while(getline < fwav)
+{
+  bn=$1;
+  sub(/.*\//,"",bn);
+  sub(/\.sph$/,"",bn);
+  wav[bn]=$1;
+}
+}
+{ print $2"-"$1,"sph2pipe -f wav -p -c 1 "wav[$1]" |'"$tel_up"'"}' $output_dir/table | \
+    sort -k1,1 > $output_dir/wav.scp
+
+  rm -f $output_dir/wav.scp.tmp
+  utils/fix_data_dir.sh $output_dir
+  utils/validate_data_dir.sh --no-text --no-feats $output_dir
+done
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/make_sre16_train_eval.sh b/egs/lre22/fixed.v1.8k/local/make_sre16_train_eval.sh
new file mode 100755
index 00000000..3589a60e
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/make_sre16_train_eval.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Copyright 2020 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <input-path> <fs 8/16> <output-path>"
+  exit 1
+fi
+input_path=$1
+fs=$2
+output_path=$3
+
+docs=$input_path/docs
+meta=$input_path/metadata
+call2lang=$meta/calls.tsv
+call2spk=$meta/call_sides.tsv
+spk2gender=$meta/subjects.tsv
+segm_file=$docs/sre16_eval_segment_key.tsv
+
+tel_up=""
+if [ $fs -eq 16 ];then
+  tel_up=" sox -t wav - -t wav -r 16k - |"
+fi
+
+#Eval: Cantonese and Tagalog
+for lang in yue tgl
+do
+  output_dir=$output_path/sre16_train_eval_$lang
+  mkdir -p $output_dir
+  awk -v c2l=$call2lang -v c2s=$call2spk -v s2g=$spk2gender -v l=$lang -F "\t" 'BEGIN{
+while(getline < c2l)
+{
+  if($2 == l){ calls[$1]=1 }
+}
+while(getline < c2s) { spk[$1]=$3 }
+while(getline < s2g) { gender[$1]=tolower($2) }
+}
+{ if($2 in calls) { s=spk[$2]; print $1, s, gender[s] }}' $segm_file > $output_dir/table
+
+  awk '{ print $2"-"$1,$2}' $output_dir/table | sort -k1,1 > $output_dir/utt2spk
+  utils/utt2spk_to_spk2utt.pl $output_dir/utt2spk > $output_dir/spk2utt
+  awk '{ print $2,$3}' $output_dir/table | sort -k1,1 -u > $output_dir/spk2gender
+  awk -v lang=$lang 'BEGIN{if(lang=="yue"){lang_ldc="zho-yue"} else { lang_ldc="tl-tl" }} { print $1,lang_ldc}' $output_dir/utt2spk > $output_dir/utt2lang
+
+
+  find -L $input_path -name "*.sph" > $output_dir/wav.scp.tmp
+
+  awk -v fwav=$output_dir/wav.scp.tmp 'BEGIN{
+while(getline < fwav)
+{
+  bn=$1;
+  sub(/.*\//,"",bn);
+  sub(/\.sph$/,"",bn);
+  wav[bn]=$1;
+}
+}
+{ print $2"-"$1,"sph2pipe -f wav -p -c 1 "wav[$1]" |'"$tel_up"'"}' $output_dir/table | \
+    sort -k1,1 > $output_dir/wav.scp
+
+  rm -f $output_dir/wav.scp.tmp
+  utils/fix_data_dir.sh $output_dir
+  utils/validate_data_dir.sh --no-text --no-feats $output_dir
+done
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/make_sre18_dev_unlabeled.sh b/egs/lre22/fixed.v1.8k/local/make_sre18_dev_unlabeled.sh
new file mode 100755
index 00000000..5d49bba7
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/make_sre18_dev_unlabeled.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Copyright 2018 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <input-path> <fs 8/16> <output-path>"
+  exit 1
+fi
+input_path=$1
+fs=$2
+output_path=$3
+
+docs=$input_path/docs
+#enroll_file=$docs/sre18_dev_enrollment.tsv
+#enroll_diar_file=$docs/sre18_dev_enrollment_diarization.tsv
+segm_file=$docs/sre18_dev_segment_key.tsv
+#trial_file=$docs/sre18_dev_trials.tsv
+#key_file=$docs/sre18_dev_trial_key.tsv
+
+tel_up=""
+if [ $fs -eq 16 ];then
+  tel_up=" sox -t wav - -t wav -r 16k - |"
+fi
+
+#Unlabeled
+unlab_dir=$output_path/sre18_dev_unlabeled
+mkdir -p $unlab_dir
+awk '/unlabeled/ { print $1,"sph2pipe -f wav -p -c 1 '$input_path'/data/unlabeled/"$1" |'"$tel_up"'"}' $segm_file | \
+  sort -k1,1 > $unlab_dir/wav.scp
+awk '/unlabeled/ { print $1,$1}' $segm_file | sort -k1,1 > $unlab_dir/utt2spk
+cp $unlab_dir/utt2spk $unlab_dir/spk2utt
+awk '{ print $1,"ara-aeb" }' $unlab_dir/utt2spk > $unlab_dir/utt2lang
+
+utils/fix_data_dir.sh $unlab_dir
+utils/validate_data_dir.sh --no-text --no-feats $unlab_dir
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/make_sre18_train_dev.sh b/egs/lre22/fixed.v1.8k/local/make_sre18_train_dev.sh
new file mode 100755
index 00000000..9e6ff763
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/make_sre18_train_dev.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+# Copyright 2018 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <input-path> <fs 8/16> <output-path>"
+  exit 1
+fi
+input_path=$1
+fs=$2
+output_path=$3
+
+docs=$input_path/docs
+segm_file=$docs/sre18_dev_segment_key.tsv
+
+tel_up=""
+vid_down=""
+if [ $fs -eq 16 ];then
+  tel_up=" sox -t wav - -t wav -r 16k - |"
+fi
+
+
+#Dev CMN2
+output_dir=$output_path/sre18_train_dev_cmn2
+mkdir -p $output_dir
+awk '$7=="cmn2" && $4 != "unlabeled" { print $2"-"$1,$2}' $segm_file | sort -k1,1 > $output_dir/utt2spk
+utils/utt2spk_to_spk2utt.pl $output_dir/utt2spk > $output_dir/spk2utt
+awk '{ print $1,"ara-aeb" }' $output_dir/utt2spk > $output_dir/utt2lang
+
+find -L $input_path -name "*.sph" > $output_dir/wav.scp.tmp
+
+awk -v fwav=$output_dir/wav.scp.tmp 'BEGIN{
+while(getline < fwav)
+{
+  bn=$1;
+  sub(/.*\//,"",bn);
+  wav[bn]=$1;
+}
+}
+$7=="cmn2" && $4 != "unlabeled" {
print $2"-"$1,"sph2pipe -f wav -p -c 1 "wav[$1]" |'"$tel_up"'"}' $segm_file | \
+  sort -k1,1 > $output_dir/wav.scp
+
+rm -f $output_dir/wav.scp.tmp
+
+awk -v sf=$segm_file 'BEGIN{
+while(getline < sf)
+{
+  gender[$1]=substr($3,1,1)
+}
+}
+{ sub(/^[^-]*-/,"",$2); print $1,gender[$2] } ' $output_dir/spk2utt > $output_dir/spk2gender
+
+utils/fix_data_dir.sh $output_dir
+utils/validate_data_dir.sh --no-text --no-feats $output_dir
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/make_sre18_train_eval.sh b/egs/lre22/fixed.v1.8k/local/make_sre18_train_eval.sh
new file mode 100755
index 00000000..33ff5a5a
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/make_sre18_train_eval.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Copyright 2018 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <input-path> <fs 8/16> <output-path>"
+  exit 1
+fi
+input_path=$1
+fs=$2
+output_path=$3
+
+docs=$input_path/docs
+segm_file=$docs/sre18_eval_segment_key.tsv
+
+tel_up=""
+vid_down=""
+if [ $fs -eq 16 ];then
+  tel_up=" sox -t wav - -t wav -r 16k - |"
+  vid_down=" -r 16k "
+elif [ $fs -eq 8 ];then
+  vid_down=" -r 8k "
+fi
+
+
+#Eval CMN2
+output_dir=$output_path/sre18_train_eval_cmn2
+mkdir -p $output_dir
+awk '$7=="cmn2" && $4 != "unlabeled" { print $2"-"$1,$2}' $segm_file | sort -k1,1 > $output_dir/utt2spk
+utils/utt2spk_to_spk2utt.pl $output_dir/utt2spk > $output_dir/spk2utt
+awk '{ print $1,"ara-aeb" }' $output_dir/utt2spk > $output_dir/utt2lang
+
+find $input_path -name "*.sph" > $output_dir/wav.scp.tmp
+
+awk -v fwav=$output_dir/wav.scp.tmp 'BEGIN{
+while(getline < fwav)
+{
+  bn=$1;
+  sub(/.*\//,"",bn);
+  wav[bn]=$1;
+}
+}
+$7=="cmn2" && $4 != "unlabeled" { print $2"-"$1,"sph2pipe -f wav -p -c 1 "wav[$1]" |'"$tel_up"'"}' $segm_file | \
+  sort -k1,1 > $output_dir/wav.scp
+
+rm -f $output_dir/wav.scp.tmp
+
+awk -v sf=$segm_file 'BEGIN{
+while(getline < sf)
+{
+  gender[$1]=substr($3,1,1)
+}
+}
+{ sub(/^[^-]*-/,"",$2); print $1,gender[$2] } ' $output_dir/spk2utt > $output_dir/spk2gender
+
+utils/fix_data_dir.sh $output_dir
+utils/validate_data_dir.sh --no-text --no-feats $output_dir
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/make_sre19cmn2_eval.sh b/egs/lre22/fixed.v1.8k/local/make_sre19cmn2_eval.sh
new file mode 100755
index 00000000..d6f877f5
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/make_sre19cmn2_eval.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Copyright 2019 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <input-path> <fs 8/16> <output-path>"
+  exit 1
+fi
+input_path=$1
+fs=$2
+output_path=$3
+
+docs=$input_path/docs
+enroll_file=$docs/sre19_cts_challenge_enrollment.tsv
+trial_file=$docs/sre19_cts_challenge_trials.tsv
+key_file=$docs/sre19_cts_challenge_trial_key.tsv
+
+tel_up=""
+vid_down=""
+if [ $fs -eq 16 ];then
+  tel_up=" sox -t wav - -t wav -r 16k - |"
+fi
+
+#Enrollment CMN2
+enroll_dir=$output_path/sre19_eval_enroll_cmn2
+mkdir -p $enroll_dir
+awk '/\.sph/ { print $1"-"$2,"sph2pipe -f wav -p -c 1 '$input_path'/data/enrollment/"$2" |'"$tel_up"'"}' $enroll_file | \
+  sort -k1,1 > $enroll_dir/wav.scp
+awk '!/modelid/ && /\.sph/ { print $1"-"$2,$1}' $enroll_file | sort -k1,1 > $enroll_dir/utt2spk
+utils/utt2spk_to_spk2utt.pl $enroll_dir/utt2spk > $enroll_dir/spk2utt
+awk '{ print $1,"ara-aeb" }' $enroll_dir/utt2spk > $enroll_dir/utt2lang
+
+utils/fix_data_dir.sh $enroll_dir
+utils/validate_data_dir.sh --no-text --no-feats $enroll_dir
+
+
+#Test set CMN2
+test_dir=$output_path/sre19_eval_test_cmn2
+mkdir -p $test_dir
+awk '/\.sph/ { print $2,"sph2pipe -f wav -p -c 1 '$input_path'/data/test/"$2" |'"$tel_up"'"}' $trial_file | \
+  sort -u -k1,1 > $test_dir/wav.scp
+awk '{ print $1,$1}' $test_dir/wav.scp | sort -k1,1 > $test_dir/utt2spk
+cp $test_dir/utt2spk $test_dir/spk2utt
+awk '{ print $1,"ara-aeb" }' $test_dir/utt2spk > $test_dir/utt2lang
+awk '!/modelid/ { print $1,$2,$4 }' $key_file > $test_dir/trials
+
+cp $trial_file $test_dir/trials.tsv
+cp $key_file $test_dir/trial_key.tsv
+
+utils/fix_data_dir.sh $test_dir
+utils/validate_data_dir.sh --no-text --no-feats $test_dir
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/merge_scores.py b/egs/lre22/fixed.v1.8k/local/merge_scores.py
new file mode 100755
index 00000000..8d0df80e
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/merge_scores.py
@@ -0,0 +1,32 @@
+#!/bin/env python
+"""
+  Copyright 2021 Johns Hopkins University (Author: Jesus Villalba)
+  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+from jsonargparse import ArgumentParser, namespace_to_dict
+import logging
+from pathlib import Path
+import pandas as pd
+
+
+def merge_scores(in_score_files, out_score_file):
+
+    dfs = []
+    for f in in_score_files:
+        df_f = pd.read_csv(f, sep="\t")
+        dfs.append(df_f)
+
+    df = pd.concat(dfs)
+    df.sort_values(by="segmentid", inplace=True)
+    df.to_csv(out_score_file, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(
+        description="Merges several score files into a single one"
+    )
+    parser.add_argument("--in-score-files", nargs="+", required=True)
+    parser.add_argument("--out-score-file", required=True)
+    args = parser.parse_args()
+    merge_scores(**namespace_to_dict(args))
diff --git a/egs/lre22/fixed.v1.8k/local/prepare_adi17.py b/egs/lre22/fixed.v1.8k/local/prepare_adi17.py
new file mode 100755
index 00000000..c04d988b
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/prepare_adi17.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python
+# prepare_adi17.py --corpus-dir /export/corpora6/ADI17 --output-dir data/adi17 --map-langs-to-lre-codes --target-fs 8000
+"""
+  Copyright 2021 Johns Hopkins University (Author: Jesus Villalba)
+  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo
+import logging
+from pathlib import Path
+import glob
+import numpy as np
+import pandas as pd
+
+from hyperion.hyp_defs import config_logger
+
+lre_map = {
+    "ALG": "ara-arq",
+    "EGY": "ara-arz",
+    "IRA": "ara-acm",
+    "JOR": "ara-jor",
+    "KSA": "ara-ksa",
+    "KUW": "ara-kuw",
+    "LEB": "ara-leb",
+    "LIB": "ara-ayl",
+    "MAU": "ara-mau",
+    "MOR": "ara-mor",
+    "OMA": "ara-oma",
+    "PAL": "ara-pal",
+    "QAT": "ara-qat",
+    "SUD": "ara-sud",
+    "SYR": "ara-syr",
+    "UAE": "ara-uae",
+    "YEM": "ara-yem"
+}
+
+
+def map_to_lre(langs):
+    return [lre_map[l] for l in langs]
+
+
+def make_kaldi(df, wav_dir, output_dir, target_fs):
+    # make wav.scp
+    logging.info("making wav.scp")
+    with open(output_dir / "wav.scp", "w") as f:
+        for _, row in df.iterrows():
+            segment_id = row["id"]
+            filename = row["filename"]
+            if target_fs != 16000:
+                wav = "sox {} -t wav -r {} - |".format(filename, target_fs)
+            else:
+                wav = filename
+
+            f.write("{} {}\n".format(segment_id, wav))
+
+    # Kaldi data directory files
+    # utt2xxx files
+    logging.info("saving Kaldi utt2xxx files")
+    columns = [
+        "id",
+        "id",
+        "language",
+    ]
+    files = [
+        "utt2spk",
+        "spk2utt",
+        "utt2lang",
+    ]
+    for c, f in zip(columns, files):
+        output_file = output_dir / f
+        if c in df:
+            df.to_csv(output_file,
+                      sep=" ",
+                      columns=["id", c],
+                      header=False,
+                      index=False)
+
+
+def prepare_adi17(corpus_dir,
output_dir, remove_langs, map_langs_to_lre_codes, + target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir + train_files = glob.glob(str(corpus_dir / "train_segments/*/*.wav"), + recursive=True) + train_ids = [Path(f).stem for f in train_files] + train_langs = [Path(f).parent.stem for f in train_files] + dev_files = glob.glob(str(corpus_dir / "dev_segments/*.wav"), + recursive=True) + test_files = glob.glob(str(corpus_dir / "test_segments/*.wav"), + recursive=True) + dev_test_files = dev_files + test_files + df_labels = pd.concat([ + pd.read_csv(str(corpus_dir / "adi17_official_dev_label.txt"), + delim_whitespace=True), + pd.read_csv(str(corpus_dir / "adi17_official_test_label.txt"), + delim_whitespace=True) + ]) + df_labels = df_labels.set_index("id") + dev_test_ids = [Path(f).stem for f in dev_test_files] + dev_test_langs = df_labels.loc[dev_test_ids, "label"].values + all_ids = train_ids + dev_test_ids + all_files = train_files + dev_test_files + all_langs = list(train_langs) + list(dev_test_langs) + if map_langs_to_lre_codes: + all_langs = map_to_lre(all_langs) + + all_ids = [f"{a}-{b}" for a, b in zip(all_langs, all_ids)] + df = pd.DataFrame({ + "id": all_ids, + "language": all_langs, + "filename": all_files + }) + if remove_langs is not None: + for lang in remove_langs: + df = df[df["language"] != lang] + + df["sample_coding"] = "pcm" + df["source"] = "afv" + df["corpus_id"] = corpus_dir.stem + df["sample_rate"] = target_fs + + # sort by segment id + df.sort_values(by="id", inplace=True) + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares ADI17 for training") + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument("--remove-langs", + default=None, + nargs="+", + help="languages to remove") + parser.add_argument( + "--map-langs-to-lre-codes", + default=False, + action=ActionYesNo, + help="use LRE17 language codes", + ) + + parser.add_argument("--target-fs", + default=8000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_adi17(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_ast.py b/egs/lre22/fixed.v1.8k/local/prepare_ast.py new file mode 100755 index 00000000..957ee9bf --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_ast.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python +# prepare_ast.py --corpus-dir /export/corpora6/LRE/AST2004 --output-dir data/ast --map-langs-to-lre-codes --target-fs 8000 +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + + +lre_map = { + "afr": "afr-afr", + "ndb": "nbl-nbl", + "oro": "orm-orm", + "tso": "tso-tso", + "ven": "ven-ven", + 
"xho": "xho-xho", + "zul": "zul-zul", + "tig": "tir-tir", + "sae": "eng-ens", + "ine": "eng-iaf", + "tun": "ara-aeb", + "alg": "ara-arq", + "lib": "ara-ayl", + "naf": "fra-ntf", + "aa": "afr-afr", + "ba": "afr-afr", + "ca": "afr-afr", + "ae": "eng-ens", + "be": "eng-ens", + "ce": "eng-ens", +} + + +def map_to_lre(langs): + return [lre_map[l] if l in lre_map else "{}-{}".format(l, l) for l in langs] + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + if target_fs != 16000: + wav = "sox -t raw -e a-law -r 8000 {} -t wav -e signed-integer -b 16 -r {} - |".format(filename, target_fs) + else: + wav = "sox -t raw -e a-law -r 8000 {} -t wav -e signed-integer -b 16 -r 16000 - |".format(filename) + + f.write("{} {}\n".format(segment_id, wav)) + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv( + output_file, sep=" ", columns=["id", c], header=False, index=False + ) + + +def prepare_ast( + corpus_dir, output_dir, remove_langs, map_langs_to_lre_codes, target_fs, verbose +): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir + files = glob.glob(str(corpus_dir / "*/*/*/*/*.alaw")) + langs = [(Path(f).parent.parent.parent.parent.stem).lower() for f in files] + files2 = glob.glob(str(corpus_dir / "*/*/*/*.alaw")) + langs2 = [(Path(f).parent.parent.parent.stem).lower() for f in files2] + files = files + files2 + langs = langs + langs2 + files = [f for f, l in zip(files, langs) if l not in ['ee']] + langs = [l for l in langs if l not in ['ee']] + if map_langs_to_lre_codes: + langs = map_to_lre(langs) + ids = ["{}-{}".format(l, Path(f).stem) for f, l in zip(files, langs)] + df = pd.DataFrame({"id": ids, "language": langs, "filename": files}) + if remove_langs is not None: + for lang in remove_langs: + df = df[df["language"] != lang] + + df["sample_coding"] = "pcm" + df["source"] = "cts" + df["corpus_id"] = corpus_dir.stem + df["sample_rate"] = target_fs + + # sort by segment id + df.sort_values(by="id", inplace=True) + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__":#ast + + parser = ArgumentParser(description="Prepares AST for training") + parser.add_argument( + "--corpus-dir", required=True, help="Path to the original dataset" + ) + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument( + "--remove-langs", default=None, nargs="+", help="languages to remove" + ) + parser.add_argument( + "--map-langs-to-lre-codes", + default=False, + action=ActionYesNo, + help="use LRE17 language codes", + ) + + parser.add_argument( + "--target-fs", default=8000, type=int, help="Target sampling frequency" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + prepare_ast(**namespace_to_dict(args)) diff --git 
a/egs/lre22/fixed.v1.8k/local/prepare_babel.py b/egs/lre22/fixed.v1.8k/local/prepare_babel.py new file mode 100755 index 00000000..4eb18945 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_babel.py @@ -0,0 +1,108 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict +import logging +from pathlib import Path +import glob +import re +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + + +def make_kaldi(df, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + source = row["source"] + if source == "cts": + wav = f"sph2pipe -f wav -p -c 1 {filename} |" + if target_fs != 8000: + wav = f"{wav} sox -t wav - -t wav -r {target_fs} - |" + else: + wav = f"sox {filename} -t wav -r {target_fs} - |" + + f.write(f"{segment_id} {wav}\n") + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv(output_file, + sep=" ", + columns=["id", c], + header=False, + index=False) + + +def prepare_babel(corpus_dir, lang_code, output_dir, target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + logging.info("searching audio files") + wavs = glob.glob(str(corpus_dir / "**/audio/*.sph"), recursive=True) + logging.info("found %d files", len(wavs)) + wavs = [corpus_dir / w for w in wavs] + seg_ids = [w.stem for w in wavs] + df = pd.DataFrame({"id": seg_ids, "filename": wavs}) + + # sort by segment id + df.sort_values(by="id", inplace=True) + df["corpus_id"] = "babel" + df["sample_rate"] = target_fs + df["language"] = lang_code + df["source"] = "cts" + logging.info("saving files") + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Prepares Babel datasets for training in LRE") + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument( + "--lang-code", + required=True, + help="language code", + ) + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument("--target-fs", + default=8000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_babel(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_common_voice.py b/egs/lre22/fixed.v1.8k/local/prepare_common_voice.py new file mode 100755 index 00000000..411ae94a --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_common_voice.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python +# prepare_common_voice.py --corpus-dir /export/corpora6/LRE/CommonVoice2020 --output-dir data/cv --map-langs-to-lre-codes --target-fs 8000 +""" + Copyright 2021 Johns Hopkins 
University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import numpy as np +import pandas as pd +from hyperion.hyp_defs import config_logger + +lre_map = { + "afr": "afr-afr", + "ndb": "nbl-nbl", + "oro": "orm-orm", + "tso": "tso-tso", + "ven": "ven-ven", + "xho": "xho-xho", + "zul": "zul-zul", + "tig": "tir-tir", + "sae": "eng-ens", + "ine": "eng-ine", + "tun": "ara-aeb", + "alg": "ara-arq", + "lib": "ara-ayl", + "naf": "fra-ntf" +} + + +def map_to_lre(langs): + return [ + lre_map[l] if l in lre_map else "{}-{}".format(l, l) for l in langs + ] + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + if target_fs != 16000: + wav = "ffmpeg -i {} -acodec pcm_s16le -ar {} -f wav - |".format( + filename, target_fs) + else: + wav = "ffmpeg -i {} -acodec pcm_s16le -f wav - |".format( + filename) + + f.write("{} {}\n".format(segment_id, wav)) + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv(output_file, + sep=" ", + columns=["id", c], + header=False, + index=False) + + +def prepare_common_voice(corpus_dir, output_dir, keep_langs, + map_langs_to_lre_codes, target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir + files = glob.glob(str(corpus_dir / "**/clips/*.mp3"), recursive=True) + langs = [(Path(f).parent.parent.stem).lower() for f in files] + if map_langs_to_lre_codes: + langs = map_to_lre(langs) + ids = ["{}-{}".format(l, Path(f).stem) for f, l in zip(files, langs)] + df = pd.DataFrame({"id": ids, "language": langs, "filename": files}) + df = df[df["language"].isin(keep_langs)] + # if remove_langs is not None: + # for lang in remove_langs: + # df = df[df["language"] != lang] + + df["sample_coding"] = "pcm" + df["source"] = "afv" + df["corpus_id"] = "cv" + df["sample_rate"] = target_fs + + # sort by segment id + df.sort_values(by="id", inplace=True) + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares Common Voice for training") + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument("--keep-langs", + default=["tir-tir"], + nargs="+", + help="languages to keep") + + # parser.add_argument("--remove-langs", + # default=None, + # nargs="+", + # help="languages to remove") + parser.add_argument( + "--map-langs-to-lre-codes", + default=False, + action=ActionYesNo, + help="use LRE17 language codes", + ) + + parser.add_argument("--target-fs", + default=8000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + 
dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_common_voice(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_common_voice_accents.py b/egs/lre22/fixed.v1.8k/local/prepare_common_voice_accents.py new file mode 100755 index 00000000..4c44b7f7 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_common_voice_accents.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python +# prepare_common_voice.py --corpus-dir /export/corpora6/LRE/CommonVoice2020 --output-dir data/cv --map-langs-to-lre-codes --target-fs 8000 +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import numpy as np +import pandas as pd +from hyperion.hyp_defs import config_logger + +en_map = {"indian": "eng-ine"} +fr_map = { + "france": "fra-fra", + "canada": "fra-can", + "algeria": "fra-ntf", + "morocco": "fra-ntf", + "tunisia": "fra-ntf", +} + +lre_map = { + "en": en_map, + "fr": fr_map, +} + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + if target_fs != 16000: + wav = "ffmpeg -i {} -acodec pcm_s16le -ar {} -f wav - |".format( + filename, target_fs) + else: + wav = "ffmpeg -i {} -acodec pcm_s16le -f wav - |".format( + filename) + + f.write("{} {}\n".format(segment_id, wav)) + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv(output_file, + sep=" ", + columns=["id", c], + header=False, + index=False) + + +def prepare_common_voice(corpus_dir, output_dir, lang, target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir + my_map = lre_map[lang] + df = pd.read_csv(corpus_dir / lang / "validated.tsv", sep="\t") + mask = None + for dialect in my_map.keys(): + mask_d = df["accent"] == dialect + if mask is None: + mask = mask_d + else: + mask = np.logical_or(mask, mask_d) + + df = df.loc[mask] + files = df["path"] + files = [corpus_dir / lang / "clips" / f for f in df["path"]] + langs = [my_map[l] for l in df["accent"]] + ids = ["{}-{}".format(l, Path(f).stem) for f, l in zip(files, langs)] + df = pd.DataFrame({"id": ids, "language": langs, "filename": files}) + + df["sample_coding"] = "pcm" + df["source"] = "afv" + df["corpus_id"] = "cv" + df["sample_rate"] = target_fs + + # sort by segment id + df.sort_values(by="id", inplace=True) + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Prepares Common Voice Accents for training in LRE22") + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument("--output-dir", required=True, help="data path") + 
parser.add_argument("--lang", + default="en", + choices=["en", "fr"], + help="languages") + + parser.add_argument("--target-fs", + default=8000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_common_voice(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_common_voice_accents_cat.py b/egs/lre22/fixed.v1.8k/local/prepare_common_voice_accents_cat.py new file mode 100755 index 00000000..bf9d79ed --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_common_voice_accents_cat.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python +# prepare_common_voice.py --corpus-dir /export/corpora6/LRE/CommonVoice2020 --output-dir data/cv --map-langs-to-lre-codes --target-fs 8000 +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import numpy as np +import pandas as pd +from hyperion.hyp_defs import config_logger + +en_map = {"indian": "eng-ine"} +fr_map = { + "france": "fra-fra", + "canada": "fra-can", + "algeria": "fra-ntf", + "morocco": "fra-ntf", + "tunisia": "fra-ntf", +} + +lre_map = { + "en": en_map, + "fr": fr_map, +} + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + list_dir = output_dir / "lists_cat" + list_dir.mkdir(parents=True, exist_ok=True) + for r in range(len(df)): + file_list = df.iloc[r].file_lists + with open(list_dir / f"{df.iloc[r].id}.txt", "w") as f: + for fn in file_list: + f.write("file %s\n" % fn) + + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = list_dir / f"{segment_id}.txt" + if target_fs != 16000: + wav = f"ffmpeg -f concat -safe 0 -i {filename} -acodec pcm_s16le -ar {target_fs} -f wav - |" + else: + wav = f"ffmpeg -f concat -safe 0 -i {filename} -acodec pcm_s16le -f wav - |" + + f.write("{} {}\n".format(segment_id, wav)) + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv(output_file, + sep=" ", + columns=["id", c], + header=False, + index=False) + + +def prepare_common_voice(corpus_dir, output_dir, lang, target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir + my_map = lre_map[lang] + df = pd.read_csv(corpus_dir / lang / "validated.tsv", sep="\t") + mask = None + for dialect in my_map.keys(): + mask_d = df["accent"] == dialect + if mask is None: + mask = mask_d + else: + mask = np.logical_or(mask, mask_d) + + df = df.loc[mask] + files = df["path"] + files = [corpus_dir / lang / "clips" / f for f in df["path"]] + langs = [my_map[l] for l in df["accent"]] + ids = ["{}-{}".format(l, Path(f).stem) for f, l in zip(files, langs)] + df = pd.DataFrame({ + "id": ids, + "language": langs, + "filename": files, + "speaker": df["client_id"] + }) + + # sort by speaker, id + df.sort_values(by=["speaker", "id"], inplace=True) + + file_lists = [] + file_list = [] + seg_count = 0 + prev_spk = "" + cat_segs = [] + cur_seg = 0 + for r in 
range(len(df)): + row = df.iloc[r] + if seg_count == 5 or (row.speaker != prev_spk and seg_count > 0): + file_lists.append(file_list) + cat_segs.append(cur_seg) + file_list = [] + seg_count = 0 + cur_seg = r + + file_list.append(row.filename) + seg_count += 1 + prev_spk = row.speaker + + if file_list: + file_lists.append(file_list) + cat_segs.append(cur_seg) + + df_cat = df.iloc[cat_segs].drop(["filename"], axis=1) + df_cat["file_lists"] = file_lists + + df_cat["sample_coding"] = "pcm" + df_cat["source"] = "afv" + df_cat["corpus_id"] = "cv" + df_cat["sample_rate"] = target_fs + + # sort by segment id + df_cat.sort_values(by="id", inplace=True) + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df_cat.drop(["file_lists"], axis=1).to_csv(output_file, + sep=",", + index=False) + + make_kaldi(df_cat, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Prepares Common Voice Accents for training in LRE22") + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument("--lang", + default="en", + choices=["en", "fr"], + help="languages") + + parser.add_argument("--target-fs", + default=8000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_common_voice(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_common_voice_cat.py b/egs/lre22/fixed.v1.8k/local/prepare_common_voice_cat.py new file mode 100755 index 00000000..0790be25 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_common_voice_cat.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python +# prepare_common_voice.py --corpus-dir /export/corpora6/LRE/CommonVoice2020 --output-dir data/cv --map-langs-to-lre-codes --target-fs 8000 +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import numpy as np +import pandas as pd +from hyperion.hyp_defs import config_logger + +lre_map = { + "afr": "afr-afr", + "ndb": "nbl-nbl", + "oro": "orm-orm", + "tso": "tso-tso", + "ven": "ven-ven", + "xho": "xho-xho", + "zul": "zul-zul", + "tig": "tir-tir", + "sae": "eng-ens", + "ine": "eng-ine", + "tun": "ara-aeb", + "alg": "ara-arq", + "lib": "ara-ayl", + "naf": "fra-ntf" +} + + +def map_to_lre(langs): + return [ + lre_map[l] if l in lre_map else "{}-{}".format(l, l) for l in langs + ] + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + list_dir = output_dir / "lists_cat" + list_dir.mkdir(parents=True, exist_ok=True) + for r in range(len(df)): + file_list = df.iloc[r].file_lists + with open(list_dir / f"{df.iloc[r].id}.txt", "w") as f: + for fn in file_list: + f.write("file %s\n" % fn) + + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = list_dir / f"{segment_id}.txt" + if target_fs != 16000: + wav = f"ffmpeg -f concat -safe 0 -i {filename} -acodec pcm_s16le -ar {target_fs} -f wav - |" + else: + wav = f"ffmpeg -f concat -safe 0 -i {filename} -acodec pcm_s16le -f wav - |" + + 
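+            # Note (assumed ffmpeg semantics): the list files written above use
+            # the concat demuxer syntax, one "file <path>" entry per line, and
+            # "-safe 0" is needed for ffmpeg to accept absolute paths.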
f.write("{} {}\n".format(segment_id, wav)) + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv(output_file, + sep=" ", + columns=["id", c], + header=False, + index=False) + + +def prepare_common_voice(corpus_dir, output_dir, keep_langs, + map_langs_to_lre_codes, target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir + files = glob.glob(str(corpus_dir / "**/clips/*.mp3"), recursive=True) + langs = [(Path(f).parent.parent.stem).lower() for f in files] + if map_langs_to_lre_codes: + langs = map_to_lre(langs) + ids = ["{}-{}".format(l, Path(f).stem) for f, l in zip(files, langs)] + df = pd.DataFrame({"id": ids, "language": langs, "filename": files}) + df = df[df["language"].isin(keep_langs)] + # if remove_langs is not None: + # for lang in remove_langs: + # df = df[df["language"] != lang] + + df["sample_coding"] = "pcm" + df["source"] = "afv" + df["corpus_id"] = "cv" + df["sample_rate"] = target_fs + + # sort by segment id + df.sort_values(by="id", inplace=True) + + file_lists = [] + file_list = [] + seg_count = 0 + prev_lang = "" + cat_segs = [] + cur_seg = 0 + for r in range(len(df)): + row = df.iloc[r] + if seg_count == 5 or (row.language != prev_lang and seg_count > 0): + file_lists.append(file_list) + cat_segs.append(cur_seg) + file_list = [] + seg_count = 0 + cur_seg = r + + file_list.append(row.filename) + seg_count += 1 + prev_lang = row.language + + if file_list: + file_lists.append(file_list) + cat_segs.append(cur_seg) + + df_cat = df.iloc[cat_segs].drop(["filename"], axis=1) + df_cat["file_lists"] = file_lists + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df_cat.drop(["file_lists"], axis=1).to_csv(output_file, + sep=",", + index=False) + + make_kaldi(df_cat, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares Common Voice for training") + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument("--keep-langs", + default=["tir-tir"], + nargs="+", + help="languages to keep") + + # parser.add_argument("--remove-langs", + # default=None, + # nargs="+", + # help="languages to remove") + parser.add_argument( + "--map-langs-to-lre-codes", + default=False, + action=ActionYesNo, + help="use LRE17 language codes", + ) + + parser.add_argument("--target-fs", + default=8000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_common_voice(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_lre17.py b/egs/lre22/fixed.v1.8k/local/prepare_lre17.py new file mode 100755 index 00000000..18eaa1d2 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_lre17.py @@ -0,0 +1,140 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, 
namespace_to_dict +import logging +from pathlib import Path +import re +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + source = row["source"] + if source == "cts": + wav = f"sph2pipe -f wav -p -c 1 {filename} |" + if target_fs != 8000: + wav = f"{wav} sox -t wav - -t wav -r {target_fs} - |" + else: + wav = f"sox {filename} -t wav -r {target_fs} - |" + + f.write(f"{segment_id} {wav}\n") + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + "duration", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + "utt2speech_dur", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv( + output_file, sep=" ", columns=["id", c], header=False, index=False + ) + + +def prepare_lre17(corpus_dir, subset, source, output_dir, target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s - %s -> %s", corpus_dir, subset, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir / "data" / subset + if subset == "eval": + table_info = corpus_dir / "docs" / f"lre17_eval_segment_keys.tsv" + else: + table_info = corpus_dir / "docs" / f"{subset}_info.tab" + df = pd.read_csv(table_info, sep="\t") + df.rename( + columns={ + "language_code": "language", + "segmentid": "id", + "file_duration": "duration", + }, + inplace=True, + ) + + if subset == "eval": + df["data_source"] = df["data_source"].str.lower() + df["sample_coding"] = df["data_source"].apply( + lambda x: "mulaw" if x == "mls14" else "pcm" + ) + df.loc[df["speech_duration"].isnull(), "speech_duration"] = 1000 + df["length_condition"] = df.pop("speech_duration").astype("int32") + + if subset in ["dev", "eval"]: + # drop files of 3 and 10 secs since they are contained in the files of 30 secs + df = df[df["length_condition"] > 10] + if source != "all": + df = df[df["data_source"] == source] + + # move segment column to first positon + first_col = df.pop("id") + df.insert(0, "id", first_col) + + # sort by segment id + df.sort_values(by="id", inplace=True) + + if subset == "train": + df["filename"] = df.apply(lambda x: wav_dir / x.language / x.id, axis=1) + else: + df["filename"] = df.apply(lambda x: wav_dir / x.id, axis=1) + df["source"] = df["id"].apply(lambda x: "cts" if re.match(r".*\.sph", x) else "afv") + df["corpus_id"] = "lre17" + df["sample_rate"] = target_fs + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares LDC2022E16/17 LRE17 for training") + parser.add_argument( + "--corpus-dir", required=True, help="Path to the original dataset" + ) + parser.add_argument( + "--subset", + required=True, + help="train/dev/eval", + choices=["train", "dev", "eval"], + ) + parser.add_argument( + "--source", + default="all", + help="all/mls14/vast", + choices=["all", "mls14", "vast"], + ) + + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument( + "--target-fs", default=8000, 
type=int, help="Target sampling frequency" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + prepare_lre17(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_lre22_dev.py b/egs/lre22/fixed.v1.8k/local/prepare_lre22_dev.py new file mode 100755 index 00000000..825f9b67 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_lre22_dev.py @@ -0,0 +1,108 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict +import logging +from pathlib import Path +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + source = row["source"] + if source == "cts": + wav = f"sph2pipe -f wav -p -c 1 {filename} |" + if target_fs != 8000: + wav = f"{wav} sox -t wav - -t wav -r {target_fs} - |" + else: + wav = f"sox {filename} -t wav -r {target_fs} - |" + + f.write(f"{segment_id} {wav}\n") + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + "duration", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + "utt2speech_dur", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv( + output_file, sep=" ", columns=["id", c], header=False, index=False + ) + + +def prepare_lre22(corpus_dir, output_dir, target_fs, verbose): + config_logger(verbose) + subset = "dev" + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir / "data" / subset + table_info = corpus_dir / "metadata" / "lre22_dev_metadata.tsv" + df = pd.read_csv(table_info, sep="\t") + df.rename( + columns={ + "language_code": "language", + "file_name": "id", + "duration_sec": "duration", + }, + inplace=True, + ) + + # move segment column to first positon + first_col = df.pop("id") + df.insert(0, "id", first_col) + + # sort by segment id + df.sort_values(by="id", inplace=True) + + df["filename"] = df.apply(lambda x: wav_dir / f"{x.id}.sph", axis=1) + df["source_coding"] = "alaw" + df["source"] = "cts" + df["corpus_id"] = "lre22" + df["sample_rate"] = target_fs + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares LDC2022E14 LRE22") + parser.add_argument( + "--corpus-dir", required=True, help="Path to the original dataset" + ) + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument( + "--target-fs", default=8000, type=int, help="Target sampling frequency" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + prepare_lre22(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_lre22_eval.py b/egs/lre22/fixed.v1.8k/local/prepare_lre22_eval.py new file mode 100755 index 
00000000..39aa06de --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_lre22_eval.py @@ -0,0 +1,98 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict +import logging +from pathlib import Path +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + source = row["source"] + if source == "cts": + wav = f"sph2pipe -f wav -p -c 1 {filename} |" + if target_fs != 8000: + wav = f"{wav} sox -t wav - -t wav -r {target_fs} - |" + else: + wav = f"sox {filename} -t wav -r {target_fs} - |" + + f.write(f"{segment_id} {wav}\n") + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + ] + files = [ + "utt2spk", + "spk2utt", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv( + output_file, sep=" ", columns=["id", c], header=False, index=False + ) + + +def prepare_lre22(corpus_dir, output_dir, target_fs, verbose): + config_logger(verbose) + subset = "eval" + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir / "data" / subset + table_info = corpus_dir / "docs" / "lre22_eval_trials.tsv" + df = pd.read_csv(table_info, sep="\t") + df.rename( + columns={ + "segmentid": "id", + }, + inplace=True, + ) + + # sort by segment id + df.sort_values(by="id", inplace=True) + + df["filename"] = df.apply(lambda x: wav_dir / f"{x.id}.sph", axis=1) + df["source_coding"] = "alaw" + df["source"] = "cts" + df["corpus_id"] = "lre22" + df["sample_rate"] = target_fs + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares LRE22 eval data") + parser.add_argument( + "--corpus-dir", required=True, help="Path to the original dataset" + ) + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument( + "--target-fs", default=8000, type=int, help="Target sampling frequency" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + prepare_lre22(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_some_data_for_lre.py b/egs/lre22/fixed.v1.8k/local/prepare_some_data_for_lre.py new file mode 100755 index 00000000..d3eb68f1 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_some_data_for_lre.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +# prepare_data.py --corpus-dir /export/corpora6/LRE/FLEURS2022 --output-dir data/fleurs --map-langs-to-lre-codes --target-fs 8000 +# prepare_data.py --corpus-dir /export/corpora6/LRE/Lwazi2009 --output-dir data/lwazi --map-langs-to-lre-codes --target-fs 8000 +# prepare_data.py --corpus-dir /export/corpora6/LRE/NCHLT2014 --output-dir data/nchlt --map-langs-to-lre-codes --target-fs 8000 +# prepare_data.py --corpus-dir 
/export/corpora6/LRE/AMMI2020 --output-dir data/ammi --map-langs-to-lre-codes --target-fs 8000 +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + + +lre_map = { + "afr": "afr-afr", + "ndb": "nbl-nbl", + "oro": "orm-orm", + "tso": "tso-tso", + "ven": "ven-ven", + "xho": "xho-xho", + "zul": "zul-zul", + "tig": "tir-tir", + "sae": "eng-ens", + "ine": "eng-iaf", + "tun": "ara-aeb", + "alg": "ara-arq", + "lib": "ara-ayl", + "naf": "fra-ntf" +} + + +def map_to_lre(langs): + return [lre_map[l] if l in lre_map else "{}-{}".format(l,l) for l in langs] + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + if target_fs != 16000: + wav = "sox {} -t wav -r {} - |".format(filename, target_fs) + else: + wav = filename + + f.write("{} {}\n".format(segment_id, wav)) + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv( + output_file, sep=" ", columns=["id", c], header=False, index=False + ) + + +def prepare_data( + corpus_dir, output_dir, remove_langs, map_langs_to_lre_codes, target_fs, verbose +): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir + files = glob.glob(str(corpus_dir / "*/*/*/*/*.wav")) + langs = [(Path(f).parent.parent.parent.parent.stem).lower() for f in files] + if map_langs_to_lre_codes: + langs = map_to_lre(langs) + ids = ["{}-{}".format(l, Path(f).stem) for f, l in zip(files, langs)] + df = pd.DataFrame({"id": ids, "language": langs, "filename": files}) + if remove_langs is not None: + for lang in remove_langs: + df = df[df["language"] != lang] + + df["sample_coding"] = "pcm" + df["source"] = "afv" + df["corpus_id"] = corpus_dir.stem + df["sample_rate"] = target_fs + + # sort by segment id + df.sort_values(by="id", inplace=True) + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares NCHLT, FLEURS, Lwazi, and AMMI corpus for training") + parser.add_argument( + "--corpus-dir", required=True, help="Path to the original dataset" + ) + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument( + "--remove-langs", default=None, nargs="+", help="languages to remove" + ) + parser.add_argument( + "--map-langs-to-lre-codes", + default=False, + action=ActionYesNo, + help="use LRE17 language codes", + ) + + parser.add_argument( + "--target-fs", default=8000, type=int, help="Target sampling frequency" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + 
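+    # namespace_to_dict converts the jsonargparse namespace into a plain dict,
+    # so each CLI flag maps onto the matching prepare_data keyword argument.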
prepare_data(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_some_data_for_lre_cat.py b/egs/lre22/fixed.v1.8k/local/prepare_some_data_for_lre_cat.py new file mode 100755 index 00000000..df62f18a --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_some_data_for_lre_cat.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python +# prepare_data.py --corpus-dir /export/corpora6/LRE/FLEURS2022 --output-dir data/fleurs --map-langs-to-lre-codes --target-fs 8000 +# prepare_data.py --corpus-dir /export/corpora6/LRE/Lwazi2009 --output-dir data/lwazi --map-langs-to-lre-codes --target-fs 8000 +# prepare_data.py --corpus-dir /export/corpora6/LRE/NCHLT2014 --output-dir data/nchlt --map-langs-to-lre-codes --target-fs 8000 +# prepare_data.py --corpus-dir /export/corpora6/LRE/AMMI2020 --output-dir data/ammi --map-langs-to-lre-codes --target-fs 8000 +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + +lre_map = { + "afr": "afr-afr", + "ndb": "nbl-nbl", + "oro": "orm-orm", + "tso": "tso-tso", + "ven": "ven-ven", + "xho": "xho-xho", + "zul": "zul-zul", + "tig": "tir-tir", + "sae": "eng-ens", + "ine": "eng-iaf", + "tun": "ara-aeb", + "alg": "ara-arq", + "lib": "ara-ayl", + "naf": "fra-ntf" +} + +buggy_files = { + "200630-192328_tir_c85_elicit_7", + "200701-120846_tir_c85_elicit_35", + "200701-133352_tir_c85_elicit_57", + "200701-134903_tir_c85_elicit_19", + "200701-134903_tir_c85_elicit_32", + "200701-234652_tir_c85_elicit_78", + "200702-083859_tir_c85_elicit_18", + "200702-125252_tir_c85_elicit_46", + "200702-161120_tir_c85_elicit_4", + "200702-161120_tir_c85_elicit_7", + "200702-172026_tir_c85_elicit_31", + "200702-182933_tir_c85_elicit_133", + "200702-182933_tir_c85_elicit_88", + "200702-193310_tir_c85_elicit_2", + "200702-194850_tir_c85_elicit_88", + "200702-200911_tir_c85_elicit_171", +} + + +def map_to_lre(langs): + return [ + lre_map[l] if l in lre_map else "{}-{}".format(l, l) for l in langs + ] + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + list_dir = output_dir / "lists_cat" + list_dir.mkdir(parents=True, exist_ok=True) + for r in range(len(df)): + file_list = df.iloc[r].file_lists + with open(list_dir / f"{df.iloc[r].id}.txt", "w") as f: + for fn in file_list: + f.write("file %s\n" % fn) + + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = list_dir / f"{segment_id}.txt" + if target_fs != 16000: + wav = f"ffmpeg -f concat -safe 0 -i {filename} -acodec pcm_s16le -ar {target_fs} -f wav - |" + else: + wav = f"ffmpeg -f concat -safe 0 -i {filename} -acodec pcm_s16le -f wav - |" + + f.write("{} {}\n".format(segment_id, wav)) + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv(output_file, + sep=" ", + columns=["id", c], + header=False, + index=False) + + +def prepare_data(corpus_dir, output_dir, remove_langs, map_langs_to_lre_codes, + target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", 
corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir + files = glob.glob(str(corpus_dir / "**/*.wav"), recursive=True) + langs = [(Path(f).parent.parent.parent.parent.stem).lower() for f in files] + if map_langs_to_lre_codes: + langs = map_to_lre(langs) + ids = ["{}-{}".format(l, Path(f).stem) for f, l in zip(files, langs)] + val = np.array( + [False if Path(f).stem in buggy_files else True for f in files]) + non_val = np.any(val == False) + df = pd.DataFrame({"id": ids, "language": langs, "filename": files}) + if non_val: + df = df.loc[val] + logging.info("detected invalid files %d / %d remained", len(df), + len(val)) + if remove_langs is not None: + for lang in remove_langs: + df = df[df["language"] != lang] + + # sort by segment id + df.sort_values(by=["language", "id"], inplace=True) + + file_lists = [] + file_list = [] + seg_count = 0 + prev_lang = "" + cat_segs = [] + cur_seg = 0 + for r in range(len(df)): + row = df.iloc[r] + if seg_count == 5 or (row.language != prev_lang and seg_count > 0): + file_lists.append(file_list) + cat_segs.append(cur_seg) + file_list = [] + seg_count = 0 + cur_seg = r + + file_list.append(row.filename) + seg_count += 1 + prev_lang = row.language + + if file_list: + file_lists.append(file_list) + cat_segs.append(cur_seg) + + df_cat = df.iloc[cat_segs].drop(["filename"], axis=1) + df_cat["file_lists"] = file_lists + + # sort by segment id + df_cat.sort_values(by="id", inplace=True) + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df_cat.drop(["file_lists"], axis=1).to_csv(output_file, + sep=",", + index=False) + + make_kaldi(df_cat, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description= + "Prepares NCHLT, FLEURS, Lwazi, and AMMI corpus for training") + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument("--remove-langs", + default=None, + nargs="+", + help="languages to remove") + parser.add_argument( + "--map-langs-to-lre-codes", + default=False, + action=ActionYesNo, + help="use LRE17 language codes", + ) + + parser.add_argument("--target-fs", + default=8000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_data(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_sre21av_dev_audio.py b/egs/lre22/fixed.v1.8k/local/prepare_sre21av_dev_audio.py new file mode 100755 index 00000000..bc2c3001 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_sre21av_dev_audio.py @@ -0,0 +1,215 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict +import logging +from pathlib import Path +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + +from enum import Enum + + +class LangTrialCond(Enum): + ENG_ENG = 1 + ENG_CMN = 2 + ENG_YUE = 3 + CMN_CMN = 4 + CMN_YUE = 5 + YUE_YUE = 6 + OTHER_OTHER = 7 + OTHER_ENG = 8 + OTHER_CMN = 9 + OTHER_YUE = 10 + + @staticmethod + def is_eng(val): + if val in "ENG" or val in "USE": + return True + return False + + @staticmethod + def 
get_side_cond(val): + if val == "ENG" or val == "USE": + return "ENG" + if "YUE" in val: + return "YUE" + if "CMN" in val: + return "CMN" + + return "OTHER" + + @staticmethod + def get_trial_cond(enr, test): + enr = LangTrialCond.get_side_cond(enr) + test = LangTrialCond.get_side_cond(test) + trial = enr + "_" + test + try: + return LangTrialCond[trial] + except: + trial = test + "_" + enr + return LangTrialCond[trial] + + +class SourceTrialCond(Enum): + CTS_CTS = 1 + CTS_AFV = 2 + AFV_AFV = 3 + + @staticmethod + def get_trial_cond(enr, test): + trial = enr.upper() + "_" + test.upper() + try: + return SourceTrialCond[trial] + except: + trial = test.upper() + "_" + enr.upper() + return SourceTrialCond[trial] + + +def write_wav(df, target_fs, wav_dir, output_file): + with open(output_file, "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + ext = segment_id.split(".")[-1] + if ext == "flac": + if target_fs == 16000: + wav = f"{wav_dir}/{segment_id}" + else: + wav = f"sox {wav_dir}/{segment_id} -t wav -r {target_fs} - |" + elif ext == "mp4": + wav = f"ffmpeg -v 8 -i {wav_dir}/{segment_id} -vn -ar {target_fs} -ac 1 -f wav - |" + else: + wav = f"sph2pipe -f wav -p -c 1 {wav_dir}/{segment_id} |" + if target_fs != 8000: + wav = f"{wav} sox -t wav - -t wav -r {target_fs} - |" + f.write(f"{segment_id} {wav}\n") + + +def make_enroll_dir(df_segms, wav_dir, target_fs, source, output_path): + # fix source + df_segms.loc[df_segms["id"].str.match(r".*\.flac$"), "source_type"] = "afv" + enroll_dir = Path(output_path + f"_enroll_{source}") + wav_dir = wav_dir / "enrollment" + logging.info("making enrollment dir %s", enroll_dir) + enroll_dir.mkdir(parents=True, exist_ok=True) + df_segms = (df_segms[(df_segms["partition"] == "enrollment") + & (df_segms["source_type"] == source) & + (df_segms["language"] != "other")].drop( + ["partition"], axis=1).sort_values(by="id")) + segment_file = enroll_dir / "segments.csv" + df_segms.to_csv(segment_file, sep=",", index=False) + + with open(enroll_dir / "utt2spk", "w") as f1, open(enroll_dir / "spk2utt", + "w") as f2: + for u in df_segms["id"]: + f1.write(f"{u} {u}\n") + f2.write(f"{u} {u}\n") + + with open(enroll_dir / "utt2lang", "w") as f: + for u, s in zip(df_segms["id"], df_segms["language"]): + f.write(f"{u} {s}\n") + + write_wav(df_segms, target_fs, wav_dir, enroll_dir / "wav.scp") + + +def make_test_dir(df_segms, wav_dir, target_fs, source, output_path): + if source == "na": + # fix source + df_segms.loc[df_segms["id"].str.match(r".*\.mp4$"), + "source_type"] = "afv" + source = "afv" + + test_dir = Path(output_path + f"_test_{source}") + wav_dir = wav_dir / "test" + logging.info("making test dir %s", test_dir) + test_dir.mkdir(parents=True, exist_ok=True) + df_segms = (df_segms[(df_segms["partition"] == "test") + & (df_segms["source_type"] == source) & + (df_segms["language"] != "other")].drop( + ["partition"], axis=1).sort_values(by="id")) + + segment_file = test_dir / "segments.csv" + df_segms.to_csv(segment_file, sep=",", index=False) + + with open(test_dir / "utt2spk", "w") as f1, open(test_dir / "spk2utt", + "w") as f2: + for u in df_segms["id"]: + f1.write(f"{u} {u}\n") + f2.write(f"{u} {u}\n") + + with open(test_dir / "utt2lang", "w") as f: + for u, s in zip(df_segms["id"], df_segms["language"]): + f.write(f"{u} {s}\n") + + with open(test_dir / "spk2gender", "w") as f: + for u, g in zip(df_segms["id"], df_segms["gender"]): + g = g[0] + f.write(f"{u} {g}\n") + + write_wav(df_segms, target_fs, wav_dir, test_dir / "wav.scp") + + +def 
prepare_sre21av_dev_audio(corpus_dir, output_path, av_output_path, + target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_path) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir / "data" / "audio" + segments_file = corpus_dir / "docs" / "sre21_dev_segment_key.tsv" + df_segms = pd.read_csv(segments_file, sep="\t") + df_segms.rename( + columns={ + "segmentid": "id", + "subjectid": "speaker_id" + }, + inplace=True, + ) + df_segms.replace({"language": "english"}, {"language": "eng-zho"}, + inplace=True) + df_segms.replace({"language": "cantonese"}, {"language": "zho-yue"}, + inplace=True) + df_segms.replace({"language": "mandarin"}, {"language": "zho-cmn"}, + inplace=True) + + enroll_file = corpus_dir / "docs" / "sre21_audio_dev_enrollment.tsv" + + make_enroll_dir(df_segms, wav_dir, target_fs, "cts", output_path) + make_enroll_dir(df_segms, wav_dir, target_fs, "afv", output_path) + make_test_dir(df_segms, wav_dir, target_fs, "cts", output_path) + make_test_dir(df_segms, wav_dir, target_fs, "afv", output_path) + + wav_dir = corpus_dir / "data" / "video" + make_test_dir(df_segms, wav_dir, target_fs, "na", av_output_path) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares SRE21 dev audio part") + + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument("--output-path", + required=True, + help="Output data path prefix") + parser.add_argument( + "--av-output-path", + required=True, + help="Output data path prefix for audio visual", + ) + parser.add_argument("--target-fs", + default=16000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_sre21av_dev_audio(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_sre21av_eval_audio.py b/egs/lre22/fixed.v1.8k/local/prepare_sre21av_eval_audio.py new file mode 100755 index 00000000..301eebf7 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_sre21av_eval_audio.py @@ -0,0 +1,243 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict +import logging +from pathlib import Path +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + +from enum import Enum + + +class LangTrialCond(Enum): + ENG_ENG = 1 + ENG_CMN = 2 + ENG_YUE = 3 + CMN_CMN = 4 + CMN_YUE = 5 + YUE_YUE = 6 + OTHER_OTHER = 7 + OTHER_ENG = 8 + OTHER_CMN = 9 + OTHER_YUE = 10 + + @staticmethod + def is_eng(val): + if val in "ENG" or val in "USE": + return True + return False + + @staticmethod + def get_side_cond(val): + if val == "ENG" or val == "USE": + return "ENG" + if "YUE" in val: + return "YUE" + if "CMN" in val: + return "CMN" + + return "OTHER" + + @staticmethod + def get_trial_cond(enr, test): + enr = LangTrialCond.get_side_cond(enr) + test = LangTrialCond.get_side_cond(test) + trial = enr + "_" + test + try: + return LangTrialCond[trial] + except: + trial = test + "_" + enr + return LangTrialCond[trial] + + +class SourceTrialCond(Enum): + CTS_CTS = 1 + CTS_AFV = 2 + AFV_AFV = 3 + + @staticmethod + def get_trial_cond(enr, test): + trial = enr.upper() + "_" + test.upper() + try: + return SourceTrialCond[trial] + except: + trial = test.upper() + "_" + enr.upper() + return 
SourceTrialCond[trial] + + +def write_wav(df, target_fs, wav_dir, output_file): + with open(output_file, "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + ext = segment_id.split(".")[-1] + if ext == "flac": + if target_fs == 16000: + wav = f"{wav_dir}/{segment_id}" + else: + wav = f"sox {wav_dir}/{segment_id} -t wav -r {target_fs} - |" + elif ext == "mp4": + wav = f"ffmpeg -v 8 -i {wav_dir}/{segment_id} -vn -ar {target_fs} -ac 1 -f wav - |" + else: + wav = f"sph2pipe -f wav -p -c 1 {wav_dir}/{segment_id} |" + if target_fs != 8000: + wav = f"{wav} sox -t wav - -t wav -r {target_fs} - |" + f.write(f"{segment_id} {wav}\n") + + +def make_enroll_dir(df_segms, wav_dir, target_fs, source, output_path): + + # fix source + df_segms.loc[df_segms["id"].str.match(r".*\.flac$"), "source_type"] = "afv" + enroll_dir = Path(output_path + f"_enroll_{source}") + wav_dir = wav_dir / "enrollment" + logging.info("making enrollment dir %s", enroll_dir) + enroll_dir.mkdir(parents=True, exist_ok=True) + df_segms = (df_segms[(df_segms["partition"] == "enrollment") + & (df_segms["source_type"] == source) & + (df_segms["language"] != "other")].drop( + ["partition"], axis=1).sort_values(by="id")) + segment_file = enroll_dir / "segments.csv" + df_segms.to_csv(segment_file, sep=",", index=False) + + with open(enroll_dir / "utt2spk", "w") as f1, open(enroll_dir / "spk2utt", + "w") as f2: + for u in df_segms["id"]: + f1.write(f"{u} {u}\n") + f2.write(f"{u} {u}\n") + + with open(enroll_dir / "utt2lang", "w") as f: + for u, s in zip(df_segms["id"], df_segms["language"]): + f.write(f"{u} {s}\n") + + write_wav(df_segms, target_fs, wav_dir, enroll_dir / "wav.scp") + + +def make_test_dir(df_segms, wav_dir, target_fs, source, output_path): + + if source == "na": + # fix source + df_segms.loc[df_segms["id"].str.match(r".*\.mp4$"), + "source_type"] = "afv" + source = "afv" + + test_dir = Path(output_path + f"_test_{source}") + wav_dir = wav_dir / "test" + logging.info("making test dir %s", test_dir) + test_dir.mkdir(parents=True, exist_ok=True) + df_segms = (df_segms[(df_segms["partition"] == "test") + & (df_segms["source_type"] == source) & + (df_segms["language"] != "other")].drop( + ["partition"], axis=1).sort_values(by="id")) + + segment_file = test_dir / "segments.csv" + df_segms.to_csv(segment_file, sep=",", index=False) + + with open(test_dir / "utt2spk", "w") as f1, open(test_dir / "spk2utt", + "w") as f2: + for u in df_segms["id"]: + f1.write(f"{u} {u}\n") + f2.write(f"{u} {u}\n") + + with open(test_dir / "utt2lang", "w") as f: + for u, s in zip(df_segms["id"], df_segms["language"]): + f.write(f"{u} {s}\n") + + with open(test_dir / "spk2gender", "w") as f: + for u, g in zip(df_segms["id"], df_segms["gender"]): + g = g[0] + f.write(f"{u} {g}\n") + + write_wav(df_segms, target_fs, wav_dir, test_dir / "wav.scp") + + +def prepare_sre21av_eval_audio(corpus_dir, output_path, av_output_path, + target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_path) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir / "data" / "audio" + segments_file = corpus_dir / "docs" / "sre21_eval_segment_key.tsv" + df_segms = pd.read_csv(segments_file, sep="\t") + df_segms.rename( + columns={ + "segmentid": "id", + "subjectid": "speaker_id" + }, + inplace=True, + ) + df_segms.replace({"language": "english"}, {"language": "eng-zho"}, + inplace=True) + df_segms.replace({"language": "cantonese"}, {"language": "zho-yue"}, + inplace=True) + df_segms.replace({"language": 
"mandarin"}, {"language": "zho-cmn"}, + inplace=True) + + # enroll_file = corpus_dir / "docs" / "sre21_audio_eval_enrollment.tsv" + # df_enr = pd.read_csv(enroll_file, sep="\t") + # df_enr.rename( + # columns={ + # "segmentid": "id", + # "modelid": "model_id" + # }, + # inplace=True, + # ) + # key_file = corpus_dir / "docs" / "sre21_audio_eval_trial_key.tsv" + # df_key = pd.read_csv(key_file, sep="\t") + # df_key.rename( + # columns={ + # "segmentid": "id", + # "modelid": "model_id" + # }, + # inplace=True, + # ) + + make_enroll_dir(df_segms, wav_dir, target_fs, "cts", output_path) + make_enroll_dir(df_segms, wav_dir, target_fs, "afv", output_path) + make_test_dir(df_segms, wav_dir, target_fs, "cts", output_path) + make_test_dir(df_segms, wav_dir, target_fs, "afv", output_path) + + key_file = corpus_dir / "docs" / "sre21_audio-visual_eval_trial_key.tsv" + # df_key = pd.read_csv(key_file, sep="\t") + # df_key.rename( + # columns={ + # "segmentid": "id", + # "modelid": "model_id" + # }, + # inplace=True, + # ) + wav_dir = corpus_dir / "data" / "video" + make_test_dir(df_segms, wav_dir, target_fs, "na", av_output_path) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares SRE21 eval audio part") + + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument("--output-path", + required=True, + help="Output data path prefix") + parser.add_argument( + "--av-output-path", + required=True, + help="Output data path prefix for audio visual", + ) + parser.add_argument("--target-fs", + default=16000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_sre21av_eval_audio(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_sre_cts_superset.py b/egs/lre22/fixed.v1.8k/local/prepare_sre_cts_superset.py new file mode 100755 index 00000000..af299781 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_sre_cts_superset.py @@ -0,0 +1,185 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict +import logging +from pathlib import Path +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + +multigender_spks = [ + "111774", + "111781", + "112778", + "112783", + "112879", + "113153", + "113213", + "113603", + "128673", + "128770", +] + +lre_map = { + "USE": "eng-usg", + "AMH": "am-am", + "BEN": "bn-bn", + "FAR": "far-far", + "HIN": "hi-hi", + "INE": "eng-ine", + "ITA": "it-it", + "JPN": "ja-ja", + "KAT": "ka-ka", + "KHM": "km-km", + "KOR": "ko-ko", + "LAO": "lo-lo", + "PAN": "pa-pa", + "POL": "qsl-pol", + "RUS": "qsl-rus", + "TAM": "ta-ta", + "TGL": "tl-tl", + "THA": "th-th", + "TIR": "tir-tir", + "URD": "ur-ur", + "UZB": "uz-uz", + "VIE": "vi-vi", + "CMN": "zho-cmn", + "YUE": "zho-yue", + "WUU": "zho-wuu", + "NAN": "zho-nan", +} + + +def fix_multigender_spks(df): + + logging.info("Fixing multigender speakers") + n0 = len(df) + for spk in multigender_spks: + male_idx = (df["speaker_id"] == spk) & (df["gender"] == "male") + female_idx = (df["speaker_id"] == spk) & (df["gender"] == "female") + num_male = np.sum(male_idx) + num_female = np.sum(female_idx) + if num_male > num_female: + df = df[~female_idx] + else: + df = df[~male_idx] + + logging.info("Fixed multigender 
speakers, %d/%d segments remained", len(df), n0)
+    return df
+
+
+def prepare_sre_cts_superset(corpus_dir, output_dir, target_fs, verbose):
+    config_logger(verbose)
+    logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir)
+    wav_dir = Path(corpus_dir) / "data"
+    table_file = Path(corpus_dir) / "docs/cts_superset_segment_key.tsv"
+    df = pd.read_csv(table_file, sep="\t")
+    df.drop(["segmentid", "speakerid"], axis=1, inplace=True)
+    df.rename(
+        columns={
+            "subjectid": "speaker_id",
+            "sessionid": "session_id",
+            "corpusid": "corpus_id",
+            "phoneid": "phone_id",
+        },
+        inplace=True,
+    )
+    df["speaker_id"] = df["speaker_id"].astype("str")
+    df = fix_multigender_spks(df)
+
+    logging.info("removing generic ENG and mixed languages")
+    n0 = len(df)
+    df = df[df["language"] != "ENG"]
+    df = df[df["language"] != "SPA"]
+    df = df[df["language"] != "UND"]
+    df = df[~df["language"].str.contains(r"\.")]
+    logging.info("remained %d out of %d", len(df), n0)
+    logging.info("renaming languages like LRE")
+    for k, v in lre_map.items():
+        idx = df["language"] == k
+        df.loc[idx, "language"] = v
+
+    df["id"] = df["filename"].str.replace("/", "-")
+    # put segment id as first column
+    cols = df.columns.tolist()
+    cols = cols[-1:] + cols[:-1]
+    df = df[cols]
+    logging.info("sorting by segment_id")
+    df.sort_values(by="id", inplace=True)
+
+    logging.info("saving segments.csv")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_file = output_dir / "segments.csv"
+    df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False)
+
+    # Kaldi data directory files
+    # utt2xxx files
+    logging.info("saving Kaldi utt2xxx files")
+    columns = [
+        "speaker_id",
+        "speech_duration",
+        "session_id",
+        "corpus_id",
+        "phone_id",
+        "language",
+    ]
+    files = [
+        "utt2spk",
+        "utt2speech_dur",
+        "utt2session",
+        "utt2corpus",
+        "utt2phone",
+        "utt2lang",
+    ]
+    for c, f in zip(columns, files):
+        output_file = output_dir / f
+        df.to_csv(output_file, sep=" ", columns=["id", c], header=False, index=False)
+
+    # make wav.scp
+    logging.info("making wav.scp")
+    with open(output_dir / "wav.scp", "w") as f:
+        for _, row in df.iterrows():
+            segment_id = row["id"]
+            filename = row["filename"]
+            wav = f"sph2pipe -f wav -p -c 1 {wav_dir}/{filename} |"
+            if target_fs != 8000:
+                wav = f"{wav} sox -t wav - -t wav -r {target_fs} - |"
+            f.write(f"{segment_id} {wav}\n")
+
+    # speaker table
+    logging.info("saving speaker files")
+    spk_df = df[["speaker_id", "gender"]].drop_duplicates()
+    output_file = output_dir / "speaker.csv"
+    spk_df.to_csv(output_file, sep=",", index=False)
+    gender = df["gender"].str.replace("female", "f").str.replace("male", "m")
+    spk_df["gender"] = gender
+    output_file = output_dir / "spk2gender"
+    spk_df.to_csv(output_file, sep=" ", header=False, index=False)
+
+    with open(output_dir / "spk2utt", "w") as f:
+        for spk in df["speaker_id"].unique():
+            segment_ids = " ".join(df[df["speaker_id"] == spk].id.values)
+            f.write(f"{spk} {segment_ids}\n")
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description="Prepares SRE superset LDC2021E08")
+
+    parser.add_argument(
+        "--corpus-dir", required=True, help="Path to the original dataset"
+    )
+    parser.add_argument("--output-dir", required=True, help="Output data path")
+    parser.add_argument(
+        "--target-fs", default=8000, type=int, help="Target sampling frequency"
+    )
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+    args = parser.parse_args()
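+    # Example invocation (paths are hypothetical; flags as defined above):
+    #   local/prepare_sre_cts_superset.py \
+    #       --corpus-dir /export/corpora/LDC2021E08 \
+    #       --output-dir data/sre_cts_superset --target-fs 8000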
+    prepare_sre_cts_superset(**namespace_to_dict(args))
diff --git a/egs/lre22/fixed.v1.8k/local/prepare_voxlingua107.py b/egs/lre22/fixed.v1.8k/local/prepare_voxlingua107.py
new file mode 100755
index 00000000..c4dc3894
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/prepare_voxlingua107.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+"""
+ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo
+import logging
+from pathlib import Path
+import glob
+import numpy as np
+import pandas as pd
+
+from hyperion.hyp_defs import config_logger
+
+
+lre_map = {
+    "pl": "qsl-pol",
+    "ru": "qsl-rus",
+    "cs": "qsl-cze",
+    "uk": "qsl-ukr",
+    "hr": "qsl-cro",
+    "bg": "qsl-bul",
+    "be": "qsl-bel",
+    "sk": "qsl-slk",
+    "sl": "qsl-slv",
+    "bs": "qsl-bos",
+    "sr": "qsl-ser",
+    "zh": "zho-cmn",
+    "fr": "fra-mix",
+    "af": "afr-afr",
+}
+
+
+def map_to_lre(langs):
+    # codes with an LRE equivalent are mapped; the rest fall back to "xx-xx"
+    return [lre_map[l] if l in lre_map else f"{l}-{l}" for l in langs]
+
+
+def make_kaldi(df, output_dir, target_fs):
+    # make wav.scp
+    logging.info("making wav.scp")
+    with open(output_dir / "wav.scp", "w") as f:
+        for _, row in df.iterrows():
+            segment_id = row["id"]
+            filename = row["filename"]
+            if target_fs != 16000:
+                wav = f"sox {filename} -t wav -r {target_fs} - |"
+            else:
+                wav = filename
+
+            f.write(f"{segment_id} {wav}\n")
+
+    # Kaldi data directory files
+    # utt2xxx files; utt2spk and spk2utt are identical because each
+    # segment is treated as its own speaker
+    logging.info("saving Kaldi utt2xxx files")
+    columns = [
+        "id",
+        "id",
+        "language",
+    ]
+    files = [
+        "utt2spk",
+        "spk2utt",
+        "utt2lang",
+    ]
+    for c, f in zip(columns, files):
+        output_file = output_dir / f
+        if c in df:
+            df.to_csv(
+                output_file, sep=" ", columns=["id", c], header=False, index=False
+            )
+
+
+def prepare_voxlingua107(
+    corpus_dir, output_dir, remove_langs, map_langs_to_lre_codes, target_fs, verbose
+):
+    config_logger(verbose)
+    logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir)
+    corpus_dir = Path(corpus_dir)
+    files = glob.glob(str(corpus_dir / "*/*.wav"))
+    langs = [Path(f).parent.stem for f in files]
+    if map_langs_to_lre_codes:
+        langs = map_to_lre(langs)
+    ids = [f"{l}-{Path(f).stem}" for f, l in zip(files, langs)]
+    df = pd.DataFrame({"id": ids, "language": langs, "filename": files})
+    if remove_langs is not None:
+        for lang in remove_langs:
+            df = df[df["language"] != lang]
+
+    df["sample_coding"] = "pcm"
+    df["source"] = "afv"
+    df["corpus_id"] = "voxlingua107"
+    df["sample_rate"] = target_fs
+
+    # sort by segment id
+    df.sort_values(by="id", inplace=True)
+
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True, parents=True)
+    output_file = output_dir / "segments.csv"
+    logging.info("saving %s", output_file)
+    df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False)
+
+    make_kaldi(df, output_dir, target_fs)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description="Prepares Voxlingua107 for training")
+    parser.add_argument(
+        "--corpus-dir", required=True, help="Path to the original dataset"
+    )
+    parser.add_argument("--output-dir", required=True, help="Output data path")
+    parser.add_argument(
+        "--remove-langs", default=None, nargs="+", help="languages to remove"
+    )
+    parser.add_argument(
+        "--map-langs-to-lre-codes",
+        default=False,
+        action=ActionYesNo,
+        help="use LRE17 language codes",
+    )
+
+    parser.add_argument(
+        "--target-fs", default=16000, type=int, help="Target sampling frequency"
+    )
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+    args = parser.parse_args()
+    prepare_voxlingua107(**namespace_to_dict(args))
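The wav.scp entries written by these preparation scripts are Kaldi-style piped commands whenever format conversion or resampling is needed (they end in "|") rather than plain file paths. A minimal sketch of how a downstream consumer can materialize such an entry, assuming sph2pipe and sox are on the PATH and using a made-up segment path:

```python
import subprocess

# Hypothetical wav.scp entry in the format written above (the leading
# segment id has already been split off).
entry = "sph2pipe -f wav -p -c 1 /corpus/data/foo.sph | sox -t wav - -t wav -r 16000 - |"

# A trailing "|" marks a command pipeline; strip it, run it, and capture
# the decoded, resampled wav bytes from stdout.
cmd = entry.rstrip().rstrip("|")
wav_bytes = subprocess.run(cmd, shell=True, check=True, capture_output=True).stdout
```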
dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + prepare_voxlingua107(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/score_lre22.sh b/egs/lre22/fixed.v1.8k/local/score_lre22.sh new file mode 100755 index 00000000..e6564da4 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/score_lre22.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +. path.sh + +if [ $# -ne 3 ];then + echo "Usage: $0 " + exit 1 +fi + +dev_eval=$1 +score_file=$(readlink -f $2) +output_file=$(readlink -f $3) +echo $dev_eval $score_file $output_file +output_dir=$(dirname $output_file) +mkdir -p $output_dir + +conda activate $HYP_ENV + +cd ./lre-scorer +echo "Scoring $score_file -> $output_file" +if [ "$dev_eval" == "dev" ];then + config=config.ini +else + config=config_eval.ini +fi + +python ./scoreit.py -s $score_file -o $output_file -e $config + +cd - diff --git a/egs/lre22/fixed.v1.8k/local/split_dev.py b/egs/lre22/fixed.v1.8k/local/split_dev.py new file mode 100755 index 00000000..5988e245 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/split_dev.py @@ -0,0 +1,80 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path +from jsonargparse import ArgumentParser, namespace_to_dict +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet + + +def split_dev(segs_file, output_dir, num_folds, verbose): + config_logger(verbose) + segs = SegmentSet.load(segs_file) + assert "subclass_idx" in segs + class_ids = segs["class_id"] + _, class_idx = np.unique(class_ids, return_inverse=True) + logging.info("splitting segments into %d folds", num_folds) + folds = [[] for i in range(num_folds)] + for c in range(np.max(class_idx) + 1): + c_idx = class_idx == c + subclass_idx = segs.loc[c_idx, "subclass_idx"] + num_c = len(subclass_idx) + num_c_pf = num_c / num_folds + _, counts = np.unique(subclass_idx, return_counts=True) + acc_counts = np.cumsum(counts) + logging.info( + f"class {c} subclass-counts={counts}, subclass-acc-counts={acc_counts}" + ) + c_idx = np.nonzero(c_idx)[0] + first = 0 + for f in range(num_folds): + if f < num_folds - 1: + last = np.argmin(np.abs(acc_counts - (f + 1) * num_c_pf)) + else: + last = np.max(subclass_idx) + f_idx = np.logical_and(subclass_idx >= first, subclass_idx <= last) + folds[f].extend(c_idx[f_idx]) + logging.info( + ( + f"class {c} fold {f} add {np.sum(f_idx)} samples," + f"accum {len(folds[f])} samples, " + f"first-subclass={first}, last-subclass={last}" + ) + ) + first = last + 1 + + output_dir = Path(output_dir) + for f in range(num_folds): + logging.info( + "fold %d, train-samples=%d test-samples=%d", + f, + len(segs) - len(folds[f]), + len(folds[f]), + ) + f_dir = output_dir / f"fold_{f}" + f_dir.mkdir(parents=True, exist_ok=True) + mask = np.zeros((len(segs),), dtype=bool) + mask[folds[f]] = True + segs_test = SegmentSet(segs.loc[mask]) + segs_test.save(f_dir / "test_segments.csv") + segs_train = SegmentSet(segs.loc[~mask]) + segs_train.save(f_dir / "train_segments.csv") + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Splits LRE22 into folds") + parser.add_argument( + "--segs-file", required=True, help="Segments file with subclass_idx column", + ) + parser.add_argument("--output-dir", required=True, help="output path") + parser.add_argument("--num-folds", default=2, type=int, help="number of folds") + parser.add_argument("-v", 
"--verbose", default=1, choices=[0, 1, 2, 3], type=int) + + args = parser.parse_args() + split_dev(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/split_segments_train_val.py b/egs/lre22/fixed.v1.8k/local/split_segments_train_val.py new file mode 100755 index 00000000..922c868c --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/split_segments_train_val.py @@ -0,0 +1,160 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict +import logging +from pathlib import Path +import re +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger +from hyperion.utils import RecordingSet, FeatureSet, SegmentSet, ClassInfo + + +def split_train_val( + segments_file, + recordings_file, + feats_file, + durations_file, + ara_ary_seg_file, + in_class_name, + out_class_name, + val_percent, + remove_langs, + seed, + output_dir, + verbose, +): + + config_logger(verbose) + rng = np.random.RandomState(seed=seed) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + segs = SegmentSet.load(segments_file) + if durations_file is not None: + durs = SegmentSet.load(durations_file) + if "duration" in durs: + segs["duration"] = durs.loc[segs["id"], "duration"] + else: + segs["duration"] = durs.loc[segs["id"], "class_id"].astype(float) + + if remove_langs is not None: + for lang in remove_langs: + segs = segs[segs[in_class_name] != lang] + + segs = SegmentSet(segs) + + if ara_ary_seg_file is not None: + segs_ary = SegmentSet.load(ara_ary_seg_file) + segs.loc[segs_ary["id"], in_class_name] = segs_ary["class_id"] + n1 = len(segs) + noary_idx = segs[in_class_name] != "ara-ary" + segs = SegmentSet(segs.loc[noary_idx]) + logging.info("removing ara-ary segments remained %d / %d", len(segs), n1) + + logging.info("creating class_info file") + class_ids = segs[in_class_name].drop_duplicates().sort_values() + class_info = ClassInfo(pd.DataFrame({"id": class_ids})) + class_info.save(output_dir / "class_file.csv") + + logging.info("splitting segments into train and val") + segs.set_index(in_class_name) + val_mask = np.zeros((len(segs),), dtype=bool) + for c in class_info["id"]: + seg_idx_c = segs.get_loc(c) + num_val = int(val_percent * len(seg_idx_c) / 100) + val_idx = rng.choice(seg_idx_c, size=num_val, replace=False) + val_mask[val_idx] = True + logging.info( + "class %s total=%d train=%d val=%d", + c, + len(seg_idx_c), + len(seg_idx_c) - num_val, + num_val, + ) + + segs.reset_index() + if out_class_name is not None: + segs.rename(columns={in_class_name: out_class_name}, inplace=True) + + train_segs = SegmentSet(segs.loc[~val_mask]) + train_segs.save(output_dir / "train_segments.csv") + val_segs = SegmentSet(segs.loc[val_mask]) + val_segs.save(output_dir / "val_segments.csv") + + if recordings_file is not None: + logging.info("splitting recordings into train and val") + recs = RecordingSet.load(recordings_file) + train_recs = RecordingSet(recs.loc[train_segs.recording_ids(train_segs["id"])]) + train_recs.save(output_dir / "train_recordings.csv") + val_recs = RecordingSet(recs.loc[val_segs.recording_ids(val_segs["id"])]) + val_recs.save(output_dir / "val_recordings.csv") + + if feats_file is not None: + logging.info("splitting features into train and val") + feats = FeatureSet.load(feats_file) + train_feats = FeatureSet(feats.loc[train_segs["id"]]) + train_feats.save(output_dir / "train_feats.csv") + 
val_feats = FeatureSet(feats.loc[val_segs["id"]])
+        val_feats.save(output_dir / "val_feats.csv")
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(
+        description="Split segment list into training and validation"
+    )
+    parser.add_argument(
+        "--segments-file", required=True, help="path to segments file",
+    )
+    parser.add_argument(
+        "--recordings-file",
+        default=None,
+        help="if not None, splits recordings file into train and val",
+    )
+
+    parser.add_argument(
+        "--durations-file",
+        default=None,
+        help="if not None, add durations to segments file",
+    )
+
+    parser.add_argument(
+        "--feats-file",
+        default=None,
+        help="if not None, splits features file into train and val",
+    )
+    parser.add_argument(
+        "--ara-ary-seg-file",
+        default=None,
+        help="segment-file with labels for Maghrebi Arabic",
+    )
+
+    parser.add_argument(
+        "--in-class-name",
+        default="class_id",
+        help="column name containing the class_id used to make the partition",
+    )
+    parser.add_argument(
+        "--out-class-name",
+        default=None,
+        help="if not None, we rename the class_id column in the output file",
+    )
+    parser.add_argument(
+        "--val-percent", default=5.0, type=float, help="percentage of data used for val"
+    )
+    parser.add_argument(
+        "--remove-langs", default=None, nargs="+", help="remove languages from training"
+    )
+    parser.add_argument("--seed", default=1123, type=int, help="random seed")
+
+    parser.add_argument("--output-dir", required=True, help="output directory")
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+
+    args = parser.parse_args()
+    split_train_val(**namespace_to_dict(args))
diff --git a/egs/lre22/fixed.v1.8k/local/train_calibration_lre22.sh b/egs/lre22/fixed.v1.8k/local/train_calibration_lre22.sh
new file mode 100755
index 00000000..227331b3
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/train_calibration_lre22.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+. path.sh
+
+if [ $# -ne 1 ];then
+  echo "Usage: $0 <score-dir>"
+  exit 1
+fi
+
+score_dir=$1
+nocal_dir=$score_dir/nocal
+cal_dir=$score_dir/cal_v1
+
+train_list=data/lre22_dev/utt2lang
+train_file=$nocal_dir/lre22_dev_scores.tsv
+train_cal_file=$cal_dir/lre22_dev_scores.tsv
+eval_file=$nocal_dir/lre22_eval_scores.tsv
+eval_cal_file=$cal_dir/lre22_eval_scores.tsv
+mkdir -p $cal_dir
+model_file=$cal_dir/cal.mat
+
+if [ "$(hostname --domain)" == "cm.gemini" ];then
+  module load matlab
+fi
+
+echo "
+addpath('steps_be');
+addpath(genpath('$PWD/focal_multiclass/v1.0'));
+train_fusion('$train_list', {'$train_file'}, '$model_file');
+" | matlab -nodisplay -nosplash > $cal_dir/train.log
+
+echo "
+addpath('./steps_be');
+addpath(genpath('$PWD/focal_multiclass/v1.0'));
+eval_fusion({'$train_file'}, '$train_cal_file', '$model_file');
+" | matlab -nodisplay -nosplash > $cal_dir/eval_lre22_dev.log
+
+if [ -f $eval_file ];then
+  echo "
+addpath('./steps_be');
+addpath(genpath('$PWD/focal_multiclass/v1.0'));
+eval_fusion({'$eval_file'}, '$eval_cal_file', '$model_file');
+" | matlab -nodisplay -nosplash > $cal_dir/eval_lre22_eval.log
+fi
+
+
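train_calibration_lre22.sh learns a multiclass calibration of the dev scores with the MATLAB FoCal multiclass toolkit (the train_fusion/eval_fusion calls above). For orientation only, a rough Python stand-in for that step on synthetic data; sklearn's multinomial logistic regression is an illustrative substitute here, not the recipe's actual method:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
scores = rng.normal(size=(500, 14))     # dev segments x language scores (synthetic)
labels = rng.integers(0, 14, size=500)  # true language index per segment (synthetic)

# A multinomial logistic regression over the raw scores plays the role of
# the affine calibration that FoCal's train_fusion estimates from lre22_dev.
cal = LogisticRegression(max_iter=1000).fit(scores, labels)
cal_scores = cal.predict_log_proba(scores)  # calibrated log-posteriors
```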
diff --git a/egs/lre22/fixed.v1.8k/local/train_fusion_lre22.sh b/egs/lre22/fixed.v1.8k/local/train_fusion_lre22.sh
new file mode 100755
index 00000000..add44362
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/train_fusion_lre22.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+. path.sh
+
+if [ $# -ne 2 ];then
+  echo "Usage: $0 <score-dirs> <output-dir>"
+  exit 1
+fi
+
+score_dirs="$1"
+output_dir=$2
+
+train_list=data/lre22_dev/utt2lang
+train_base=lre22_dev_scores.tsv
+train_files=$(echo $score_dirs | awk 'BEGIN{OFS=","}{ for(i=1;i<=NF;i++){ $i="'\''"$i"/'$train_base\''" }; print $0}')
+
+train_fus_file=$output_dir/$train_base
+mkdir -p $output_dir
+model_file=$output_dir/fus.mat
+
+if [ "$(hostname --domain)" == "cm.gemini" ];then
+  module load matlab
+fi
+
+echo "
+addpath('steps_be');
+addpath(genpath('$PWD/focal_multiclass/v1.0'));
+train_fusion('$train_list', {$train_files}, '$model_file');
+" | matlab -nodisplay -nosplash > $output_dir/train.log
+
+echo "
+addpath('./steps_be');
+addpath(genpath('$PWD/focal_multiclass/v1.0'));
+eval_fusion({$train_files}, '$train_fus_file', '$model_file');
+" | matlab -nodisplay -nosplash > $output_dir/eval_lre22_dev.log
+
diff --git a/egs/lre22/fixed.v1.8k/local/validate_lre22.sh b/egs/lre22/fixed.v1.8k/local/validate_lre22.sh
new file mode 100755
index 00000000..fe039a5a
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/validate_lre22.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+. path.sh
+
+if [ $# -ne 1 ];then
+  echo "Usage: $0 <score-file>"
+  exit 1
+fi
+
+score_file=$(readlink -f $1)
+conda activate $HYP_ENV
+
+cd ./lre-scorer
+echo "Validating $score_file -> $score_file.val"
+python ./scoreit.py -s $score_file -o $score_file.val -v
+
+cd -
diff --git a/egs/lre22/fixed.v1.8k/path.sh b/egs/lre22/fixed.v1.8k/path.sh
new file mode 100755
index 00000000..6994fdab
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/path.sh
@@ -0,0 +1,5 @@
+
+export HYP_ROOT=$(readlink -f `pwd -P`/../../..)
+export TOOLS_ROOT=$HYP_ROOT/tools
+
+. $TOOLS_ROOT/path.sh
diff --git a/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_0/test_segments.csv b/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_0/test_segments.csv
new file mode 100644
index 00000000..6518f24e
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_0/test_segments.csv
@@ -0,0 +1,2114 @@
+id,class_id,subclass_idx
+lre22_dev_aadaq,afr-afr,5
+lre22_dev_aaxdt,xho-xho,14
+lre22_dev_abujj,xho-xho,15
+lre22_dev_acgiu,zul-zul,6
+lre22_dev_acnyv,ven-ven,7
+lre22_dev_adbku,ara-ayl,4
+lre22_dev_ademr,orm-orm,3
+lre22_dev_adgoy,xho-xho,4
+lre22_dev_adnpi,eng-ens,1
+lre22_dev_adqaa,ven-ven,10
+lre22_dev_adwzf,zul-zul,2
+lre22_dev_aeiuj,afr-afr,4
+lre22_dev_afhui,eng-ens,4
+lre22_dev_afuav,nbl-nbl,15
+lre22_dev_afvvg,ven-ven,10
+lre22_dev_afxjf,eng-iaf,10
+lre22_dev_agmwb,ara-aeb,10
+lre22_dev_agnik,eng-ens,3
+lre22_dev_ahcja,orm-orm,14
+lre22_dev_ahobp,afr-afr,13
+lre22_dev_ahupk,eng-ens,11
+lre22_dev_aicjg,xho-xho,12
+lre22_dev_aikrz,eng-ens,9
+lre22_dev_ailwo,orm-orm,7
+lre22_dev_aiqhl,tir-tir,10
+lre22_dev_aiuwf,ara-ayl,5
+lre22_dev_aizyr,ara-arq,0
+lre22_dev_ajbui,zul-zul,12
+lre22_dev_ajigk,ara-aeb,10
+lre22_dev_ajuwq,ara-ayl,3
+lre22_dev_akbly,nbl-nbl,3
+lre22_dev_akhwr,xho-xho,6
+lre22_dev_aksxd,nbl-nbl,6
+lre22_dev_aktcg,afr-afr,1
+lre22_dev_aktzw,eng-ens,11
+lre22_dev_akulq,orm-orm,14
+lre22_dev_alcie,orm-orm,11
+lre22_dev_alunz,xho-xho,6
+lre22_dev_amaec,tir-tir,10
+lre22_dev_amnvo,ara-arq,6
+lre22_dev_amxrk,zul-zul,9
+lre22_dev_anmuv,tso-tso,11
+lre22_dev_aomcz,ara-aeb,7
+lre22_dev_aooht,fra-ntf,11
+lre22_dev_aprbe,ara-arq,3
+lre22_dev_apxxx,orm-orm,12
+lre22_dev_aqdwu,ven-ven,6
+lre22_dev_aqejl,xho-xho,5
+lre22_dev_aqnyy,tso-tso,5
+lre22_dev_arjuc,afr-afr,5
+lre22_dev_arrkp,tir-tir,1
+lre22_dev_atdgp,zul-zul,13
+lre22_dev_atoxn,eng-ens,10
+lre22_dev_audls,afr-afr,6
+lre22_dev_auilj,ven-ven,11
+lre22_dev_auqgt,eng-iaf,3
+lre22_dev_autlo,zul-zul,7 +lre22_dev_avait,zul-zul,3 +lre22_dev_avvik,nbl-nbl,14 +lre22_dev_awgem,ara-ayl,3 +lre22_dev_awgnb,fra-ntf,14 +lre22_dev_awvym,ara-ayl,9 +lre22_dev_axhbz,tir-tir,12 +lre22_dev_axici,tir-tir,8 +lre22_dev_axtpv,xho-xho,6 +lre22_dev_aygsz,ara-aeb,4 +lre22_dev_ayiif,ven-ven,7 +lre22_dev_azqvo,zul-zul,3 +lre22_dev_basml,eng-ens,11 +lre22_dev_bawje,tir-tir,6 +lre22_dev_bbana,zul-zul,7 +lre22_dev_bbtpz,ven-ven,5 +lre22_dev_bcbrw,eng-iaf,2 +lre22_dev_bchvx,zul-zul,9 +lre22_dev_bcllp,afr-afr,13 +lre22_dev_bcsmi,fra-ntf,6 +lre22_dev_bdqaw,ven-ven,6 +lre22_dev_bdwle,ara-arq,6 +lre22_dev_behbh,ara-ayl,4 +lre22_dev_bexda,ara-arq,6 +lre22_dev_bfbyn,ara-aeb,9 +lre22_dev_bfjgx,ara-ayl,7 +lre22_dev_bgbjo,nbl-nbl,1 +lre22_dev_bgebs,ara-ayl,5 +lre22_dev_bgnod,fra-ntf,3 +lre22_dev_bhezb,ara-ayl,7 +lre22_dev_bhyuy,afr-afr,13 +lre22_dev_bidge,tir-tir,12 +lre22_dev_bimnd,eng-ens,7 +lre22_dev_biyaj,ara-ayl,5 +lre22_dev_bjsgu,afr-afr,10 +lre22_dev_blmfp,eng-iaf,5 +lre22_dev_blohd,ven-ven,4 +lre22_dev_bmebz,ara-arq,4 +lre22_dev_bmjuo,ara-aeb,6 +lre22_dev_bmkrm,fra-ntf,10 +lre22_dev_bmzym,zul-zul,5 +lre22_dev_bnfuu,orm-orm,13 +lre22_dev_bnilb,zul-zul,8 +lre22_dev_bnxna,eng-ens,1 +lre22_dev_boikl,orm-orm,7 +lre22_dev_boisz,ven-ven,2 +lre22_dev_boqxy,zul-zul,13 +lre22_dev_bpqhd,tso-tso,2 +lre22_dev_briiw,ara-aeb,8 +lre22_dev_brohj,fra-ntf,1 +lre22_dev_brqdv,nbl-nbl,3 +lre22_dev_brwcj,afr-afr,6 +lre22_dev_bsclv,orm-orm,8 +lre22_dev_bsdbb,ara-arq,4 +lre22_dev_bstjt,nbl-nbl,10 +lre22_dev_btbke,ara-aeb,0 +lre22_dev_btcfj,ven-ven,12 +lre22_dev_btomw,ven-ven,6 +lre22_dev_btpvy,afr-afr,1 +lre22_dev_btrtb,ara-arq,4 +lre22_dev_btruf,zul-zul,8 +lre22_dev_btsll,ara-ayl,7 +lre22_dev_butrw,ara-ayl,6 +lre22_dev_buwrj,ara-ayl,2 +lre22_dev_bvlhb,fra-ntf,8 +lre22_dev_bvmql,xho-xho,10 +lre22_dev_bvnsc,tir-tir,10 +lre22_dev_bwrej,ven-ven,9 +lre22_dev_bxial,eng-ens,2 +lre22_dev_bxnbf,fra-ntf,9 +lre22_dev_bybim,afr-afr,6 +lre22_dev_byegp,orm-orm,15 +lre22_dev_byngq,ven-ven,9 +lre22_dev_byytf,fra-ntf,6 +lre22_dev_bzies,tso-tso,3 +lre22_dev_bzipd,afr-afr,7 +lre22_dev_cacop,nbl-nbl,5 +lre22_dev_caent,afr-afr,12 +lre22_dev_capsb,ven-ven,0 +lre22_dev_cawbw,orm-orm,12 +lre22_dev_cblep,ven-ven,3 +lre22_dev_cblig,fra-ntf,6 +lre22_dev_ccexy,ven-ven,7 +lre22_dev_ccsye,ara-aeb,8 +lre22_dev_cctyt,eng-iaf,11 +lre22_dev_ccuie,eng-ens,7 +lre22_dev_ccvzf,eng-iaf,1 +lre22_dev_cdlkq,tso-tso,8 +lre22_dev_cdtiu,ara-ayl,9 +lre22_dev_cemyb,tir-tir,12 +lre22_dev_ceprg,eng-iaf,9 +lre22_dev_ceqow,nbl-nbl,15 +lre22_dev_cfdsu,fra-ntf,7 +lre22_dev_cfhbm,ven-ven,3 +lre22_dev_cfsew,afr-afr,12 +lre22_dev_cgges,eng-iaf,11 +lre22_dev_cgjnr,eng-iaf,10 +lre22_dev_cgotg,eng-ens,11 +lre22_dev_cgovb,nbl-nbl,15 +lre22_dev_cgssg,tir-tir,7 +lre22_dev_chhsl,tir-tir,7 +lre22_dev_chjuh,nbl-nbl,9 +lre22_dev_chpoe,nbl-nbl,11 +lre22_dev_chtgu,ara-aeb,10 +lre22_dev_chtlt,eng-iaf,10 +lre22_dev_cigir,eng-ens,9 +lre22_dev_ciyeh,ara-ayl,2 +lre22_dev_cjswm,orm-orm,12 +lre22_dev_cjtdl,ven-ven,13 +lre22_dev_ckzie,ara-aeb,10 +lre22_dev_cldfc,ara-ayl,8 +lre22_dev_clxqz,ara-arq,9 +lre22_dev_cmahj,afr-afr,13 +lre22_dev_cmqxm,tir-tir,6 +lre22_dev_cmrdt,afr-afr,5 +lre22_dev_cmvpq,ara-ayl,2 +lre22_dev_cnbfw,eng-iaf,5 +lre22_dev_cnbvd,afr-afr,9 +lre22_dev_cnomp,orm-orm,15 +lre22_dev_cnrvj,xho-xho,11 +lre22_dev_cnszu,ara-ayl,4 +lre22_dev_cnudd,xho-xho,14 +lre22_dev_cnuoi,orm-orm,14 +lre22_dev_cnxjs,orm-orm,8 +lre22_dev_coarm,xho-xho,4 +lre22_dev_cocyn,zul-zul,6 +lre22_dev_colxc,zul-zul,13 +lre22_dev_cosfn,ara-aeb,10 +lre22_dev_cosgu,ara-ayl,7 +lre22_dev_cpjab,ara-aeb,10 
+lre22_dev_cpple,tso-tso,6 +lre22_dev_cqhjy,ara-ayl,3 +lre22_dev_cqkmy,ara-aeb,10 +lre22_dev_cqukb,tso-tso,9 +lre22_dev_cqusc,orm-orm,6 +lre22_dev_cqyzf,fra-ntf,13 +lre22_dev_crcwu,xho-xho,12 +lre22_dev_crqjz,nbl-nbl,10 +lre22_dev_crtpm,ara-arq,5 +lre22_dev_crucu,tir-tir,6 +lre22_dev_crvby,eng-iaf,12 +lre22_dev_crvoh,eng-ens,7 +lre22_dev_csjxv,ara-arq,3 +lre22_dev_ctfiv,ara-aeb,5 +lre22_dev_ctgpr,ven-ven,12 +lre22_dev_ctlrz,tir-tir,8 +lre22_dev_ctzhm,zul-zul,6 +lre22_dev_cudew,ven-ven,8 +lre22_dev_cusin,ara-arq,10 +lre22_dev_cvaad,eng-iaf,5 +lre22_dev_cvedm,zul-zul,12 +lre22_dev_cvgfx,eng-iaf,8 +lre22_dev_cvujh,ara-ayl,2 +lre22_dev_cweil,ara-aeb,10 +lre22_dev_cweuh,eng-ens,7 +lre22_dev_cwiro,afr-afr,6 +lre22_dev_cwtby,ara-arq,7 +lre22_dev_cxggy,afr-afr,4 +lre22_dev_cxnqr,tso-tso,7 +lre22_dev_cxpan,nbl-nbl,14 +lre22_dev_cxsxl,ara-aeb,10 +lre22_dev_cxyti,tso-tso,8 +lre22_dev_cypcg,zul-zul,12 +lre22_dev_czcmz,zul-zul,10 +lre22_dev_czdzw,orm-orm,7 +lre22_dev_czppj,zul-zul,10 +lre22_dev_czxff,zul-zul,9 +lre22_dev_czxld,fra-ntf,9 +lre22_dev_dajnt,zul-zul,12 +lre22_dev_dbcxi,orm-orm,9 +lre22_dev_dbdbv,tso-tso,8 +lre22_dev_dbdwv,orm-orm,5 +lre22_dev_dbgof,nbl-nbl,15 +lre22_dev_dblhh,eng-iaf,0 +lre22_dev_dbljb,xho-xho,12 +lre22_dev_dcibg,eng-iaf,4 +lre22_dev_dcobk,ara-arq,8 +lre22_dev_dcvcu,afr-afr,4 +lre22_dev_dcvyc,fra-ntf,14 +lre22_dev_ddfeo,ara-ayl,5 +lre22_dev_ddhaq,zul-zul,10 +lre22_dev_ddhes,afr-afr,6 +lre22_dev_ddsds,afr-afr,12 +lre22_dev_ddxvn,ven-ven,5 +lre22_dev_dfdrs,ven-ven,7 +lre22_dev_dfifl,ara-ayl,9 +lre22_dev_dfjek,ven-ven,4 +lre22_dev_dflco,zul-zul,12 +lre22_dev_dftta,tso-tso,6 +lre22_dev_dfxnq,eng-ens,11 +lre22_dev_dgjdi,orm-orm,8 +lre22_dev_dgqwo,tir-tir,9 +lre22_dev_dhapq,ara-aeb,8 +lre22_dev_dhdfk,eng-ens,8 +lre22_dev_dhfjj,ara-arq,4 +lre22_dev_dhlxh,ara-aeb,4 +lre22_dev_dhnne,eng-ens,10 +lre22_dev_dhtlz,eng-ens,6 +lre22_dev_diarz,ara-ayl,2 +lre22_dev_diggg,tir-tir,9 +lre22_dev_diqtw,ara-aeb,8 +lre22_dev_dixuw,orm-orm,9 +lre22_dev_diypf,orm-orm,13 +lre22_dev_djzsk,nbl-nbl,13 +lre22_dev_dksey,nbl-nbl,11 +lre22_dev_dlzwh,fra-ntf,12 +lre22_dev_dmdpv,eng-ens,2 +lre22_dev_dmeea,orm-orm,14 +lre22_dev_dmhdv,xho-xho,10 +lre22_dev_dmics,fra-ntf,14 +lre22_dev_dmiiu,ara-aeb,6 +lre22_dev_dmjxr,xho-xho,10 +lre22_dev_dmzxn,afr-afr,4 +lre22_dev_dngtw,ara-ayl,3 +lre22_dev_dnjdq,eng-ens,7 +lre22_dev_dnprz,zul-zul,12 +lre22_dev_dobdj,fra-ntf,0 +lre22_dev_dobwk,orm-orm,8 +lre22_dev_donqm,ara-arq,3 +lre22_dev_dpbyt,tso-tso,6 +lre22_dev_dpfns,ara-aeb,4 +lre22_dev_dpjjp,fra-ntf,7 +lre22_dev_dpomx,eng-iaf,5 +lre22_dev_dpwhs,eng-ens,8 +lre22_dev_dpygj,eng-iaf,8 +lre22_dev_dqzex,xho-xho,3 +lre22_dev_drcqx,eng-iaf,7 +lre22_dev_drfhb,ara-aeb,10 +lre22_dev_drfte,ara-arq,8 +lre22_dev_driks,eng-ens,11 +lre22_dev_drofs,fra-ntf,1 +lre22_dev_dslxl,ara-ayl,7 +lre22_dev_dsmwd,ven-ven,13 +lre22_dev_dsyyk,tir-tir,9 +lre22_dev_dthcb,zul-zul,12 +lre22_dev_dtumd,fra-ntf,5 +lre22_dev_dtwmj,afr-afr,7 +lre22_dev_duegm,tso-tso,9 +lre22_dev_dvirs,afr-afr,6 +lre22_dev_dvtzf,eng-iaf,7 +lre22_dev_dwcfi,ven-ven,7 +lre22_dev_dwfle,fra-ntf,7 +lre22_dev_dwgsv,tir-tir,6 +lre22_dev_dwlay,ara-arq,3 +lre22_dev_dwnit,xho-xho,15 +lre22_dev_dwvoh,tso-tso,6 +lre22_dev_dxgpq,afr-afr,12 +lre22_dev_dxhpf,ara-ayl,9 +lre22_dev_dxlhq,ara-arq,5 +lre22_dev_dxrcj,zul-zul,5 +lre22_dev_dywox,tir-tir,9 +lre22_dev_dzjrv,eng-iaf,8 +lre22_dev_dzsql,tso-tso,6 +lre22_dev_dzxkv,orm-orm,13 +lre22_dev_eabne,xho-xho,2 +lre22_dev_eacdl,fra-ntf,14 +lre22_dev_eaupg,eng-iaf,11 +lre22_dev_eawug,eng-iaf,6 +lre22_dev_ebbgx,nbl-nbl,15 +lre22_dev_ecber,afr-afr,10 
+lre22_dev_ecdgv,ara-arq,5 +lre22_dev_ecneb,afr-afr,6 +lre22_dev_ecxrr,tir-tir,9 +lre22_dev_edldw,tir-tir,10 +lre22_dev_edofc,afr-afr,6 +lre22_dev_edvaf,xho-xho,13 +lre22_dev_edydw,eng-ens,5 +lre22_dev_eejtn,zul-zul,4 +lre22_dev_eekzc,fra-ntf,4 +lre22_dev_eenhx,eng-iaf,9 +lre22_dev_efcgi,fra-ntf,0 +lre22_dev_efdoz,ven-ven,8 +lre22_dev_efioy,tso-tso,9 +lre22_dev_efiwx,eng-ens,9 +lre22_dev_efrlw,ven-ven,2 +lre22_dev_eghmh,eng-ens,11 +lre22_dev_ehhyu,nbl-nbl,10 +lre22_dev_eiomi,ven-ven,12 +lre22_dev_eisiy,orm-orm,8 +lre22_dev_ejaiq,ara-aeb,1 +lre22_dev_ejkmr,eng-iaf,5 +lre22_dev_ejthv,ven-ven,12 +lre22_dev_ejtyd,fra-ntf,14 +lre22_dev_ekfzq,ara-ayl,6 +lre22_dev_ekgjp,zul-zul,3 +lre22_dev_ekixu,nbl-nbl,2 +lre22_dev_ekjxx,ara-arq,6 +lre22_dev_ekvxc,eng-iaf,4 +lre22_dev_eldrg,orm-orm,11 +lre22_dev_elitc,ara-arq,3 +lre22_dev_emdtf,xho-xho,0 +lre22_dev_emhqx,tir-tir,4 +lre22_dev_emxnm,afr-afr,4 +lre22_dev_emzaa,xho-xho,3 +lre22_dev_engqe,xho-xho,15 +lre22_dev_ennjl,tso-tso,10 +lre22_dev_eokyg,nbl-nbl,2 +lre22_dev_epkwr,tir-tir,5 +lre22_dev_epojj,tir-tir,7 +lre22_dev_epsdk,nbl-nbl,12 +lre22_dev_epsfl,xho-xho,14 +lre22_dev_epuno,eng-ens,1 +lre22_dev_epylu,eng-iaf,10 +lre22_dev_ereen,ara-arq,10 +lre22_dev_eriaf,eng-ens,4 +lre22_dev_ermqx,ara-arq,2 +lre22_dev_escob,fra-ntf,9 +lre22_dev_esjsk,ara-ayl,7 +lre22_dev_esqti,xho-xho,9 +lre22_dev_etaln,zul-zul,12 +lre22_dev_etarn,nbl-nbl,6 +lre22_dev_etndu,ven-ven,13 +lre22_dev_etpdc,afr-afr,3 +lre22_dev_etsam,zul-zul,7 +lre22_dev_etwge,eng-ens,6 +lre22_dev_etxyc,orm-orm,12 +lre22_dev_eumsq,zul-zul,10 +lre22_dev_eusfl,orm-orm,8 +lre22_dev_eutkk,tso-tso,0 +lre22_dev_euxuy,orm-orm,13 +lre22_dev_evaon,ara-aeb,4 +lre22_dev_evkaz,eng-iaf,8 +lre22_dev_evret,fra-ntf,8 +lre22_dev_evvep,tso-tso,9 +lre22_dev_evvvd,tir-tir,10 +lre22_dev_ewems,ven-ven,7 +lre22_dev_ewijw,orm-orm,11 +lre22_dev_ewqpv,eng-iaf,6 +lre22_dev_ewywf,nbl-nbl,10 +lre22_dev_exaia,afr-afr,3 +lre22_dev_exbum,afr-afr,4 +lre22_dev_exhhd,ara-aeb,5 +lre22_dev_exkkf,afr-afr,3 +lre22_dev_extrh,zul-zul,6 +lre22_dev_exzyo,xho-xho,15 +lre22_dev_eyrzt,ara-ayl,1 +lre22_dev_eysdu,zul-zul,4 +lre22_dev_eyshz,xho-xho,1 +lre22_dev_eyuyq,ara-ayl,7 +lre22_dev_ezsyu,ven-ven,3 +lre22_dev_faahr,afr-afr,9 +lre22_dev_fabli,ven-ven,6 +lre22_dev_fatah,zul-zul,12 +lre22_dev_fccpw,orm-orm,12 +lre22_dev_fcpbu,xho-xho,8 +lre22_dev_fcqbx,tso-tso,3 +lre22_dev_fcwnw,fra-ntf,8 +lre22_dev_fdgia,orm-orm,10 +lre22_dev_febnk,eng-ens,5 +lre22_dev_fedau,eng-iaf,5 +lre22_dev_fehxn,xho-xho,8 +lre22_dev_fejsd,ven-ven,8 +lre22_dev_feqjc,eng-iaf,12 +lre22_dev_fesss,nbl-nbl,15 +lre22_dev_feuww,fra-ntf,8 +lre22_dev_fevex,zul-zul,2 +lre22_dev_ffban,ara-arq,6 +lre22_dev_ffefw,orm-orm,13 +lre22_dev_ffsps,fra-ntf,8 +lre22_dev_ffwid,tso-tso,11 +lre22_dev_fgbtr,nbl-nbl,15 +lre22_dev_fgmbr,ara-arq,6 +lre22_dev_fgmxd,eng-ens,9 +lre22_dev_fgnfs,tir-tir,12 +lre22_dev_fgrze,eng-ens,11 +lre22_dev_fhlhy,ara-aeb,7 +lre22_dev_fihvr,eng-iaf,7 +lre22_dev_fiizm,xho-xho,14 +lre22_dev_fiksd,fra-ntf,12 +lre22_dev_fitjt,tso-tso,6 +lre22_dev_fiuun,eng-ens,7 +lre22_dev_fjdul,ara-ayl,3 +lre22_dev_fjgrh,ven-ven,8 +lre22_dev_fkaqj,nbl-nbl,13 +lre22_dev_flfgv,ara-aeb,9 +lre22_dev_flirl,fra-ntf,13 +lre22_dev_fljab,fra-ntf,14 +lre22_dev_flnzm,tir-tir,11 +lre22_dev_flsmp,orm-orm,15 +lre22_dev_fmjvq,ven-ven,2 +lre22_dev_fmmxd,afr-afr,4 +lre22_dev_fnglh,afr-afr,13 +lre22_dev_fnsax,xho-xho,6 +lre22_dev_fojyn,eng-ens,5 +lre22_dev_foqgk,ven-ven,2 +lre22_dev_fovba,ara-arq,4 +lre22_dev_fozyj,ara-arq,2 +lre22_dev_fpavw,ara-aeb,8 +lre22_dev_fptba,eng-ens,3 +lre22_dev_fqdfc,tso-tso,11 
+lre22_dev_fqdhm,eng-iaf,8 +lre22_dev_fqfet,nbl-nbl,7 +lre22_dev_fqgty,fra-ntf,4 +lre22_dev_fqgyd,zul-zul,10 +lre22_dev_fqvup,tso-tso,2 +lre22_dev_frviu,ara-aeb,10 +lre22_dev_frwfk,nbl-nbl,9 +lre22_dev_fsygm,eng-iaf,5 +lre22_dev_ftfjv,orm-orm,11 +lre22_dev_ftjvg,afr-afr,12 +lre22_dev_ftmnu,ara-aeb,10 +lre22_dev_ftrcl,eng-ens,3 +lre22_dev_ftygz,eng-ens,8 +lre22_dev_fughv,eng-iaf,3 +lre22_dev_fuhuk,ara-ayl,5 +lre22_dev_fusyr,ven-ven,13 +lre22_dev_futhm,zul-zul,5 +lre22_dev_fvbzh,ara-ayl,7 +lre22_dev_fvecf,ven-ven,9 +lre22_dev_fvktn,fra-ntf,8 +lre22_dev_fvpts,orm-orm,6 +lre22_dev_fvsmm,eng-iaf,12 +lre22_dev_fvvgc,ara-arq,5 +lre22_dev_fwvzh,zul-zul,2 +lre22_dev_fwwsy,xho-xho,5 +lre22_dev_fxggn,fra-ntf,1 +lre22_dev_fxqfi,orm-orm,10 +lre22_dev_fxuqw,ara-ayl,3 +lre22_dev_fxwfc,eng-iaf,12 +lre22_dev_fymdc,tso-tso,4 +lre22_dev_fywir,tso-tso,10 +lre22_dev_fzjzu,xho-xho,14 +lre22_dev_fzpeh,ara-aeb,10 +lre22_dev_fztdi,tir-tir,9 +lre22_dev_gcced,ven-ven,6 +lre22_dev_gchqj,zul-zul,10 +lre22_dev_gctmk,xho-xho,12 +lre22_dev_gcupw,ven-ven,7 +lre22_dev_gdfdn,tir-tir,5 +lre22_dev_gdlpg,tir-tir,3 +lre22_dev_gdrwq,fra-ntf,14 +lre22_dev_gdvjh,afr-afr,5 +lre22_dev_gdvtc,eng-iaf,13 +lre22_dev_gdxck,orm-orm,4 +lre22_dev_gecgq,afr-afr,12 +lre22_dev_gevbs,nbl-nbl,13 +lre22_dev_gfqxw,tir-tir,11 +lre22_dev_gfujh,eng-ens,8 +lre22_dev_gfwqx,fra-ntf,10 +lre22_dev_ggchj,tir-tir,10 +lre22_dev_ggeie,ara-arq,8 +lre22_dev_ggqob,ara-aeb,9 +lre22_dev_ghllb,eng-ens,8 +lre22_dev_ghlqh,afr-afr,12 +lre22_dev_ghmuk,afr-afr,13 +lre22_dev_ghskg,tso-tso,4 +lre22_dev_ghwmw,ara-arq,2 +lre22_dev_giijn,ven-ven,6 +lre22_dev_gised,xho-xho,9 +lre22_dev_gisrt,tir-tir,9 +lre22_dev_gjptx,nbl-nbl,4 +lre22_dev_gjvkc,ara-arq,7 +lre22_dev_gjxkc,eng-iaf,13 +lre22_dev_gkywh,ara-aeb,7 +lre22_dev_glhtl,eng-iaf,3 +lre22_dev_glulw,ara-aeb,8 +lre22_dev_gmpja,nbl-nbl,3 +lre22_dev_gmpjm,nbl-nbl,12 +lre22_dev_gnkvz,eng-iaf,13 +lre22_dev_gnmcz,nbl-nbl,4 +lre22_dev_goggr,afr-afr,5 +lre22_dev_goqov,ara-aeb,8 +lre22_dev_gpzgq,tso-tso,9 +lre22_dev_gpzuz,fra-ntf,5 +lre22_dev_gqpul,ara-arq,10 +lre22_dev_gratu,tir-tir,7 +lre22_dev_grewx,afr-afr,9 +lre22_dev_grizt,eng-ens,2 +lre22_dev_grsam,afr-afr,11 +lre22_dev_grsyr,zul-zul,1 +lre22_dev_grxus,nbl-nbl,15 +lre22_dev_gsanj,ven-ven,13 +lre22_dev_gsbwz,nbl-nbl,9 +lre22_dev_gtwjj,tso-tso,4 +lre22_dev_gtxwq,orm-orm,12 +lre22_dev_gubts,ara-ayl,0 +lre22_dev_gvawh,xho-xho,11 +lre22_dev_gvfsb,ara-aeb,10 +lre22_dev_gvhgg,afr-afr,9 +lre22_dev_gvnaj,fra-ntf,8 +lre22_dev_gvysc,ara-aeb,10 +lre22_dev_gwfkz,xho-xho,2 +lre22_dev_gwnqp,xho-xho,7 +lre22_dev_gwumi,tso-tso,3 +lre22_dev_gwvcw,xho-xho,11 +lre22_dev_gwwxz,eng-iaf,1 +lre22_dev_gwzrc,eng-ens,11 +lre22_dev_gxtlx,fra-ntf,13 +lre22_dev_gxygl,tso-tso,9 +lre22_dev_gycld,orm-orm,4 +lre22_dev_gzakl,nbl-nbl,15 +lre22_dev_gzrgo,ara-arq,9 +lre22_dev_hbkul,orm-orm,6 +lre22_dev_hbodn,eng-ens,10 +lre22_dev_hbwgy,ara-arq,6 +lre22_dev_hbwyc,nbl-nbl,5 +lre22_dev_hczek,fra-ntf,7 +lre22_dev_hdpsb,nbl-nbl,6 +lre22_dev_hdvsb,ara-aeb,8 +lre22_dev_hetsy,xho-xho,10 +lre22_dev_hfgrm,ven-ven,12 +lre22_dev_hfurz,afr-afr,13 +lre22_dev_hfwyw,nbl-nbl,11 +lre22_dev_hgdqx,tso-tso,3 +lre22_dev_hgwdk,eng-ens,8 +lre22_dev_hgxqf,eng-iaf,8 +lre22_dev_hgyuk,ven-ven,11 +lre22_dev_hhetm,fra-ntf,14 +lre22_dev_hhjki,ara-arq,8 +lre22_dev_hhvtc,ara-arq,10 +lre22_dev_hhxqv,tso-tso,5 +lre22_dev_hiisb,nbl-nbl,15 +lre22_dev_hioxp,tso-tso,3 +lre22_dev_hjqaf,ara-aeb,9 +lre22_dev_hjqid,orm-orm,6 +lre22_dev_hjzwc,eng-iaf,3 +lre22_dev_hkdzu,ara-arq,9 +lre22_dev_hlatl,eng-iaf,12 +lre22_dev_hlywv,nbl-nbl,2 
+lre22_dev_hlzxa,ven-ven,7 +lre22_dev_hmvzg,ara-ayl,3 +lre22_dev_hnjgb,eng-ens,9 +lre22_dev_hntdv,eng-ens,11 +lre22_dev_hoish,tir-tir,2 +lre22_dev_hokbg,ara-ayl,6 +lre22_dev_hondp,eng-iaf,8 +lre22_dev_hpbve,tir-tir,11 +lre22_dev_hpdvc,fra-ntf,8 +lre22_dev_hpgst,orm-orm,5 +lre22_dev_hqbjb,xho-xho,5 +lre22_dev_hqdev,tso-tso,2 +lre22_dev_hqidg,tir-tir,1 +lre22_dev_hqids,afr-afr,9 +lre22_dev_hqltr,tir-tir,4 +lre22_dev_hqqhq,eng-ens,11 +lre22_dev_hrmcg,zul-zul,13 +lre22_dev_hrrcp,afr-afr,8 +lre22_dev_hstgi,xho-xho,9 +lre22_dev_hsvpq,ara-ayl,9 +lre22_dev_hswsy,ara-aeb,4 +lre22_dev_htcgm,eng-iaf,6 +lre22_dev_htedo,xho-xho,13 +lre22_dev_hthkx,eng-iaf,7 +lre22_dev_htohd,afr-afr,6 +lre22_dev_htxik,fra-ntf,0 +lre22_dev_huqbr,xho-xho,10 +lre22_dev_hvdom,afr-afr,8 +lre22_dev_hvkoa,afr-afr,13 +lre22_dev_hvnkg,tir-tir,9 +lre22_dev_hvocp,nbl-nbl,12 +lre22_dev_hvqzj,zul-zul,12 +lre22_dev_hvwph,afr-afr,3 +lre22_dev_hwaqg,zul-zul,8 +lre22_dev_hwgvu,ara-aeb,6 +lre22_dev_hwhlz,ven-ven,11 +lre22_dev_hwkes,fra-ntf,12 +lre22_dev_hwvna,eng-ens,2 +lre22_dev_hxfim,eng-iaf,12 +lre22_dev_hxmdw,afr-afr,10 +lre22_dev_hxrnp,zul-zul,6 +lre22_dev_hxvie,tir-tir,9 +lre22_dev_hxvju,zul-zul,3 +lre22_dev_hxzxm,zul-zul,6 +lre22_dev_hybef,nbl-nbl,14 +lre22_dev_hyfok,eng-ens,2 +lre22_dev_hyscv,ara-arq,4 +lre22_dev_hyzod,eng-iaf,6 +lre22_dev_hzdpb,tso-tso,7 +lre22_dev_hzjwn,ara-aeb,5 +lre22_dev_hzljv,tir-tir,8 +lre22_dev_hzomy,tso-tso,9 +lre22_dev_iaaar,tso-tso,9 +lre22_dev_iaimu,afr-afr,13 +lre22_dev_iakmg,orm-orm,15 +lre22_dev_iarxv,ara-aeb,9 +lre22_dev_iaywv,ara-ayl,6 +lre22_dev_ibcne,eng-ens,11 +lre22_dev_ibeth,zul-zul,2 +lre22_dev_ibwbi,tir-tir,9 +lre22_dev_ibyqr,tso-tso,7 +lre22_dev_iccwp,eng-iaf,6 +lre22_dev_ichmi,afr-afr,12 +lre22_dev_idjrt,zul-zul,8 +lre22_dev_iegng,afr-afr,8 +lre22_dev_iezrr,ara-ayl,7 +lre22_dev_ifaib,ara-ayl,5 +lre22_dev_ifhil,tso-tso,9 +lre22_dev_ifptd,ven-ven,12 +lre22_dev_ifriu,ara-aeb,6 +lre22_dev_ignvp,zul-zul,13 +lre22_dev_igxzy,eng-iaf,12 +lre22_dev_ihdva,fra-ntf,10 +lre22_dev_iiydv,eng-iaf,5 +lre22_dev_ijoyg,ara-ayl,9 +lre22_dev_ikghg,eng-iaf,7 +lre22_dev_ikijv,ven-ven,2 +lre22_dev_ilawb,ara-aeb,8 +lre22_dev_ilgnm,orm-orm,6 +lre22_dev_ilqhp,orm-orm,13 +lre22_dev_imrsx,tso-tso,8 +lre22_dev_inrfz,ara-arq,1 +lre22_dev_inrlw,eng-ens,1 +lre22_dev_inttm,tso-tso,8 +lre22_dev_iorip,ven-ven,13 +lre22_dev_ioryq,ara-aeb,8 +lre22_dev_iosse,afr-afr,1 +lre22_dev_ipahz,tir-tir,12 +lre22_dev_ipaup,tir-tir,10 +lre22_dev_ipllz,tir-tir,12 +lre22_dev_iprih,ara-aeb,4 +lre22_dev_iqkpj,tir-tir,6 +lre22_dev_iqowb,ara-aeb,0 +lre22_dev_iqzfp,orm-orm,15 +lre22_dev_irhue,tso-tso,8 +lre22_dev_irkvo,orm-orm,15 +lre22_dev_irnie,ara-aeb,8 +lre22_dev_irnxg,zul-zul,9 +lre22_dev_irsgt,ven-ven,2 +lre22_dev_isavf,nbl-nbl,0 +lre22_dev_isfpd,nbl-nbl,11 +lre22_dev_iskfd,ara-arq,4 +lre22_dev_isndz,ara-arq,6 +lre22_dev_istwz,nbl-nbl,15 +lre22_dev_isxpy,orm-orm,5 +lre22_dev_iszkk,tir-tir,9 +lre22_dev_itdot,ara-ayl,9 +lre22_dev_itfgh,eng-iaf,9 +lre22_dev_itlqd,tir-tir,12 +lre22_dev_itmbo,ara-aeb,10 +lre22_dev_itznp,ara-aeb,3 +lre22_dev_iucwv,zul-zul,5 +lre22_dev_iuowb,ara-aeb,8 +lre22_dev_iupes,zul-zul,4 +lre22_dev_iurgk,fra-ntf,4 +lre22_dev_ivcpr,nbl-nbl,12 +lre22_dev_ivrwa,ven-ven,3 +lre22_dev_ivvlb,afr-afr,11 +lre22_dev_ivwhm,tir-tir,6 +lre22_dev_iwoya,ara-aeb,4 +lre22_dev_iwpvu,orm-orm,5 +lre22_dev_ixpuq,ara-ayl,5 +lre22_dev_ixpyb,tso-tso,11 +lre22_dev_iyfiz,eng-iaf,5 +lre22_dev_iylyu,xho-xho,12 +lre22_dev_iyuli,zul-zul,13 +lre22_dev_iyupt,orm-orm,5 +lre22_dev_iyxjf,zul-zul,12 +lre22_dev_iyzgz,tso-tso,10 +lre22_dev_izepb,ara-arq,4 
+lre22_dev_izkix,ven-ven,10 +lre22_dev_izknz,ven-ven,12 +lre22_dev_jadfl,ara-arq,9 +lre22_dev_jafja,zul-zul,9 +lre22_dev_jamvn,ven-ven,1 +lre22_dev_jbach,eng-iaf,2 +lre22_dev_jbqcq,ara-aeb,6 +lre22_dev_jcxgo,afr-afr,6 +lre22_dev_jddrh,fra-ntf,13 +lre22_dev_jdjpg,tir-tir,12 +lre22_dev_jdtrb,eng-iaf,11 +lre22_dev_jdwjj,zul-zul,7 +lre22_dev_jdzqw,tir-tir,3 +lre22_dev_jeaev,nbl-nbl,8 +lre22_dev_jeobs,ara-aeb,9 +lre22_dev_jesxq,eng-ens,10 +lre22_dev_jgcla,ara-arq,2 +lre22_dev_jggxv,fra-ntf,3 +lre22_dev_jgntz,orm-orm,5 +lre22_dev_jhcao,ven-ven,7 +lre22_dev_jhgik,eng-ens,11 +lre22_dev_jhpkj,ara-arq,4 +lre22_dev_jhuof,orm-orm,15 +lre22_dev_jignq,ara-ayl,9 +lre22_dev_jjffc,ven-ven,13 +lre22_dev_jjkfe,eng-ens,9 +lre22_dev_jjqxi,ara-aeb,8 +lre22_dev_jjrgq,eng-iaf,4 +lre22_dev_jkacy,tso-tso,3 +lre22_dev_jkmin,orm-orm,15 +lre22_dev_jkobe,xho-xho,7 +lre22_dev_jkosd,zul-zul,10 +lre22_dev_jkovc,tso-tso,3 +lre22_dev_jktcq,zul-zul,7 +lre22_dev_jlodp,eng-ens,9 +lre22_dev_jmbjo,nbl-nbl,9 +lre22_dev_jmccw,ara-arq,3 +lre22_dev_jminj,fra-ntf,5 +lre22_dev_jmmyw,afr-afr,3 +lre22_dev_jobae,fra-ntf,13 +lre22_dev_jobsv,nbl-nbl,14 +lre22_dev_jobxi,ara-arq,5 +lre22_dev_joghi,ara-arq,6 +lre22_dev_johkj,xho-xho,7 +lre22_dev_jolqw,ara-ayl,5 +lre22_dev_jplye,fra-ntf,11 +lre22_dev_jpsmt,ara-arq,9 +lre22_dev_jqdnf,eng-iaf,13 +lre22_dev_jqqpg,orm-orm,5 +lre22_dev_jqqrs,nbl-nbl,11 +lre22_dev_jrmnp,tir-tir,9 +lre22_dev_jsahe,fra-ntf,12 +lre22_dev_jsciw,eng-ens,5 +lre22_dev_jsisu,eng-iaf,4 +lre22_dev_jstjq,zul-zul,4 +lre22_dev_jsxuw,eng-iaf,8 +lre22_dev_jtaxh,ven-ven,4 +lre22_dev_jtgjo,ara-arq,9 +lre22_dev_jtxor,orm-orm,3 +lre22_dev_junyj,orm-orm,5 +lre22_dev_juykt,ara-ayl,7 +lre22_dev_jvqzf,fra-ntf,9 +lre22_dev_jvvxl,afr-afr,7 +lre22_dev_jvxpt,nbl-nbl,1 +lre22_dev_jwfeb,eng-iaf,4 +lre22_dev_jwmmp,eng-ens,3 +lre22_dev_jwyiq,tso-tso,10 +lre22_dev_jxcmp,ara-aeb,10 +lre22_dev_jxfsy,ara-ayl,9 +lre22_dev_jxjar,tso-tso,10 +lre22_dev_jylrr,ara-aeb,9 +lre22_dev_jzciw,orm-orm,5 +lre22_dev_jzcyt,tso-tso,5 +lre22_dev_jzhpf,tso-tso,4 +lre22_dev_jzidh,afr-afr,11 +lre22_dev_jznzw,eng-iaf,6 +lre22_dev_jzoqd,afr-afr,7 +lre22_dev_jzwnu,ven-ven,11 +lre22_dev_kaoyk,afr-afr,6 +lre22_dev_kasoe,zul-zul,12 +lre22_dev_kaygq,eng-ens,9 +lre22_dev_kayqh,fra-ntf,8 +lre22_dev_kbpcw,eng-iaf,3 +lre22_dev_kbtrx,orm-orm,10 +lre22_dev_kcebk,ven-ven,7 +lre22_dev_kdbil,orm-orm,15 +lre22_dev_kddhf,ara-arq,10 +lre22_dev_kdeij,ara-ayl,3 +lre22_dev_kdiak,zul-zul,12 +lre22_dev_kedwl,nbl-nbl,12 +lre22_dev_keouf,fra-ntf,9 +lre22_dev_keozw,ara-aeb,10 +lre22_dev_kervm,eng-ens,7 +lre22_dev_kflpm,xho-xho,1 +lre22_dev_kfqpd,ara-arq,8 +lre22_dev_kgaqj,ara-aeb,8 +lre22_dev_kghnx,fra-ntf,3 +lre22_dev_kgoze,zul-zul,4 +lre22_dev_kgrxe,fra-ntf,9 +lre22_dev_kgsdu,ara-arq,5 +lre22_dev_kheef,xho-xho,15 +lre22_dev_khgyl,xho-xho,8 +lre22_dev_khsgr,tso-tso,7 +lre22_dev_khxvm,nbl-nbl,9 +lre22_dev_kijjo,ara-aeb,3 +lre22_dev_kiush,xho-xho,2 +lre22_dev_kiyso,ara-arq,1 +lre22_dev_kjewo,ven-ven,6 +lre22_dev_kjgkg,ara-ayl,5 +lre22_dev_kjksh,ven-ven,3 +lre22_dev_kjomd,afr-afr,4 +lre22_dev_kjrcy,afr-afr,11 +lre22_dev_kkauw,fra-ntf,10 +lre22_dev_kkiew,orm-orm,15 +lre22_dev_kkyyu,zul-zul,8 +lre22_dev_klafc,ara-ayl,4 +lre22_dev_klalo,eng-ens,5 +lre22_dev_kliip,afr-afr,1 +lre22_dev_klkxg,tso-tso,8 +lre22_dev_klqwc,ara-arq,7 +lre22_dev_kmbgg,tir-tir,12 +lre22_dev_kmgoo,tir-tir,8 +lre22_dev_kmnko,zul-zul,3 +lre22_dev_kmtyc,ara-aeb,8 +lre22_dev_kmxqj,xho-xho,8 +lre22_dev_kmzdw,fra-ntf,3 +lre22_dev_knxsi,ara-arq,9 +lre22_dev_kofob,orm-orm,7 +lre22_dev_kokfk,fra-ntf,14 +lre22_dev_kokir,nbl-nbl,12 
+lre22_dev_kooxu,ara-arq,9 +lre22_dev_korip,tso-tso,7 +lre22_dev_kpbnd,zul-zul,4 +lre22_dev_kpnyf,eng-iaf,3 +lre22_dev_kpwts,ara-ayl,8 +lre22_dev_kpxne,orm-orm,6 +lre22_dev_kpzbl,ven-ven,12 +lre22_dev_kqact,zul-zul,0 +lre22_dev_kqfbl,eng-iaf,12 +lre22_dev_kqfsm,zul-zul,5 +lre22_dev_kqfyp,ara-arq,1 +lre22_dev_kqkqj,ara-ayl,7 +lre22_dev_kqvwr,xho-xho,13 +lre22_dev_kragl,zul-zul,13 +lre22_dev_krbdn,xho-xho,14 +lre22_dev_ksake,ara-aeb,8 +lre22_dev_ksoly,nbl-nbl,11 +lre22_dev_kttyt,orm-orm,5 +lre22_dev_kttzq,tso-tso,9 +lre22_dev_ktwaf,zul-zul,3 +lre22_dev_ktwqf,ven-ven,6 +lre22_dev_ktxef,zul-zul,0 +lre22_dev_ktztb,orm-orm,12 +lre22_dev_kufkm,nbl-nbl,15 +lre22_dev_kuqsu,afr-afr,9 +lre22_dev_kuyka,tir-tir,4 +lre22_dev_kvcpn,ara-ayl,3 +lre22_dev_kvghz,eng-iaf,10 +lre22_dev_kvswv,ven-ven,11 +lre22_dev_kxkos,orm-orm,10 +lre22_dev_kxkzg,ara-ayl,9 +lre22_dev_kxqef,ven-ven,12 +lre22_dev_kyjpf,ven-ven,7 +lre22_dev_kynap,ara-ayl,9 +lre22_dev_kyptg,ven-ven,8 +lre22_dev_kytyr,nbl-nbl,11 +lre22_dev_kywmf,orm-orm,4 +lre22_dev_kzibn,zul-zul,3 +lre22_dev_kzqxx,fra-ntf,1 +lre22_dev_lacgv,tso-tso,7 +lre22_dev_lagpe,tso-tso,6 +lre22_dev_lanuu,tso-tso,9 +lre22_dev_lapag,afr-afr,6 +lre22_dev_larnq,zul-zul,4 +lre22_dev_lbbvq,xho-xho,8 +lre22_dev_lbfca,ara-arq,8 +lre22_dev_lbhoj,orm-orm,11 +lre22_dev_lbiin,ara-ayl,4 +lre22_dev_lcdyj,ara-arq,9 +lre22_dev_ldasz,fra-ntf,9 +lre22_dev_ldbur,tso-tso,1 +lre22_dev_lddhs,orm-orm,12 +lre22_dev_ldedw,ara-aeb,5 +lre22_dev_ldmbr,ara-ayl,5 +lre22_dev_ldmqc,tir-tir,7 +lre22_dev_leadw,eng-iaf,3 +lre22_dev_leaqq,tso-tso,10 +lre22_dev_ledsh,afr-afr,11 +lre22_dev_leovk,afr-afr,6 +lre22_dev_lexlh,ara-aeb,2 +lre22_dev_lfilk,eng-ens,10 +lre22_dev_lfyll,zul-zul,10 +lre22_dev_lgada,zul-zul,6 +lre22_dev_lgcjy,afr-afr,9 +lre22_dev_lgfri,ara-aeb,5 +lre22_dev_lgkbt,xho-xho,4 +lre22_dev_lhbjq,ara-arq,0 +lre22_dev_lhemi,xho-xho,9 +lre22_dev_lhfne,ara-arq,6 +lre22_dev_lhmtg,ara-arq,9 +lre22_dev_lieso,ara-aeb,8 +lre22_dev_likcy,afr-afr,13 +lre22_dev_lipyu,zul-zul,12 +lre22_dev_lisum,ven-ven,4 +lre22_dev_ljevp,ara-ayl,3 +lre22_dev_ljijh,orm-orm,3 +lre22_dev_ljylg,nbl-nbl,13 +lre22_dev_lkfig,ara-ayl,2 +lre22_dev_lklnc,ara-arq,3 +lre22_dev_lkopy,tir-tir,9 +lre22_dev_lllwi,eng-iaf,5 +lre22_dev_llstb,nbl-nbl,10 +lre22_dev_lmeax,eng-iaf,10 +lre22_dev_lmkui,ara-arq,7 +lre22_dev_lmrbp,tir-tir,9 +lre22_dev_lnejh,eng-ens,10 +lre22_dev_lnttv,ven-ven,10 +lre22_dev_loxqz,eng-iaf,8 +lre22_dev_loybq,ara-aeb,10 +lre22_dev_lpadb,fra-ntf,4 +lre22_dev_lpahk,nbl-nbl,11 +lre22_dev_lphgs,tir-tir,7 +lre22_dev_lphoa,eng-ens,2 +lre22_dev_lpkie,eng-iaf,5 +lre22_dev_lpkpc,zul-zul,6 +lre22_dev_lptpx,eng-iaf,4 +lre22_dev_lqwcv,xho-xho,13 +lre22_dev_lrgwx,orm-orm,10 +lre22_dev_lruoj,orm-orm,2 +lre22_dev_lrwee,fra-ntf,10 +lre22_dev_lsess,ven-ven,1 +lre22_dev_lsycj,tir-tir,9 +lre22_dev_ltaoe,eng-ens,8 +lre22_dev_ltish,ara-aeb,5 +lre22_dev_ltqeb,eng-ens,8 +lre22_dev_ltzfg,ven-ven,10 +lre22_dev_luuhd,ara-arq,2 +lre22_dev_lvejl,zul-zul,11 +lre22_dev_lvgsm,tir-tir,10 +lre22_dev_lvwle,xho-xho,7 +lre22_dev_lvxea,tir-tir,8 +lre22_dev_lwsmk,eng-ens,10 +lre22_dev_lwzhq,ara-ayl,3 +lre22_dev_lxbdd,ara-ayl,8 +lre22_dev_lxdgx,nbl-nbl,1 +lre22_dev_lxjij,ara-ayl,7 +lre22_dev_lxldm,tso-tso,8 +lre22_dev_lxmsa,zul-zul,11 +lre22_dev_lxugv,zul-zul,13 +lre22_dev_lxwig,tso-tso,4 +lre22_dev_lyigi,xho-xho,4 +lre22_dev_lymzv,ara-arq,6 +lre22_dev_lyuls,ara-arq,4 +lre22_dev_lyyzw,ara-ayl,5 +lre22_dev_lzhrm,ara-arq,8 +lre22_dev_lzjgb,xho-xho,12 +lre22_dev_lzrpe,xho-xho,8 +lre22_dev_lzvmq,fra-ntf,13 +lre22_dev_maagy,ven-ven,6 +lre22_dev_mabmx,ara-arq,4 
+lre22_dev_macre,zul-zul,7 +lre22_dev_maggb,nbl-nbl,7 +lre22_dev_margf,ara-ayl,6 +lre22_dev_maydg,eng-iaf,4 +lre22_dev_mbsgm,zul-zul,7 +lre22_dev_mbttd,fra-ntf,14 +lre22_dev_mcebh,tso-tso,8 +lre22_dev_mcfve,ara-ayl,3 +lre22_dev_mclrc,zul-zul,12 +lre22_dev_mcvgl,ara-ayl,5 +lre22_dev_mdgok,ara-aeb,5 +lre22_dev_mdilb,ven-ven,3 +lre22_dev_mdzqr,nbl-nbl,11 +lre22_dev_mehfu,ara-arq,3 +lre22_dev_meiyg,eng-ens,11 +lre22_dev_merbq,orm-orm,9 +lre22_dev_mfoys,afr-afr,8 +lre22_dev_mgpfx,xho-xho,8 +lre22_dev_mgtzj,zul-zul,12 +lre22_dev_mgxxc,ven-ven,11 +lre22_dev_mhldj,nbl-nbl,14 +lre22_dev_mhvio,eng-iaf,6 +lre22_dev_mhxgi,tir-tir,9 +lre22_dev_miegc,fra-ntf,6 +lre22_dev_miwyu,ara-aeb,8 +lre22_dev_mjocm,ara-aeb,2 +lre22_dev_mjqij,orm-orm,12 +lre22_dev_mjxgy,afr-afr,8 +lre22_dev_mkeyt,tir-tir,12 +lre22_dev_mklub,ven-ven,4 +lre22_dev_mknzf,ara-aeb,10 +lre22_dev_mlhes,ara-arq,9 +lre22_dev_mlhse,tso-tso,3 +lre22_dev_mlhtc,orm-orm,8 +lre22_dev_mlpuq,ven-ven,10 +lre22_dev_mluow,orm-orm,2 +lre22_dev_mmwtu,ara-arq,4 +lre22_dev_mmwzf,tso-tso,7 +lre22_dev_mnjdq,tir-tir,10 +lre22_dev_mnkfe,nbl-nbl,4 +lre22_dev_mnmcm,ara-arq,3 +lre22_dev_mocss,xho-xho,9 +lre22_dev_mohxo,zul-zul,12 +lre22_dev_mojui,fra-ntf,1 +lre22_dev_mojvy,xho-xho,7 +lre22_dev_molqa,fra-ntf,14 +lre22_dev_mopiq,nbl-nbl,14 +lre22_dev_moqto,tir-tir,12 +lre22_dev_morri,ara-aeb,8 +lre22_dev_mpxyg,eng-ens,4 +lre22_dev_mqiap,xho-xho,14 +lre22_dev_mqxep,ara-ayl,2 +lre22_dev_mrcoe,ara-ayl,7 +lre22_dev_mriiq,tso-tso,4 +lre22_dev_mryoy,eng-ens,11 +lre22_dev_mryzh,ara-arq,4 +lre22_dev_msadm,ven-ven,2 +lre22_dev_msghz,nbl-nbl,11 +lre22_dev_mtpfp,ara-aeb,9 +lre22_dev_mtqft,orm-orm,14 +lre22_dev_mtzvt,ara-aeb,10 +lre22_dev_munim,xho-xho,15 +lre22_dev_murhb,nbl-nbl,1 +lre22_dev_mvbra,xho-xho,4 +lre22_dev_mvhza,afr-afr,13 +lre22_dev_mviud,xho-xho,12 +lre22_dev_mvxjk,afr-afr,9 +lre22_dev_mwnkm,orm-orm,8 +lre22_dev_mwoml,xho-xho,9 +lre22_dev_mxhup,eng-ens,8 +lre22_dev_mykuh,ara-ayl,5 +lre22_dev_myqfn,eng-iaf,4 +lre22_dev_mywmj,ven-ven,9 +lre22_dev_mzbrr,ara-arq,10 +lre22_dev_mzsiq,afr-afr,9 +lre22_dev_mztms,eng-ens,3 +lre22_dev_mzuxc,ara-arq,9 +lre22_dev_nbdbe,ara-ayl,7 +lre22_dev_nbjqz,ara-aeb,9 +lre22_dev_nbyhp,afr-afr,3 +lre22_dev_ncnyb,ven-ven,8 +lre22_dev_ncocl,nbl-nbl,6 +lre22_dev_ndecq,ara-ayl,8 +lre22_dev_ndjsl,nbl-nbl,6 +lre22_dev_nelsk,orm-orm,0 +lre22_dev_nenly,eng-iaf,11 +lre22_dev_neqkb,ven-ven,2 +lre22_dev_nfjid,orm-orm,12 +lre22_dev_nfkqr,orm-orm,8 +lre22_dev_nfoas,orm-orm,15 +lre22_dev_ngjbm,eng-ens,10 +lre22_dev_ngmbz,eng-iaf,9 +lre22_dev_ngnua,fra-ntf,10 +lre22_dev_nguuu,fra-ntf,13 +lre22_dev_ngyse,ven-ven,7 +lre22_dev_nhfso,fra-ntf,14 +lre22_dev_nhuue,zul-zul,1 +lre22_dev_niack,ara-ayl,8 +lre22_dev_niari,ven-ven,7 +lre22_dev_nibme,ara-arq,9 +lre22_dev_nikby,tso-tso,10 +lre22_dev_nimex,ara-ayl,8 +lre22_dev_nivmv,xho-xho,11 +lre22_dev_nkebu,eng-ens,5 +lre22_dev_nkgml,eng-ens,10 +lre22_dev_nkofi,fra-ntf,11 +lre22_dev_nkrez,xho-xho,5 +lre22_dev_nkscn,tso-tso,5 +lre22_dev_nkwrs,ara-aeb,2 +lre22_dev_nkxcy,afr-afr,4 +lre22_dev_nlast,xho-xho,12 +lre22_dev_nlcun,eng-ens,0 +lre22_dev_nljyr,afr-afr,5 +lre22_dev_nlkdv,eng-iaf,12 +lre22_dev_nlpcs,ara-ayl,7 +lre22_dev_nlrcn,ara-ayl,4 +lre22_dev_nlxla,xho-xho,0 +lre22_dev_nmmij,ara-ayl,4 +lre22_dev_nmrkv,fra-ntf,12 +lre22_dev_nmufp,tso-tso,10 +lre22_dev_nnbmo,tso-tso,10 +lre22_dev_nnnpi,afr-afr,4 +lre22_dev_nnzok,tir-tir,5 +lre22_dev_noqch,fra-ntf,12 +lre22_dev_nownd,xho-xho,2 +lre22_dev_npabl,nbl-nbl,5 +lre22_dev_npjhu,afr-afr,6 +lre22_dev_nqbks,afr-afr,11 +lre22_dev_nqijo,orm-orm,7 +lre22_dev_nqljj,ara-arq,6 
+lre22_dev_nqvfr,tir-tir,7 +lre22_dev_nrtej,tir-tir,11 +lre22_dev_nshvj,nbl-nbl,7 +lre22_dev_nsmyy,tir-tir,12 +lre22_dev_nsqcm,fra-ntf,13 +lre22_dev_nstrj,nbl-nbl,9 +lre22_dev_nsvla,nbl-nbl,10 +lre22_dev_nthbx,eng-ens,0 +lre22_dev_nvwkf,ven-ven,0 +lre22_dev_nvwzy,tso-tso,11 +lre22_dev_nvyyg,orm-orm,7 +lre22_dev_nxdml,eng-ens,1 +lre22_dev_nxmxb,zul-zul,12 +lre22_dev_nxqpl,nbl-nbl,13 +lre22_dev_nxslf,fra-ntf,9 +lre22_dev_nyaof,nbl-nbl,5 +lre22_dev_nzeot,zul-zul,12 +lre22_dev_nzhhf,ara-ayl,7 +lre22_dev_nzpbh,fra-ntf,14 +lre22_dev_nzyjp,orm-orm,4 +lre22_dev_nzzyd,xho-xho,11 +lre22_dev_oaiij,ven-ven,7 +lre22_dev_oaimr,orm-orm,14 +lre22_dev_oatzl,fra-ntf,13 +lre22_dev_oaycx,ara-ayl,8 +lre22_dev_objwd,eng-ens,1 +lre22_dev_oboem,tir-tir,9 +lre22_dev_obzyj,xho-xho,5 +lre22_dev_occhn,fra-ntf,9 +lre22_dev_ocfcr,ven-ven,7 +lre22_dev_ochni,ven-ven,13 +lre22_dev_ociva,tir-tir,5 +lre22_dev_odofq,xho-xho,5 +lre22_dev_odtjr,eng-ens,11 +lre22_dev_oejjy,fra-ntf,4 +lre22_dev_offnw,afr-afr,8 +lre22_dev_ofgqs,ara-ayl,6 +lre22_dev_ofkvj,xho-xho,15 +lre22_dev_ofzhh,orm-orm,11 +lre22_dev_ogilp,afr-afr,6 +lre22_dev_oglxd,ara-ayl,4 +lre22_dev_ogoyt,tso-tso,8 +lre22_dev_ogpou,ven-ven,3 +lre22_dev_ohatz,eng-ens,10 +lre22_dev_ohlzs,nbl-nbl,15 +lre22_dev_ohpzj,tir-tir,4 +lre22_dev_ohzdt,ara-aeb,5 +lre22_dev_oicrh,eng-ens,9 +lre22_dev_oigem,orm-orm,14 +lre22_dev_ojbnw,ara-arq,4 +lre22_dev_ojebm,ven-ven,7 +lre22_dev_ojila,ara-arq,4 +lre22_dev_ojiso,fra-ntf,5 +lre22_dev_ojpdy,tso-tso,9 +lre22_dev_ojtki,tir-tir,11 +lre22_dev_ojxso,nbl-nbl,4 +lre22_dev_okdqa,fra-ntf,14 +lre22_dev_oktvp,ara-ayl,7 +lre22_dev_okvsg,zul-zul,10 +lre22_dev_okyah,tso-tso,11 +lre22_dev_olabw,ara-arq,4 +lre22_dev_omhry,tir-tir,4 +lre22_dev_omnrf,eng-iaf,13 +lre22_dev_omptm,ven-ven,6 +lre22_dev_omqfq,fra-ntf,4 +lre22_dev_onqdn,fra-ntf,13 +lre22_dev_onsyx,tso-tso,9 +lre22_dev_onvgj,tir-tir,6 +lre22_dev_onzha,zul-zul,10 +lre22_dev_ooptw,nbl-nbl,5 +lre22_dev_oowvo,eng-ens,11 +lre22_dev_ooyea,tso-tso,2 +lre22_dev_oozri,ven-ven,0 +lre22_dev_opazz,ara-ayl,1 +lre22_dev_opqkl,nbl-nbl,11 +lre22_dev_oqsva,ara-ayl,2 +lre22_dev_oquxw,nbl-nbl,15 +lre22_dev_orktv,afr-afr,5 +lre22_dev_ornjf,ara-ayl,6 +lre22_dev_ortbp,ara-arq,0 +lre22_dev_osauy,fra-ntf,12 +lre22_dev_osnch,afr-afr,1 +lre22_dev_otelo,eng-iaf,7 +lre22_dev_otewx,tso-tso,10 +lre22_dev_otnwj,eng-ens,3 +lre22_dev_ouecw,ara-aeb,10 +lre22_dev_ouzui,ara-arq,3 +lre22_dev_ovdtj,ara-ayl,6 +lre22_dev_ovjny,tso-tso,1 +lre22_dev_ovqwp,ara-ayl,7 +lre22_dev_ovvkn,afr-afr,11 +lre22_dev_ovvmi,tso-tso,2 +lre22_dev_owyeq,ara-arq,6 +lre22_dev_oxlrt,ara-aeb,10 +lre22_dev_oybst,zul-zul,9 +lre22_dev_oybua,nbl-nbl,2 +lre22_dev_oykjs,tso-tso,4 +lre22_dev_oyswm,ara-arq,8 +lre22_dev_oyxbj,ven-ven,8 +lre22_dev_oyxtq,eng-ens,11 +lre22_dev_oyyxh,ara-arq,8 +lre22_dev_ozbct,tir-tir,12 +lre22_dev_ozcvt,ara-aeb,10 +lre22_dev_ozjel,ara-arq,10 +lre22_dev_ozmuj,zul-zul,3 +lre22_dev_ozuvk,tir-tir,10 +lre22_dev_paguh,fra-ntf,1 +lre22_dev_paspj,tir-tir,6 +lre22_dev_pbmai,fra-ntf,6 +lre22_dev_pbpug,zul-zul,10 +lre22_dev_pbsbs,tso-tso,10 +lre22_dev_pbszl,tso-tso,1 +lre22_dev_pbxxf,eng-iaf,2 +lre22_dev_pcgvn,eng-iaf,3 +lre22_dev_pcmbn,eng-ens,1 +lre22_dev_pcqce,ara-arq,8 +lre22_dev_pdlnr,tso-tso,2 +lre22_dev_pdrus,orm-orm,1 +lre22_dev_pedyx,eng-iaf,12 +lre22_dev_pegyr,nbl-nbl,11 +lre22_dev_pesej,ara-arq,4 +lre22_dev_pevhh,tir-tir,12 +lre22_dev_peykl,xho-xho,13 +lre22_dev_pezwc,tso-tso,4 +lre22_dev_pfemh,eng-iaf,4 +lre22_dev_pfrfc,ven-ven,8 +lre22_dev_pfsoa,nbl-nbl,15 +lre22_dev_pgeoo,tso-tso,9 +lre22_dev_pgwei,orm-orm,2 
+lre22_dev_pgxyv,tso-tso,4 +lre22_dev_phofb,ara-ayl,8 +lre22_dev_phula,nbl-nbl,14 +lre22_dev_phwnf,tso-tso,9 +lre22_dev_pifyx,orm-orm,9 +lre22_dev_pilvp,tso-tso,11 +lre22_dev_pinzj,nbl-nbl,11 +lre22_dev_piocw,ara-aeb,8 +lre22_dev_pipas,zul-zul,13 +lre22_dev_pipgo,afr-afr,3 +lre22_dev_pitmn,ara-arq,10 +lre22_dev_pizdz,ara-aeb,2 +lre22_dev_pizlx,ara-ayl,6 +lre22_dev_pjatg,ven-ven,9 +lre22_dev_pjavt,orm-orm,11 +lre22_dev_pjcec,eng-iaf,12 +lre22_dev_pjdwy,afr-afr,1 +lre22_dev_pjlmw,ara-ayl,7 +lre22_dev_pjsqe,eng-ens,7 +lre22_dev_pkdij,ara-ayl,3 +lre22_dev_pkekq,ara-aeb,3 +lre22_dev_pkpst,eng-iaf,9 +lre22_dev_plhqb,nbl-nbl,13 +lre22_dev_plowv,nbl-nbl,5 +lre22_dev_plrjb,xho-xho,12 +lre22_dev_pmove,eng-iaf,4 +lre22_dev_pneax,eng-ens,11 +lre22_dev_pnexr,nbl-nbl,9 +lre22_dev_pngea,nbl-nbl,11 +lre22_dev_pnipe,eng-ens,9 +lre22_dev_pnmlr,ara-arq,5 +lre22_dev_pnsuk,xho-xho,2 +lre22_dev_pnuct,tir-tir,10 +lre22_dev_pocev,ara-arq,4 +lre22_dev_powkd,eng-ens,9 +lre22_dev_pprvm,ara-ayl,7 +lre22_dev_ppyle,ara-aeb,7 +lre22_dev_pqfda,fra-ntf,5 +lre22_dev_pqryo,afr-afr,4 +lre22_dev_prrzc,afr-afr,9 +lre22_dev_psjuf,afr-afr,13 +lre22_dev_psngm,zul-zul,13 +lre22_dev_psroz,fra-ntf,13 +lre22_dev_pssqo,orm-orm,10 +lre22_dev_psvlh,fra-ntf,13 +lre22_dev_pswld,tir-tir,10 +lre22_dev_ptcns,nbl-nbl,11 +lre22_dev_ptobm,afr-afr,6 +lre22_dev_ptowg,tir-tir,8 +lre22_dev_ptreu,xho-xho,15 +lre22_dev_ptwru,fra-ntf,14 +lre22_dev_ptyff,ara-ayl,1 +lre22_dev_ptygm,tir-tir,3 +lre22_dev_pudne,ara-arq,4 +lre22_dev_puelp,zul-zul,9 +lre22_dev_purej,nbl-nbl,9 +lre22_dev_puyvb,ara-ayl,3 +lre22_dev_pvrdh,ara-aeb,9 +lre22_dev_pvryr,eng-ens,11 +lre22_dev_pwets,tir-tir,9 +lre22_dev_pwgnk,tir-tir,10 +lre22_dev_pwhyy,tir-tir,11 +lre22_dev_pwkgs,zul-zul,2 +lre22_dev_pwtdp,eng-iaf,0 +lre22_dev_pxccc,ara-ayl,5 +lre22_dev_pxpdo,xho-xho,14 +lre22_dev_pxsot,xho-xho,14 +lre22_dev_pxuhy,ara-aeb,6 +lre22_dev_pybxn,eng-iaf,11 +lre22_dev_pyoft,eng-iaf,12 +lre22_dev_pyvql,eng-iaf,7 +lre22_dev_pzcnz,nbl-nbl,2 +lre22_dev_pzhrk,ara-aeb,4 +lre22_dev_qadjy,ven-ven,7 +lre22_dev_qaeek,ven-ven,7 +lre22_dev_qafse,eng-iaf,11 +lre22_dev_qahft,ven-ven,13 +lre22_dev_qakoa,zul-zul,9 +lre22_dev_qalhd,ara-ayl,2 +lre22_dev_qazjh,ven-ven,11 +lre22_dev_qbfkw,eng-iaf,6 +lre22_dev_qbgcd,fra-ntf,14 +lre22_dev_qbisr,ara-ayl,3 +lre22_dev_qcnbm,ven-ven,3 +lre22_dev_qdcbb,tir-tir,5 +lre22_dev_qdfgi,zul-zul,12 +lre22_dev_qdmbj,eng-ens,4 +lre22_dev_qdwtg,fra-ntf,11 +lre22_dev_qefvt,ara-ayl,7 +lre22_dev_qffki,orm-orm,13 +lre22_dev_qfplk,tir-tir,8 +lre22_dev_qgxdl,xho-xho,14 +lre22_dev_qhadd,afr-afr,2 +lre22_dev_qhgaf,ara-ayl,7 +lre22_dev_qhinf,tir-tir,6 +lre22_dev_qhkjz,ara-aeb,6 +lre22_dev_qhlwj,ara-arq,8 +lre22_dev_qiarf,ara-arq,4 +lre22_dev_qidwl,ara-arq,5 +lre22_dev_qivzc,orm-orm,12 +lre22_dev_qizyt,ara-ayl,2 +lre22_dev_qjeue,ara-arq,9 +lre22_dev_qjgxh,ara-arq,1 +lre22_dev_qkdhb,afr-afr,1 +lre22_dev_qkiqi,orm-orm,4 +lre22_dev_qkoth,tir-tir,5 +lre22_dev_qkucq,fra-ntf,3 +lre22_dev_qltea,nbl-nbl,2 +lre22_dev_qlube,ara-aeb,5 +lre22_dev_qmcji,nbl-nbl,15 +lre22_dev_qmpzc,nbl-nbl,11 +lre22_dev_qmsog,tir-tir,3 +lre22_dev_qoech,eng-iaf,7 +lre22_dev_qovfg,ara-arq,10 +lre22_dev_qozzv,tir-tir,2 +lre22_dev_qpasx,tir-tir,3 +lre22_dev_qpauj,ara-aeb,4 +lre22_dev_qpfch,orm-orm,6 +lre22_dev_qpvea,orm-orm,9 +lre22_dev_qrgka,ara-arq,8 +lre22_dev_qrqmm,ara-ayl,7 +lre22_dev_qsaol,xho-xho,14 +lre22_dev_qsgpx,ara-arq,10 +lre22_dev_qspeg,eng-ens,7 +lre22_dev_qsvbe,fra-ntf,3 +lre22_dev_qsxoh,fra-ntf,5 +lre22_dev_qtbnc,xho-xho,7 +lre22_dev_qthzi,afr-afr,12 +lre22_dev_qtmaw,fra-ntf,13 +lre22_dev_qtnqh,eng-iaf,13 
+lre22_dev_qtpsb,tso-tso,8 +lre22_dev_qtqpc,eng-iaf,12 +lre22_dev_qtwfv,eng-iaf,4 +lre22_dev_qvamq,fra-ntf,9 +lre22_dev_qveuq,tir-tir,9 +lre22_dev_qvffg,orm-orm,0 +lre22_dev_qvplf,xho-xho,6 +lre22_dev_qvqvi,ven-ven,7 +lre22_dev_qwhsh,afr-afr,7 +lre22_dev_qwiwm,eng-ens,9 +lre22_dev_qxbch,ara-aeb,9 +lre22_dev_qxlca,nbl-nbl,2 +lre22_dev_qxscb,afr-afr,2 +lre22_dev_qyoqn,fra-ntf,9 +lre22_dev_qyrgs,nbl-nbl,3 +lre22_dev_qytdl,fra-ntf,9 +lre22_dev_qyyeb,eng-iaf,12 +lre22_dev_qyzqb,tso-tso,8 +lre22_dev_qzayi,orm-orm,12 +lre22_dev_qzexr,eng-iaf,5 +lre22_dev_qzrfi,ara-arq,10 +lre22_dev_qztjh,orm-orm,3 +lre22_dev_qztze,eng-iaf,12 +lre22_dev_raent,eng-iaf,2 +lre22_dev_ragjh,orm-orm,14 +lre22_dev_ramzu,ara-ayl,6 +lre22_dev_ratmr,ven-ven,7 +lre22_dev_rawak,ara-arq,9 +lre22_dev_rbbne,ven-ven,7 +lre22_dev_rbcul,eng-iaf,10 +lre22_dev_rbsoy,eng-iaf,12 +lre22_dev_rbxqy,tso-tso,9 +lre22_dev_rcejf,xho-xho,7 +lre22_dev_rdbzt,zul-zul,7 +lre22_dev_rdhpu,ara-aeb,8 +lre22_dev_rdsew,ven-ven,2 +lre22_dev_rdtkf,ven-ven,11 +lre22_dev_reeba,ara-ayl,6 +lre22_dev_relip,eng-iaf,11 +lre22_dev_rfdoh,ara-aeb,9 +lre22_dev_rfkja,xho-xho,11 +lre22_dev_rflev,ven-ven,3 +lre22_dev_rfqcx,nbl-nbl,14 +lre22_dev_rfwuv,eng-ens,1 +lre22_dev_rgsil,fra-ntf,6 +lre22_dev_rhcuj,ara-aeb,8 +lre22_dev_rhdgz,eng-iaf,12 +lre22_dev_rhpmn,ven-ven,7 +lre22_dev_rhtoe,eng-iaf,11 +lre22_dev_rhyqq,ara-aeb,2 +lre22_dev_riltn,ara-aeb,10 +lre22_dev_rinti,xho-xho,12 +lre22_dev_rioxh,xho-xho,12 +lre22_dev_ripix,tir-tir,10 +lre22_dev_rjbji,ven-ven,10 +lre22_dev_rjqbz,eng-iaf,0 +lre22_dev_rkemd,tir-tir,8 +lre22_dev_rktzl,nbl-nbl,13 +lre22_dev_rkuni,xho-xho,15 +lre22_dev_rlsgd,fra-ntf,5 +lre22_dev_rlypa,afr-afr,7 +lre22_dev_rmeav,ven-ven,8 +lre22_dev_rmejy,fra-ntf,12 +lre22_dev_rmeuz,zul-zul,6 +lre22_dev_rmjsj,nbl-nbl,5 +lre22_dev_rmtxj,eng-iaf,13 +lre22_dev_rnpyc,ara-ayl,2 +lre22_dev_rnunw,orm-orm,9 +lre22_dev_rnvvw,tso-tso,9 +lre22_dev_roavh,fra-ntf,6 +lre22_dev_rodbi,xho-xho,15 +lre22_dev_roeph,xho-xho,13 +lre22_dev_rolun,ara-ayl,3 +lre22_dev_roydh,xho-xho,7 +lre22_dev_rpajy,ara-aeb,8 +lre22_dev_rpdsm,ara-ayl,5 +lre22_dev_rpfae,afr-afr,9 +lre22_dev_rpvyc,eng-iaf,9 +lre22_dev_rqxot,tso-tso,9 +lre22_dev_rumiv,ara-aeb,9 +lre22_dev_runhh,afr-afr,6 +lre22_dev_ruvpd,eng-iaf,4 +lre22_dev_rvpkd,fra-ntf,1 +lre22_dev_rvqxq,orm-orm,12 +lre22_dev_rvstc,ara-arq,7 +lre22_dev_rwbea,tir-tir,9 +lre22_dev_rweyk,nbl-nbl,2 +lre22_dev_rwnfb,eng-ens,8 +lre22_dev_rwrhn,afr-afr,11 +lre22_dev_rxhkp,ara-arq,3 +lre22_dev_rxixz,nbl-nbl,15 +lre22_dev_rxmft,zul-zul,7 +lre22_dev_ryknh,ara-ayl,5 +lre22_dev_rytyf,zul-zul,12 +lre22_dev_rywss,tso-tso,1 +lre22_dev_rzjrd,nbl-nbl,7 +lre22_dev_rzpyx,tso-tso,2 +lre22_dev_satbk,ven-ven,7 +lre22_dev_sbfhc,fra-ntf,6 +lre22_dev_sboxi,xho-xho,15 +lre22_dev_scxxn,eng-iaf,5 +lre22_dev_scyvp,ara-aeb,6 +lre22_dev_sdbou,tir-tir,10 +lre22_dev_sddua,tir-tir,11 +lre22_dev_seasj,afr-afr,7 +lre22_dev_sevcw,tir-tir,12 +lre22_dev_sfevx,tso-tso,4 +lre22_dev_sfqgm,fra-ntf,1 +lre22_dev_sgaza,ara-aeb,8 +lre22_dev_sgkrh,afr-afr,9 +lre22_dev_sgmjh,nbl-nbl,14 +lre22_dev_shafn,ven-ven,8 +lre22_dev_shaob,orm-orm,10 +lre22_dev_shnns,afr-afr,6 +lre22_dev_siprc,ven-ven,7 +lre22_dev_sisge,afr-afr,13 +lre22_dev_siuwu,ara-arq,10 +lre22_dev_sivik,fra-ntf,2 +lre22_dev_sjyoo,afr-afr,1 +lre22_dev_skacz,fra-ntf,13 +lre22_dev_skcai,orm-orm,12 +lre22_dev_skctw,nbl-nbl,0 +lre22_dev_skygk,afr-afr,13 +lre22_dev_slraf,ara-aeb,6 +lre22_dev_slrzl,eng-ens,11 +lre22_dev_sltzh,xho-xho,6 +lre22_dev_sluki,ven-ven,1 +lre22_dev_slyez,tso-tso,8 +lre22_dev_slzuh,xho-xho,15 +lre22_dev_smdsm,nbl-nbl,7 
+lre22_dev_smhae,ara-ayl,3 +lre22_dev_smxhe,ara-aeb,10 +lre22_dev_snayr,afr-afr,2 +lre22_dev_snbxs,eng-ens,8 +lre22_dev_sngol,tso-tso,9 +lre22_dev_snhun,fra-ntf,13 +lre22_dev_snkib,ven-ven,8 +lre22_dev_snqld,eng-iaf,2 +lre22_dev_sntvb,eng-ens,11 +lre22_dev_snzbl,tir-tir,12 +lre22_dev_sobid,afr-afr,3 +lre22_dev_soknx,orm-orm,15 +lre22_dev_spesw,ven-ven,13 +lre22_dev_sphuq,eng-iaf,12 +lre22_dev_spqcy,xho-xho,11 +lre22_dev_sqcyu,zul-zul,9 +lre22_dev_sqdkr,eng-iaf,13 +lre22_dev_sqfnt,ara-aeb,9 +lre22_dev_sqhrr,eng-ens,11 +lre22_dev_sqyiu,ara-ayl,4 +lre22_dev_srbwp,ara-aeb,10 +lre22_dev_srokn,afr-afr,6 +lre22_dev_srzck,ara-ayl,3 +lre22_dev_ssbei,tso-tso,10 +lre22_dev_ssfmz,eng-iaf,12 +lre22_dev_ssmgk,xho-xho,10 +lre22_dev_ssmsy,xho-xho,4 +lre22_dev_stgcb,afr-afr,10 +lre22_dev_stihb,afr-afr,0 +lre22_dev_stkav,ara-aeb,9 +lre22_dev_stkrw,xho-xho,3 +lre22_dev_sttnk,fra-ntf,8 +lre22_dev_stwkk,eng-iaf,12 +lre22_dev_stwrt,nbl-nbl,1 +lre22_dev_subio,afr-afr,1 +lre22_dev_sumjk,ara-arq,6 +lre22_dev_suocb,nbl-nbl,6 +lre22_dev_svcbx,tso-tso,9 +lre22_dev_svllg,fra-ntf,14 +lre22_dev_svvqs,afr-afr,3 +lre22_dev_svxyz,ara-ayl,1 +lre22_dev_swhlf,ara-aeb,10 +lre22_dev_swhnk,fra-ntf,12 +lre22_dev_swnrg,ven-ven,12 +lre22_dev_swofz,zul-zul,4 +lre22_dev_swuls,tso-tso,8 +lre22_dev_sxfkn,ara-aeb,2 +lre22_dev_sycoz,tir-tir,10 +lre22_dev_syoek,fra-ntf,5 +lre22_dev_sypnb,ven-ven,13 +lre22_dev_syvrt,eng-iaf,8 +lre22_dev_szmoc,ven-ven,6 +lre22_dev_szmwp,eng-ens,8 +lre22_dev_talec,ven-ven,11 +lre22_dev_tasfs,ven-ven,7 +lre22_dev_tbbrr,xho-xho,5 +lre22_dev_tbcun,ara-aeb,3 +lre22_dev_tbhnw,nbl-nbl,15 +lre22_dev_tblhf,ven-ven,12 +lre22_dev_tbozq,xho-xho,1 +lre22_dev_tcckd,ara-ayl,3 +lre22_dev_tcele,tso-tso,11 +lre22_dev_tciob,tso-tso,10 +lre22_dev_tcpxj,tir-tir,9 +lre22_dev_tdejo,tir-tir,6 +lre22_dev_tdfqo,tso-tso,0 +lre22_dev_tdhhf,zul-zul,10 +lre22_dev_tdjje,ven-ven,10 +lre22_dev_tdkrp,orm-orm,6 +lre22_dev_tebop,tso-tso,10 +lre22_dev_teeqm,ven-ven,6 +lre22_dev_tejsn,tir-tir,12 +lre22_dev_teptc,ara-arq,10 +lre22_dev_tetmt,orm-orm,9 +lre22_dev_tfkij,ara-aeb,2 +lre22_dev_tfnin,tir-tir,3 +lre22_dev_tfyqz,tir-tir,3 +lre22_dev_tgbui,ara-aeb,5 +lre22_dev_tgixi,xho-xho,13 +lre22_dev_tgmud,eng-iaf,6 +lre22_dev_tgult,eng-ens,2 +lre22_dev_thcjv,tso-tso,5 +lre22_dev_thzir,eng-ens,11 +lre22_dev_tisfm,fra-ntf,9 +lre22_dev_tixou,xho-xho,2 +lre22_dev_tiyuw,afr-afr,5 +lre22_dev_tjdcc,afr-afr,13 +lre22_dev_tjikt,zul-zul,12 +lre22_dev_tjpdw,ara-arq,8 +lre22_dev_tkadi,ven-ven,12 +lre22_dev_tkcbm,afr-afr,6 +lre22_dev_tkgfw,eng-ens,11 +lre22_dev_tkiks,ara-aeb,6 +lre22_dev_tlgzi,xho-xho,1 +lre22_dev_tlhlw,tir-tir,6 +lre22_dev_tloqn,afr-afr,6 +lre22_dev_tmcje,eng-ens,4 +lre22_dev_tmjpw,eng-iaf,2 +lre22_dev_tmxtu,ven-ven,2 +lre22_dev_tngwh,tir-tir,8 +lre22_dev_tnqdv,ara-aeb,9 +lre22_dev_tnqro,xho-xho,15 +lre22_dev_tnqzy,orm-orm,7 +lre22_dev_tnskm,xho-xho,12 +lre22_dev_tnvhc,ven-ven,12 +lre22_dev_tofhy,zul-zul,6 +lre22_dev_tohkd,zul-zul,9 +lre22_dev_tonqb,ven-ven,6 +lre22_dev_tpbib,tso-tso,1 +lre22_dev_tpejq,ara-arq,3 +lre22_dev_tpfir,eng-ens,11 +lre22_dev_tphgn,zul-zul,12 +lre22_dev_tpidd,ara-arq,6 +lre22_dev_tpkce,eng-ens,11 +lre22_dev_tpszi,orm-orm,15 +lre22_dev_tpwcn,eng-iaf,6 +lre22_dev_trdfy,ara-ayl,3 +lre22_dev_tsbms,ara-ayl,4 +lre22_dev_tslui,tso-tso,6 +lre22_dev_tsvvy,zul-zul,10 +lre22_dev_tsyey,xho-xho,10 +lre22_dev_ttlco,eng-iaf,12 +lre22_dev_tubpr,orm-orm,13 +lre22_dev_tugpl,eng-ens,9 +lre22_dev_tuoiq,tir-tir,4 +lre22_dev_tuxfx,zul-zul,3 +lre22_dev_tvahj,tir-tir,9 +lre22_dev_tvewc,eng-iaf,3 +lre22_dev_tvfvc,ara-ayl,8 
+lre22_dev_tvkod,xho-xho,5 +lre22_dev_tvkwe,zul-zul,9 +lre22_dev_tvopo,xho-xho,12 +lre22_dev_tvqui,eng-ens,7 +lre22_dev_tvsbw,ara-arq,6 +lre22_dev_tvxvk,ven-ven,8 +lre22_dev_twbkf,nbl-nbl,9 +lre22_dev_twfot,ara-arq,6 +lre22_dev_twkns,ara-ayl,4 +lre22_dev_twuvf,eng-ens,10 +lre22_dev_txahv,eng-ens,8 +lre22_dev_txcob,ara-aeb,6 +lre22_dev_txnvi,zul-zul,3 +lre22_dev_txurh,afr-afr,7 +lre22_dev_txzkl,ara-arq,5 +lre22_dev_tyfad,tso-tso,7 +lre22_dev_tyhwp,ara-aeb,8 +lre22_dev_tzism,tir-tir,12 +lre22_dev_tzsfj,tir-tir,12 +lre22_dev_tzwof,eng-iaf,9 +lre22_dev_uahzm,afr-afr,5 +lre22_dev_uajwt,tso-tso,7 +lre22_dev_uanlr,zul-zul,13 +lre22_dev_uaoju,zul-zul,8 +lre22_dev_uaryk,xho-xho,15 +lre22_dev_ubfaf,ven-ven,12 +lre22_dev_ucbje,ara-aeb,8 +lre22_dev_ucrpa,ara-arq,3 +lre22_dev_udtzx,eng-iaf,7 +lre22_dev_uduja,fra-ntf,6 +lre22_dev_udxpl,tso-tso,2 +lre22_dev_uesmx,eng-iaf,5 +lre22_dev_ufewk,eng-iaf,8 +lre22_dev_ugjxy,tir-tir,4 +lre22_dev_ugsxl,eng-ens,3 +lre22_dev_ugvov,tso-tso,8 +lre22_dev_uhmdw,tso-tso,10 +lre22_dev_uhqng,nbl-nbl,12 +lre22_dev_uhymw,tir-tir,8 +lre22_dev_uhzmr,eng-ens,2 +lre22_dev_uimtg,ara-ayl,4 +lre22_dev_uirdr,nbl-nbl,13 +lre22_dev_uiszj,ara-aeb,8 +lre22_dev_ujada,ara-ayl,9 +lre22_dev_ujmqw,ven-ven,4 +lre22_dev_ujswr,afr-afr,11 +lre22_dev_ujvve,xho-xho,10 +lre22_dev_ukfha,ara-ayl,6 +lre22_dev_ukkpr,eng-ens,10 +lre22_dev_ukpdg,fra-ntf,13 +lre22_dev_ukpoy,nbl-nbl,15 +lre22_dev_uktod,ara-ayl,4 +lre22_dev_uktvh,zul-zul,13 +lre22_dev_ukuwo,ara-ayl,5 +lre22_dev_ukynv,zul-zul,12 +lre22_dev_ulepv,ara-ayl,5 +lre22_dev_ulgtj,zul-zul,7 +lre22_dev_ulofk,eng-iaf,11 +lre22_dev_uluog,ara-arq,3 +lre22_dev_umbpy,zul-zul,13 +lre22_dev_umjzo,tso-tso,5 +lre22_dev_uncdb,ara-arq,9 +lre22_dev_unffr,ara-ayl,8 +lre22_dev_unpif,eng-ens,9 +lre22_dev_uoikj,eng-iaf,13 +lre22_dev_uopfp,nbl-nbl,7 +lre22_dev_upenl,eng-iaf,13 +lre22_dev_uphuw,xho-xho,11 +lre22_dev_upkbw,ara-ayl,4 +lre22_dev_uplen,xho-xho,9 +lre22_dev_upqod,orm-orm,6 +lre22_dev_upspe,afr-afr,12 +lre22_dev_uqnkk,tir-tir,12 +lre22_dev_uqvxc,eng-ens,0 +lre22_dev_urgqx,ara-ayl,8 +lre22_dev_urkgk,tir-tir,12 +lre22_dev_uscky,xho-xho,3 +lre22_dev_usiwx,tir-tir,9 +lre22_dev_usnzj,zul-zul,5 +lre22_dev_usopt,xho-xho,8 +lre22_dev_uswgv,nbl-nbl,11 +lre22_dev_uszcb,ara-arq,4 +lre22_dev_utahf,ara-ayl,7 +lre22_dev_utaxq,tso-tso,9 +lre22_dev_utcwb,afr-afr,10 +lre22_dev_uuhry,tir-tir,9 +lre22_dev_uuprr,eng-ens,7 +lre22_dev_uuvqh,zul-zul,2 +lre22_dev_uwcmh,orm-orm,4 +lre22_dev_uwiev,zul-zul,13 +lre22_dev_uwjzb,ven-ven,10 +lre22_dev_uwony,orm-orm,1 +lre22_dev_uwqeq,orm-orm,2 +lre22_dev_uwvfl,nbl-nbl,5 +lre22_dev_uxdjn,xho-xho,12 +lre22_dev_uxqte,zul-zul,13 +lre22_dev_uxryh,ven-ven,11 +lre22_dev_uyhzp,orm-orm,15 +lre22_dev_uyrjl,tso-tso,10 +lre22_dev_uyzcl,eng-ens,11 +lre22_dev_uzbqz,fra-ntf,4 +lre22_dev_uzoxq,ara-aeb,9 +lre22_dev_vabxl,nbl-nbl,11 +lre22_dev_vafyo,nbl-nbl,15 +lre22_dev_vascl,nbl-nbl,0 +lre22_dev_vauqx,ara-arq,10 +lre22_dev_vbscm,xho-xho,3 +lre22_dev_vbulh,xho-xho,12 +lre22_dev_vbwwp,xho-xho,15 +lre22_dev_vbznk,ara-arq,6 +lre22_dev_vcibu,nbl-nbl,9 +lre22_dev_vcjun,zul-zul,12 +lre22_dev_vckxt,xho-xho,7 +lre22_dev_vdkjy,fra-ntf,14 +lre22_dev_vdmyt,ara-ayl,0 +lre22_dev_vdoif,ven-ven,13 +lre22_dev_vdvjv,orm-orm,12 +lre22_dev_vebet,ara-aeb,1 +lre22_dev_velkr,ara-aeb,1 +lre22_dev_vgbmm,tir-tir,9 +lre22_dev_vgucw,nbl-nbl,7 +lre22_dev_vhiyb,afr-afr,9 +lre22_dev_vhoej,tir-tir,5 +lre22_dev_vhryd,orm-orm,13 +lre22_dev_vhzdh,tso-tso,10 +lre22_dev_viapx,tso-tso,3 +lre22_dev_vifdj,ara-ayl,4 +lre22_dev_vijbo,zul-zul,12 +lre22_dev_virnr,eng-ens,6 
+lre22_dev_vjhbd,orm-orm,6 +lre22_dev_vjoca,ara-aeb,10 +lre22_dev_vjtou,eng-ens,5 +lre22_dev_vjxpv,ara-aeb,10 +lre22_dev_vkmab,fra-ntf,2 +lre22_dev_vkrvz,tir-tir,8 +lre22_dev_vkwwf,tso-tso,9 +lre22_dev_vlbdk,zul-zul,6 +lre22_dev_vliie,orm-orm,9 +lre22_dev_vlrve,eng-iaf,2 +lre22_dev_vmaet,tir-tir,3 +lre22_dev_vmdhi,eng-ens,10 +lre22_dev_vmdjw,nbl-nbl,13 +lre22_dev_vmjut,fra-ntf,9 +lre22_dev_vmrrg,eng-ens,3 +lre22_dev_vnjxn,nbl-nbl,7 +lre22_dev_vnmxm,ven-ven,12 +lre22_dev_vnykj,zul-zul,10 +lre22_dev_vovab,zul-zul,11 +lre22_dev_vovvl,zul-zul,11 +lre22_dev_vpcey,tir-tir,6 +lre22_dev_vpodd,nbl-nbl,11 +lre22_dev_vptke,eng-ens,4 +lre22_dev_vpulr,xho-xho,15 +lre22_dev_vpuve,tir-tir,8 +lre22_dev_vqttr,eng-iaf,12 +lre22_dev_vqzae,eng-iaf,11 +lre22_dev_vrnsg,tso-tso,8 +lre22_dev_vshpc,ara-aeb,6 +lre22_dev_vslbh,ara-arq,9 +lre22_dev_vsmaz,tir-tir,5 +lre22_dev_vsnez,tso-tso,8 +lre22_dev_vsnjp,fra-ntf,14 +lre22_dev_vsocn,ven-ven,7 +lre22_dev_vsvom,afr-afr,8 +lre22_dev_vtnfc,tir-tir,4 +lre22_dev_vtnlb,eng-ens,4 +lre22_dev_vubwb,eng-ens,8 +lre22_dev_vufsn,ara-aeb,3 +lre22_dev_vuiqu,tir-tir,8 +lre22_dev_vumeq,xho-xho,0 +lre22_dev_vupse,ven-ven,6 +lre22_dev_vvauz,xho-xho,14 +lre22_dev_vvfze,eng-ens,11 +lre22_dev_vviyr,zul-zul,12 +lre22_dev_vvwiq,fra-ntf,5 +lre22_dev_vwnkj,zul-zul,5 +lre22_dev_vwoww,orm-orm,7 +lre22_dev_vwtne,afr-afr,5 +lre22_dev_vwxgt,ara-arq,10 +lre22_dev_vxabl,eng-ens,8 +lre22_dev_vxnsl,afr-afr,7 +lre22_dev_vxslj,tir-tir,10 +lre22_dev_vxsvc,tir-tir,11 +lre22_dev_vxuiz,ara-aeb,10 +lre22_dev_vzarl,ara-ayl,7 +lre22_dev_vzeew,ven-ven,6 +lre22_dev_vzjtc,ara-arq,0 +lre22_dev_vzkdb,tso-tso,10 +lre22_dev_vzvpq,ara-arq,9 +lre22_dev_waqyh,xho-xho,15 +lre22_dev_wawwu,xho-xho,14 +lre22_dev_wbgqi,tso-tso,11 +lre22_dev_wcctp,eng-ens,10 +lre22_dev_wdcer,afr-afr,3 +lre22_dev_wdeor,fra-ntf,14 +lre22_dev_wdfdd,eng-iaf,2 +lre22_dev_wdkvb,eng-ens,11 +lre22_dev_wdogx,ara-aeb,7 +lre22_dev_wdqdq,ara-arq,10 +lre22_dev_wdxwu,tir-tir,5 +lre22_dev_weaek,ara-arq,4 +lre22_dev_wefui,tso-tso,10 +lre22_dev_wehjh,tir-tir,10 +lre22_dev_weypz,nbl-nbl,12 +lre22_dev_wffdy,zul-zul,12 +lre22_dev_wffgq,tso-tso,8 +lre22_dev_wfvlh,ven-ven,8 +lre22_dev_wgago,eng-ens,5 +lre22_dev_wglzd,afr-afr,11 +lre22_dev_wgsbu,afr-afr,5 +lre22_dev_whdhw,nbl-nbl,7 +lre22_dev_whogu,eng-iaf,13 +lre22_dev_whpee,tso-tso,9 +lre22_dev_whqpd,ara-aeb,9 +lre22_dev_wikrr,ven-ven,11 +lre22_dev_witju,fra-ntf,11 +lre22_dev_wjcme,orm-orm,10 +lre22_dev_wkare,ara-arq,2 +lre22_dev_wkbfe,afr-afr,9 +lre22_dev_wkecn,xho-xho,13 +lre22_dev_wkhxo,afr-afr,9 +lre22_dev_wlgae,ara-arq,6 +lre22_dev_wlnls,eng-iaf,7 +lre22_dev_wlsxb,eng-ens,1 +lre22_dev_wlwuc,nbl-nbl,8 +lre22_dev_wnaqr,nbl-nbl,9 +lre22_dev_wndpq,fra-ntf,13 +lre22_dev_wnkdc,ara-ayl,2 +lre22_dev_wnknc,nbl-nbl,9 +lre22_dev_wnppz,orm-orm,15 +lre22_dev_wpzgm,afr-afr,13 +lre22_dev_wqhqj,ara-ayl,9 +lre22_dev_wqreb,afr-afr,11 +lre22_dev_wqrez,eng-ens,4 +lre22_dev_wqtsf,ara-arq,8 +lre22_dev_wqwtc,orm-orm,3 +lre22_dev_wrfwf,ven-ven,7 +lre22_dev_wrqqt,orm-orm,15 +lre22_dev_wrutf,afr-afr,7 +lre22_dev_wrvzk,nbl-nbl,1 +lre22_dev_wrxly,fra-ntf,13 +lre22_dev_wsbiw,ara-aeb,8 +lre22_dev_wshay,zul-zul,8 +lre22_dev_wsous,tso-tso,5 +lre22_dev_wszpj,ven-ven,7 +lre22_dev_wtksi,afr-afr,8 +lre22_dev_wugbw,xho-xho,6 +lre22_dev_wujfv,afr-afr,11 +lre22_dev_wuwek,xho-xho,12 +lre22_dev_wvhhk,fra-ntf,2 +lre22_dev_wvosz,nbl-nbl,3 +lre22_dev_wwagu,xho-xho,14 +lre22_dev_wwbuj,eng-iaf,2 +lre22_dev_wwgnr,afr-afr,10 +lre22_dev_wwjev,afr-afr,12 +lre22_dev_wwmsu,ara-arq,4 +lre22_dev_wwrmy,ven-ven,7 +lre22_dev_wwvhd,ara-arq,9 
+lre22_dev_wxdjv,ara-ayl,6 +lre22_dev_wygox,tir-tir,6 +lre22_dev_wyhuq,zul-zul,13 +lre22_dev_wzoir,xho-xho,15 +lre22_dev_wzvwa,orm-orm,6 +lre22_dev_xapvn,tso-tso,8 +lre22_dev_xarkl,eng-ens,5 +lre22_dev_xavhh,nbl-nbl,10 +lre22_dev_xazuy,orm-orm,3 +lre22_dev_xbnft,eng-iaf,0 +lre22_dev_xbqbc,fra-ntf,7 +lre22_dev_xbzfw,tir-tir,11 +lre22_dev_xccde,ara-arq,3 +lre22_dev_xcdty,zul-zul,8 +lre22_dev_xcjkb,ara-ayl,7 +lre22_dev_xcmty,ara-arq,10 +lre22_dev_xcsbc,tso-tso,1 +lre22_dev_xdkjb,nbl-nbl,11 +lre22_dev_xdknq,nbl-nbl,11 +lre22_dev_xdoik,eng-ens,10 +lre22_dev_xdtyd,nbl-nbl,4 +lre22_dev_xearl,eng-iaf,3 +lre22_dev_xedqa,nbl-nbl,11 +lre22_dev_xefnx,eng-ens,11 +lre22_dev_xeipr,tir-tir,11 +lre22_dev_xekhs,zul-zul,9 +lre22_dev_xelzr,ara-aeb,9 +lre22_dev_xenhb,ara-aeb,3 +lre22_dev_xfdsx,xho-xho,12 +lre22_dev_xfggl,xho-xho,9 +lre22_dev_xgspz,eng-iaf,13 +lre22_dev_xgwmu,tso-tso,8 +lre22_dev_xhbmk,orm-orm,15 +lre22_dev_xhdtl,orm-orm,3 +lre22_dev_xisjn,ara-arq,8 +lre22_dev_xitdz,nbl-nbl,10 +lre22_dev_xizbg,xho-xho,14 +lre22_dev_xjcph,xho-xho,10 +lre22_dev_xjcvd,zul-zul,7 +lre22_dev_xjlgm,ara-aeb,3 +lre22_dev_xjxzy,eng-ens,2 +lre22_dev_xkfsd,ven-ven,12 +lre22_dev_xkktj,eng-iaf,12 +lre22_dev_xkmmy,ara-aeb,10 +lre22_dev_xltgz,ara-ayl,5 +lre22_dev_xmbby,orm-orm,3 +lre22_dev_xmcmv,xho-xho,14 +lre22_dev_xngam,fra-ntf,14 +lre22_dev_xnsev,ara-ayl,8 +lre22_dev_xnwsq,ara-arq,8 +lre22_dev_xnwwh,zul-zul,13 +lre22_dev_xobeh,tir-tir,11 +lre22_dev_xolau,ven-ven,13 +lre22_dev_xoqtn,eng-iaf,10 +lre22_dev_xovpd,eng-iaf,10 +lre22_dev_xpaff,eng-ens,9 +lre22_dev_xpahm,ara-arq,4 +lre22_dev_xpcrs,tso-tso,5 +lre22_dev_xpdsg,eng-iaf,5 +lre22_dev_xpjqj,nbl-nbl,6 +lre22_dev_xqwtk,ara-arq,10 +lre22_dev_xrfge,ara-arq,8 +lre22_dev_xrhka,orm-orm,9 +lre22_dev_xrpup,zul-zul,8 +lre22_dev_xsbff,ara-aeb,9 +lre22_dev_xsffv,tso-tso,1 +lre22_dev_xstnu,eng-ens,5 +lre22_dev_xthfd,ara-aeb,8 +lre22_dev_xthzz,ven-ven,4 +lre22_dev_xtmgg,eng-iaf,13 +lre22_dev_xtyic,nbl-nbl,14 +lre22_dev_xucyl,eng-ens,7 +lre22_dev_xudii,ara-ayl,3 +lre22_dev_xugux,afr-afr,0 +lre22_dev_xuqnj,ara-ayl,4 +lre22_dev_xvaoh,nbl-nbl,9 +lre22_dev_xvclh,afr-afr,9 +lre22_dev_xveae,xho-xho,4 +lre22_dev_xxpqz,ara-arq,9 +lre22_dev_xxqad,tso-tso,10 +lre22_dev_xybed,tir-tir,9 +lre22_dev_xyrex,eng-ens,11 +lre22_dev_xzlas,eng-iaf,9 +lre22_dev_xztyr,orm-orm,9 +lre22_dev_yaxkb,zul-zul,12 +lre22_dev_ybcvu,xho-xho,13 +lre22_dev_ybjon,orm-orm,2 +lre22_dev_ybubm,ven-ven,5 +lre22_dev_ycarc,eng-ens,6 +lre22_dev_ychjj,orm-orm,2 +lre22_dev_ycnyc,tir-tir,7 +lre22_dev_ycsvt,afr-afr,12 +lre22_dev_ydaxa,nbl-nbl,8 +lre22_dev_ydrxu,nbl-nbl,1 +lre22_dev_yeekw,fra-ntf,13 +lre22_dev_yevan,tir-tir,11 +lre22_dev_yfaan,tir-tir,10 +lre22_dev_yfayx,afr-afr,6 +lre22_dev_yfpsd,fra-ntf,1 +lre22_dev_yfxkm,ven-ven,7 +lre22_dev_yguqk,ven-ven,3 +lre22_dev_yhrgj,afr-afr,8 +lre22_dev_yhzyq,ara-ayl,5 +lre22_dev_yiqui,eng-iaf,12 +lre22_dev_yjens,ara-ayl,7 +lre22_dev_yjkxx,eng-ens,8 +lre22_dev_yjypk,ara-ayl,9 +lre22_dev_ykchd,ven-ven,8 +lre22_dev_ykktl,xho-xho,0 +lre22_dev_ylhwh,orm-orm,9 +lre22_dev_ylnms,tso-tso,2 +lre22_dev_ylsdz,ven-ven,7 +lre22_dev_ymcmp,eng-iaf,8 +lre22_dev_ymfzx,tso-tso,7 +lre22_dev_ymizm,fra-ntf,0 +lre22_dev_ympvj,tir-tir,9 +lre22_dev_ymslh,tir-tir,12 +lre22_dev_ynavg,zul-zul,9 +lre22_dev_ynhlk,tir-tir,9 +lre22_dev_ynnkb,eng-ens,10 +lre22_dev_yogkc,fra-ntf,7 +lre22_dev_yokld,eng-ens,4 +lre22_dev_yokve,tir-tir,6 +lre22_dev_yomdz,ara-ayl,6 +lre22_dev_yomuu,xho-xho,12 +lre22_dev_yoobm,ara-ayl,8 +lre22_dev_yoocz,eng-ens,10 +lre22_dev_yopyf,eng-iaf,5 +lre22_dev_yoxoc,tir-tir,8 +lre22_dev_ypaem,afr-afr,5 
+lre22_dev_ypamp,afr-afr,7 +lre22_dev_ypjpq,tir-tir,8 +lre22_dev_yplba,ara-arq,9 +lre22_dev_ypnrh,fra-ntf,1 +lre22_dev_ypqfg,eng-ens,7 +lre22_dev_yrdsl,eng-ens,2 +lre22_dev_yrtkv,afr-afr,7 +lre22_dev_yrwrb,nbl-nbl,9 +lre22_dev_ysmlk,eng-ens,11 +lre22_dev_yspja,orm-orm,5 +lre22_dev_ytfnn,fra-ntf,14 +lre22_dev_yturp,ara-aeb,6 +lre22_dev_ytvbd,afr-afr,4 +lre22_dev_yuhvo,tso-tso,8 +lre22_dev_yundi,ara-arq,3 +lre22_dev_yvmnx,ara-arq,10 +lre22_dev_yvqud,xho-xho,15 +lre22_dev_yvxdd,ara-ayl,4 +lre22_dev_ywjtq,xho-xho,5 +lre22_dev_ywnza,fra-ntf,12 +lre22_dev_yxnno,tso-tso,10 +lre22_dev_yxoww,tir-tir,7 +lre22_dev_yxpgi,ara-arq,5 +lre22_dev_yxsta,eng-ens,7 +lre22_dev_yyltz,xho-xho,8 +lre22_dev_yyqqx,fra-ntf,12 +lre22_dev_yzloh,ara-ayl,7 +lre22_dev_zacdy,ara-ayl,3 +lre22_dev_zadkk,tir-tir,9 +lre22_dev_zalpc,afr-afr,6 +lre22_dev_zarod,orm-orm,8 +lre22_dev_zasvb,afr-afr,11 +lre22_dev_zazom,ara-arq,9 +lre22_dev_zbfqk,afr-afr,13 +lre22_dev_zbqew,tso-tso,2 +lre22_dev_zbrkn,eng-ens,7 +lre22_dev_zbubp,zul-zul,9 +lre22_dev_zbytc,ara-arq,8 +lre22_dev_zcfns,tir-tir,6 +lre22_dev_zcfzk,afr-afr,7 +lre22_dev_zcrgv,ara-arq,10 +lre22_dev_zdxdn,ara-ayl,7 +lre22_dev_zdydi,eng-ens,1 +lre22_dev_zebzq,ven-ven,4 +lre22_dev_zedlk,xho-xho,14 +lre22_dev_zeqpp,tir-tir,12 +lre22_dev_zfjbm,ara-arq,10 +lre22_dev_zfkne,nbl-nbl,13 +lre22_dev_zflnr,ven-ven,13 +lre22_dev_zfoyd,xho-xho,4 +lre22_dev_zgdyu,eng-iaf,8 +lre22_dev_zgmja,zul-zul,9 +lre22_dev_zgvfs,ara-arq,6 +lre22_dev_zhmud,orm-orm,14 +lre22_dev_zhoml,tso-tso,9 +lre22_dev_zijcb,xho-xho,10 +lre22_dev_ziktm,ara-aeb,10 +lre22_dev_zipxy,ara-arq,9 +lre22_dev_ziqxc,eng-iaf,1 +lre22_dev_zjhir,ven-ven,7 +lre22_dev_zjmqp,orm-orm,13 +lre22_dev_zjrrk,tso-tso,11 +lre22_dev_zjtwd,ara-aeb,3 +lre22_dev_zkfcf,xho-xho,6 +lre22_dev_zkftc,nbl-nbl,4 +lre22_dev_zkqei,ara-ayl,7 +lre22_dev_zkwqo,zul-zul,11 +lre22_dev_zlamn,nbl-nbl,6 +lre22_dev_zlbor,xho-xho,14 +lre22_dev_zloet,ven-ven,8 +lre22_dev_zlvhk,zul-zul,5 +lre22_dev_zlzqv,fra-ntf,12 +lre22_dev_zmobq,ara-ayl,7 +lre22_dev_zmuiv,zul-zul,9 +lre22_dev_znvqw,zul-zul,4 +lre22_dev_znzuu,tir-tir,0 +lre22_dev_zoava,eng-iaf,6 +lre22_dev_zodvu,tso-tso,0 +lre22_dev_zosdw,nbl-nbl,15 +lre22_dev_zpnvq,xho-xho,6 +lre22_dev_zqeby,eng-iaf,12 +lre22_dev_zqgdd,nbl-nbl,9 +lre22_dev_zqhaw,nbl-nbl,5 +lre22_dev_zqkau,orm-orm,8 +lre22_dev_zqkel,ara-ayl,9 +lre22_dev_zqlnd,ara-aeb,8 +lre22_dev_zrnpw,orm-orm,8 +lre22_dev_zrqvc,afr-afr,9 +lre22_dev_zrrgq,ven-ven,8 +lre22_dev_zryit,zul-zul,8 +lre22_dev_zsckt,zul-zul,4 +lre22_dev_zucqq,orm-orm,4 +lre22_dev_zusln,orm-orm,11 +lre22_dev_zuxzw,tir-tir,0 +lre22_dev_zvabs,tir-tir,11 +lre22_dev_zvlid,tso-tso,11 +lre22_dev_zvned,eng-iaf,5 +lre22_dev_zvtwr,xho-xho,11 +lre22_dev_zwmim,orm-orm,11 +lre22_dev_zwnsu,ara-arq,8 +lre22_dev_zwtxn,ara-arq,10 +lre22_dev_zxfcm,orm-orm,3 +lre22_dev_zxsgm,tir-tir,5 +lre22_dev_zybya,eng-iaf,10 +lre22_dev_zygak,zul-zul,1 +lre22_dev_zylqc,eng-ens,3 +lre22_dev_zyppc,fra-ntf,8 +lre22_dev_zywem,eng-ens,8 +lre22_dev_zzapx,ara-ayl,5 +lre22_dev_zzumc,ara-arq,2 +lre22_dev_zzvdl,fra-ntf,5 +lre22_dev_zzvjv,nbl-nbl,14 diff --git a/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_0/train_segments.csv b/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_0/train_segments.csv new file mode 100644 index 00000000..4d50b6a5 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_0/train_segments.csv @@ -0,0 +1,2088 @@ +id,class_id,subclass_idx +lre22_dev_aayck,ara-aeb,12 +lre22_dev_aayto,eng-iaf,14 +lre22_dev_abaha,zul-zul,17 +lre22_dev_abetm,fra-ntf,15 +lre22_dev_abnwz,zul-zul,19 
+lre22_dev_abvjt,zul-zul,19 +lre22_dev_abwgm,ara-ayl,13 +lre22_dev_acepd,eng-iaf,19 +lre22_dev_acspt,eng-ens,12 +lre22_dev_aczdh,eng-ens,13 +lre22_dev_adkkm,tso-tso,19 +lre22_dev_adpus,tso-tso,13 +lre22_dev_adwju,ara-aeb,14 +lre22_dev_afnfn,afr-afr,20 +lre22_dev_afohq,ara-aeb,13 +lre22_dev_agnnp,afr-afr,17 +lre22_dev_agquw,fra-ntf,20 +lre22_dev_ahoow,ara-ayl,15 +lre22_dev_ahqxq,fra-ntf,22 +lre22_dev_aieqr,eng-iaf,17 +lre22_dev_ainix,eng-iaf,16 +lre22_dev_aiojl,fra-ntf,18 +lre22_dev_aiypg,nbl-nbl,17 +lre22_dev_ajcpi,orm-orm,22 +lre22_dev_ajeqv,ara-aeb,11 +lre22_dev_ajlqy,xho-xho,16 +lre22_dev_ajlyw,orm-orm,21 +lre22_dev_ajmrs,ara-aeb,11 +lre22_dev_ajzjc,eng-iaf,16 +lre22_dev_ajzyq,ara-ayl,14 +lre22_dev_akmfp,orm-orm,19 +lre22_dev_aleeu,ara-arq,14 +lre22_dev_aliba,ara-aeb,15 +lre22_dev_alkwi,eng-iaf,14 +lre22_dev_aluwk,nbl-nbl,16 +lre22_dev_alvdl,ara-arq,14 +lre22_dev_amrca,ara-aeb,11 +lre22_dev_aoanh,ara-ayl,15 +lre22_dev_aoeql,eng-ens,16 +lre22_dev_apfpk,eng-iaf,14 +lre22_dev_apufs,tir-tir,17 +lre22_dev_apvko,orm-orm,20 +lre22_dev_arefe,orm-orm,23 +lre22_dev_arvyp,ara-ayl,11 +lre22_dev_arwsc,fra-ntf,20 +lre22_dev_asqwa,ara-aeb,14 +lre22_dev_asrng,fra-ntf,18 +lre22_dev_aswjo,afr-afr,18 +lre22_dev_aulzk,ven-ven,21 +lre22_dev_aupcr,zul-zul,18 +lre22_dev_auqcy,eng-ens,18 +lre22_dev_auxdy,nbl-nbl,16 +lre22_dev_auycg,ara-ayl,11 +lre22_dev_aviiv,tso-tso,14 +lre22_dev_avrwo,tso-tso,19 +lre22_dev_avwim,ara-arq,13 +lre22_dev_avzdv,zul-zul,18 +lre22_dev_awtna,ara-arq,13 +lre22_dev_awxbj,orm-orm,23 +lre22_dev_axejc,fra-ntf,17 +lre22_dev_axtso,eng-ens,16 +lre22_dev_axwoo,ara-aeb,15 +lre22_dev_axyma,ara-arq,15 +lre22_dev_aycai,ven-ven,17 +lre22_dev_ayfjz,orm-orm,20 +lre22_dev_aylrz,eng-iaf,16 +lre22_dev_aynwz,tso-tso,18 +lre22_dev_aypyt,ara-aeb,11 +lre22_dev_ayszn,zul-zul,18 +lre22_dev_ayvge,ara-aeb,11 +lre22_dev_ayvmo,afr-afr,23 +lre22_dev_ayzdz,xho-xho,20 +lre22_dev_azbmt,xho-xho,19 +lre22_dev_azjsr,tir-tir,19 +lre22_dev_azkdh,nbl-nbl,20 +lre22_dev_azwrd,fra-ntf,15 +lre22_dev_badwe,ara-aeb,13 +lre22_dev_baiaf,zul-zul,17 +lre22_dev_baiwb,ara-aeb,13 +lre22_dev_baxuo,zul-zul,18 +lre22_dev_bbbtf,eng-ens,18 +lre22_dev_bbdws,ara-ayl,12 +lre22_dev_bbitq,eng-ens,16 +lre22_dev_bbnvu,ara-arq,13 +lre22_dev_bbunq,eng-iaf,14 +lre22_dev_bcinm,ara-aeb,14 +lre22_dev_bcrhs,zul-zul,17 +lre22_dev_bcwpu,ara-aeb,13 +lre22_dev_bcxdq,fra-ntf,21 +lre22_dev_bdgbr,ara-aeb,12 +lre22_dev_bdgrw,orm-orm,17 +lre22_dev_bdiml,ara-aeb,11 +lre22_dev_bdyue,xho-xho,21 +lre22_dev_bdzsj,tir-tir,13 +lre22_dev_beanp,tso-tso,12 +lre22_dev_beigo,ara-aeb,14 +lre22_dev_belhi,orm-orm,23 +lre22_dev_bfoej,ven-ven,20 +lre22_dev_bfznf,ara-ayl,11 +lre22_dev_bgeiq,ven-ven,15 +lre22_dev_bgeyp,ara-aeb,11 +lre22_dev_bgomt,afr-afr,14 +lre22_dev_bgrfd,nbl-nbl,19 +lre22_dev_bgwlu,tir-tir,17 +lre22_dev_bifkp,nbl-nbl,18 +lre22_dev_bipvh,nbl-nbl,17 +lre22_dev_biuyu,eng-ens,12 +lre22_dev_bixnf,ara-ayl,11 +lre22_dev_bjhdf,tso-tso,17 +lre22_dev_bjsmm,ara-ayl,10 +lre22_dev_bkhqg,eng-ens,17 +lre22_dev_bkpah,ven-ven,14 +lre22_dev_blaco,afr-afr,17 +lre22_dev_bleum,xho-xho,18 +lre22_dev_bnhvt,nbl-nbl,16 +lre22_dev_bowyn,ara-arq,14 +lre22_dev_bpeqb,xho-xho,21 +lre22_dev_bpgqs,tir-tir,13 +lre22_dev_bpzpv,afr-afr,16 +lre22_dev_bqenu,eng-ens,12 +lre22_dev_bqfxw,zul-zul,14 +lre22_dev_bqowg,tir-tir,19 +lre22_dev_bqxyq,tir-tir,19 +lre22_dev_brjud,xho-xho,21 +lre22_dev_bruwl,xho-xho,16 +lre22_dev_brzld,fra-ntf,20 +lre22_dev_bsgqz,eng-ens,13 +lre22_dev_bsocl,eng-ens,12 +lre22_dev_bszou,ara-arq,13 +lre22_dev_btapz,zul-zul,15 +lre22_dev_btjlk,ara-aeb,14 
+lre22_dev_btkry,xho-xho,19 +lre22_dev_btyeu,ara-ayl,15 +lre22_dev_bvnuu,fra-ntf,19 +lre22_dev_bvqag,eng-iaf,20 +lre22_dev_bvvho,eng-ens,16 +lre22_dev_bvwaj,tir-tir,14 +lre22_dev_bvymi,eng-ens,15 +lre22_dev_bwgmj,eng-iaf,20 +lre22_dev_bwqpz,ara-arq,14 +lre22_dev_bwyrh,ara-aeb,12 +lre22_dev_bxkrj,ven-ven,18 +lre22_dev_bxkti,afr-afr,20 +lre22_dev_bxzms,nbl-nbl,17 +lre22_dev_bygrw,tso-tso,18 +lre22_dev_byjqr,ven-ven,18 +lre22_dev_bylkl,eng-iaf,16 +lre22_dev_bzmkn,fra-ntf,22 +lre22_dev_bzntz,ara-arq,13 +lre22_dev_bzwkf,eng-iaf,19 +lre22_dev_caijh,ven-ven,18 +lre22_dev_canou,tir-tir,19 +lre22_dev_caqxh,afr-afr,20 +lre22_dev_cayuc,eng-ens,12 +lre22_dev_cbruy,xho-xho,23 +lre22_dev_cbyyw,ara-arq,14 +lre22_dev_cbzbe,afr-afr,22 +lre22_dev_cclfh,ara-arq,15 +lre22_dev_ccovd,ara-arq,11 +lre22_dev_ccpns,eng-ens,17 +lre22_dev_ccsjt,eng-iaf,16 +lre22_dev_ccsql,fra-ntf,21 +lre22_dev_ccugm,eng-ens,18 +lre22_dev_ccyfn,afr-afr,23 +lre22_dev_cdmgw,tir-tir,16 +lre22_dev_cdshg,eng-iaf,17 +lre22_dev_ceccy,orm-orm,20 +lre22_dev_cecwt,fra-ntf,22 +lre22_dev_cegvk,ara-arq,11 +lre22_dev_cferi,zul-zul,15 +lre22_dev_cfojx,ara-arq,11 +lre22_dev_cfzoe,tir-tir,20 +lre22_dev_cgfna,zul-zul,18 +lre22_dev_cggzh,ara-ayl,13 +lre22_dev_cgims,tir-tir,20 +lre22_dev_cgixe,tir-tir,19 +lre22_dev_cgjov,zul-zul,14 +lre22_dev_chhio,ara-aeb,14 +lre22_dev_chnvd,tir-tir,13 +lre22_dev_chpww,nbl-nbl,21 +lre22_dev_churq,ara-ayl,13 +lre22_dev_cifqp,zul-zul,17 +lre22_dev_cijnx,xho-xho,22 +lre22_dev_ciozp,nbl-nbl,16 +lre22_dev_citpi,ara-aeb,12 +lre22_dev_cjrav,tir-tir,15 +lre22_dev_cksrw,ara-aeb,14 +lre22_dev_cktce,tir-tir,17 +lre22_dev_ckzhf,nbl-nbl,20 +lre22_dev_cleyn,ara-aeb,11 +lre22_dev_clhmt,fra-ntf,19 +lre22_dev_clrjd,orm-orm,21 +lre22_dev_clssx,eng-iaf,14 +lre22_dev_cluxm,ara-ayl,13 +lre22_dev_clzwe,ara-aeb,14 +lre22_dev_cminq,ara-aeb,11 +lre22_dev_cmmap,afr-afr,23 +lre22_dev_cmssr,orm-orm,20 +lre22_dev_cmufu,tso-tso,16 +lre22_dev_cnapz,orm-orm,19 +lre22_dev_cndba,tso-tso,12 +lre22_dev_cnkjh,tso-tso,15 +lre22_dev_cnvfe,orm-orm,18 +lre22_dev_cobbz,ara-arq,12 +lre22_dev_coppu,nbl-nbl,21 +lre22_dev_coqoj,eng-ens,17 +lre22_dev_cotun,ven-ven,16 +lre22_dev_cowrt,xho-xho,19 +lre22_dev_cppma,afr-afr,20 +lre22_dev_cpqkz,ara-arq,14 +lre22_dev_cpraw,afr-afr,17 +lre22_dev_cpsrb,fra-ntf,20 +lre22_dev_cpuax,zul-zul,16 +lre22_dev_cpudb,nbl-nbl,16 +lre22_dev_cqqds,afr-afr,22 +lre22_dev_cquib,ven-ven,21 +lre22_dev_cqwxe,nbl-nbl,16 +lre22_dev_cqyad,eng-iaf,15 +lre22_dev_crkut,eng-ens,17 +lre22_dev_crozj,fra-ntf,17 +lre22_dev_crrro,orm-orm,16 +lre22_dev_csavn,ara-aeb,15 +lre22_dev_cschy,afr-afr,16 +lre22_dev_csegr,tso-tso,14 +lre22_dev_csgvq,fra-ntf,17 +lre22_dev_csltj,ara-aeb,14 +lre22_dev_csmtr,ara-ayl,14 +lre22_dev_csqxl,ven-ven,20 +lre22_dev_ctjqw,nbl-nbl,16 +lre22_dev_ctxxt,nbl-nbl,17 +lre22_dev_cuaoy,ara-aeb,13 +lre22_dev_cudpj,ara-arq,13 +lre22_dev_cuhdf,afr-afr,21 +lre22_dev_cuoju,ven-ven,21 +lre22_dev_cupti,nbl-nbl,21 +lre22_dev_cusej,ara-aeb,14 +lre22_dev_cvfle,tir-tir,14 +lre22_dev_cvnqu,eng-ens,14 +lre22_dev_cvvjc,zul-zul,18 +lre22_dev_cvwht,fra-ntf,18 +lre22_dev_cvwtu,fra-ntf,21 +lre22_dev_cwlvk,tso-tso,16 +lre22_dev_cwnky,xho-xho,17 +lre22_dev_cxdlr,afr-afr,14 +lre22_dev_cxfii,ara-arq,13 +lre22_dev_cxpzt,zul-zul,16 +lre22_dev_cxqri,fra-ntf,21 +lre22_dev_cyaug,xho-xho,22 +lre22_dev_czdbd,fra-ntf,15 +lre22_dev_czvoy,ven-ven,16 +lre22_dev_czzrm,afr-afr,17 +lre22_dev_dahzr,ven-ven,17 +lre22_dev_dapny,ven-ven,17 +lre22_dev_dapug,nbl-nbl,19 +lre22_dev_dcbnz,xho-xho,16 +lre22_dev_dciaf,nbl-nbl,22 +lre22_dev_dcljn,afr-afr,19 
+lre22_dev_dcmrn,afr-afr,20 +lre22_dev_dcobq,xho-xho,16 +lre22_dev_dcohp,tir-tir,16 +lre22_dev_dcsep,tso-tso,12 +lre22_dev_dctlw,ara-arq,12 +lre22_dev_dctvv,ara-arq,12 +lre22_dev_dcyoy,eng-iaf,17 +lre22_dev_ddgeb,xho-xho,23 +lre22_dev_ddsab,eng-ens,18 +lre22_dev_ddtpk,eng-ens,18 +lre22_dev_debjr,xho-xho,16 +lre22_dev_defkv,eng-ens,15 +lre22_dev_dejub,ara-arq,11 +lre22_dev_delok,eng-ens,14 +lre22_dev_dezlg,nbl-nbl,17 +lre22_dev_dffbj,fra-ntf,21 +lre22_dev_dfkox,xho-xho,19 +lre22_dev_dfpcn,ara-ayl,13 +lre22_dev_dfqgl,afr-afr,18 +lre22_dev_dfras,eng-iaf,19 +lre22_dev_dftpm,eng-iaf,20 +lre22_dev_dfvta,tso-tso,17 +lre22_dev_dgarp,eng-ens,13 +lre22_dev_dgntq,zul-zul,17 +lre22_dev_dgssb,tir-tir,19 +lre22_dev_dgvtc,xho-xho,23 +lre22_dev_dhdvp,ara-ayl,10 +lre22_dev_dhmbl,fra-ntf,22 +lre22_dev_diiry,orm-orm,16 +lre22_dev_disrs,afr-afr,16 +lre22_dev_ditsk,xho-xho,21 +lre22_dev_djbbz,ara-arq,14 +lre22_dev_djevu,tso-tso,16 +lre22_dev_djlaf,tir-tir,20 +lre22_dev_djoim,zul-zul,15 +lre22_dev_djvvp,zul-zul,17 +lre22_dev_djwyo,ven-ven,18 +lre22_dev_dkbfm,ara-ayl,12 +lre22_dev_dkpcy,ara-aeb,12 +lre22_dev_dlxzj,orm-orm,19 +lre22_dev_dmnjo,ven-ven,14 +lre22_dev_dmtsm,zul-zul,16 +lre22_dev_dnaql,orm-orm,23 +lre22_dev_dnkpf,ara-aeb,15 +lre22_dev_dnscr,tso-tso,12 +lre22_dev_dnygt,eng-ens,15 +lre22_dev_dobre,xho-xho,19 +lre22_dev_dohlp,xho-xho,23 +lre22_dev_doioo,orm-orm,19 +lre22_dev_donaq,ara-aeb,13 +lre22_dev_dooht,ara-arq,11 +lre22_dev_dpmbt,zul-zul,14 +lre22_dev_dptyy,xho-xho,17 +lre22_dev_dqmud,eng-iaf,15 +lre22_dev_dqmxb,xho-xho,20 +lre22_dev_dqopt,eng-ens,14 +lre22_dev_dqpgr,ara-aeb,14 +lre22_dev_drkux,eng-ens,14 +lre22_dev_dsfha,ven-ven,18 +lre22_dev_dsftc,tso-tso,16 +lre22_dev_dskaq,ven-ven,15 +lre22_dev_dtdmp,zul-zul,18 +lre22_dev_dtdux,afr-afr,14 +lre22_dev_dtyki,ara-arq,11 +lre22_dev_durlr,orm-orm,18 +lre22_dev_dutdz,tso-tso,12 +lre22_dev_dvbol,ara-ayl,15 +lre22_dev_dwesk,nbl-nbl,22 +lre22_dev_dwtjw,ven-ven,14 +lre22_dev_dxckb,tso-tso,12 +lre22_dev_dxizq,eng-iaf,14 +lre22_dev_dxtnq,fra-ntf,18 +lre22_dev_dxvib,zul-zul,14 +lre22_dev_dyago,eng-iaf,16 +lre22_dev_dyipl,eng-iaf,18 +lre22_dev_dyqlo,ara-arq,13 +lre22_dev_dyvml,eng-iaf,15 +lre22_dev_dzkui,tso-tso,12 +lre22_dev_dzqta,ven-ven,20 +lre22_dev_dzxio,eng-ens,18 +lre22_dev_eachn,tir-tir,16 +lre22_dev_eapvu,eng-iaf,20 +lre22_dev_ebfdv,ara-ayl,10 +lre22_dev_ebgbd,eng-ens,17 +lre22_dev_eblhy,eng-iaf,20 +lre22_dev_ebtrq,ara-aeb,13 +lre22_dev_ebymv,tir-tir,14 +lre22_dev_ebzhg,nbl-nbl,21 +lre22_dev_ecbwo,ven-ven,21 +lre22_dev_ecllm,fra-ntf,21 +lre22_dev_eclpf,ven-ven,16 +lre22_dev_ecmhd,ara-aeb,14 +lre22_dev_ecnqi,eng-ens,14 +lre22_dev_ecpdc,ara-ayl,10 +lre22_dev_ecslx,afr-afr,22 +lre22_dev_ecuyo,xho-xho,23 +lre22_dev_edgur,tso-tso,16 +lre22_dev_edjtb,nbl-nbl,22 +lre22_dev_edsls,tso-tso,16 +lre22_dev_edssc,orm-orm,23 +lre22_dev_edvab,zul-zul,19 +lre22_dev_eehzu,zul-zul,18 +lre22_dev_eekci,afr-afr,15 +lre22_dev_eekcw,zul-zul,17 +lre22_dev_efihg,nbl-nbl,16 +lre22_dev_efsxw,tso-tso,16 +lre22_dev_efxjv,ara-aeb,14 +lre22_dev_efymf,ara-aeb,14 +lre22_dev_ehcvr,tir-tir,19 +lre22_dev_ehehw,xho-xho,20 +lre22_dev_ehewh,eng-ens,18 +lre22_dev_ehvyp,zul-zul,14 +lre22_dev_eifqv,zul-zul,19 +lre22_dev_eifxu,ara-ayl,10 +lre22_dev_ejcvy,fra-ntf,18 +lre22_dev_ejeek,eng-ens,16 +lre22_dev_ejfyn,fra-ntf,22 +lre22_dev_ejjqg,tso-tso,12 +lre22_dev_ejtox,ven-ven,19 +lre22_dev_ejwch,fra-ntf,21 +lre22_dev_ejzhx,xho-xho,17 +lre22_dev_ekbkm,afr-afr,21 +lre22_dev_ekzhk,ara-ayl,10 +lre22_dev_elanj,tso-tso,18 +lre22_dev_elvvn,tir-tir,16 +lre22_dev_emadg,xho-xho,22 
+lre22_dev_emkzr,afr-afr,21 +lre22_dev_emmck,ara-arq,15 +lre22_dev_enwfu,afr-afr,15 +lre22_dev_eodro,ara-arq,15 +lre22_dev_eoisu,ven-ven,18 +lre22_dev_eomzr,xho-xho,23 +lre22_dev_eorva,xho-xho,21 +lre22_dev_epbwh,nbl-nbl,17 +lre22_dev_epeou,xho-xho,20 +lre22_dev_epifq,nbl-nbl,22 +lre22_dev_epqqo,ara-ayl,14 +lre22_dev_epsld,tso-tso,12 +lre22_dev_epsza,ara-ayl,12 +lre22_dev_eqmgm,ara-aeb,12 +lre22_dev_eqrhr,afr-afr,22 +lre22_dev_eqvan,ara-ayl,13 +lre22_dev_ersgd,orm-orm,22 +lre22_dev_erxig,zul-zul,15 +lre22_dev_esbrw,fra-ntf,19 +lre22_dev_esuug,nbl-nbl,20 +lre22_dev_etczk,tir-tir,14 +lre22_dev_etelz,fra-ntf,21 +lre22_dev_ettsh,fra-ntf,20 +lre22_dev_etuwp,ven-ven,19 +lre22_dev_eubgy,fra-ntf,18 +lre22_dev_euewj,orm-orm,18 +lre22_dev_euzyb,ara-aeb,13 +lre22_dev_ewatn,zul-zul,18 +lre22_dev_ewehs,orm-orm,17 +lre22_dev_ewexz,fra-ntf,18 +lre22_dev_ewgop,tir-tir,20 +lre22_dev_ewmgd,fra-ntf,21 +lre22_dev_ewzma,orm-orm,18 +lre22_dev_expvn,xho-xho,17 +lre22_dev_eyoqu,tir-tir,16 +lre22_dev_eyylz,nbl-nbl,16 +lre22_dev_eyzqu,tir-tir,18 +lre22_dev_ezdty,afr-afr,18 +lre22_dev_ezgcl,ara-aeb,13 +lre22_dev_eznzd,zul-zul,19 +lre22_dev_ezzwj,eng-iaf,18 +lre22_dev_facyr,zul-zul,18 +lre22_dev_faejb,tso-tso,16 +lre22_dev_famjw,orm-orm,18 +lre22_dev_favzh,ara-arq,11 +lre22_dev_fbsre,orm-orm,23 +lre22_dev_fbtkl,fra-ntf,22 +lre22_dev_fbvxh,ara-ayl,14 +lre22_dev_fbyhp,nbl-nbl,20 +lre22_dev_fbysf,nbl-nbl,17 +lre22_dev_fcckx,ara-arq,12 +lre22_dev_fczba,eng-iaf,17 +lre22_dev_fdouw,eng-ens,14 +lre22_dev_fdtmf,tso-tso,13 +lre22_dev_fdtnc,fra-ntf,20 +lre22_dev_fdwme,afr-afr,19 +lre22_dev_fdyhr,eng-ens,18 +lre22_dev_feanh,fra-ntf,22 +lre22_dev_femmc,ara-arq,12 +lre22_dev_fevab,orm-orm,19 +lre22_dev_fexsi,orm-orm,17 +lre22_dev_fflai,ara-aeb,14 +lre22_dev_fgblw,tso-tso,14 +lre22_dev_fglhf,nbl-nbl,22 +lre22_dev_fhucm,ara-ayl,14 +lre22_dev_fhzwp,nbl-nbl,17 +lre22_dev_fifon,eng-iaf,14 +lre22_dev_fipff,orm-orm,19 +lre22_dev_fipyx,zul-zul,14 +lre22_dev_firtn,zul-zul,18 +lre22_dev_fjdqb,nbl-nbl,16 +lre22_dev_fjdxl,tir-tir,14 +lre22_dev_fjocp,ara-ayl,12 +lre22_dev_fjudb,ara-aeb,15 +lre22_dev_fkbjz,afr-afr,22 +lre22_dev_fkwaq,afr-afr,19 +lre22_dev_flbgp,afr-afr,16 +lre22_dev_flgxs,tir-tir,13 +lre22_dev_fljfm,tir-tir,19 +lre22_dev_fmauu,tso-tso,18 +lre22_dev_fmbvf,fra-ntf,19 +lre22_dev_fmhfa,ara-arq,12 +lre22_dev_fmije,ara-ayl,13 +lre22_dev_fnafq,tir-tir,20 +lre22_dev_fofmo,eng-ens,15 +lre22_dev_foikm,tir-tir,16 +lre22_dev_fosfi,eng-iaf,19 +lre22_dev_fotti,eng-ens,13 +lre22_dev_fozzx,zul-zul,15 +lre22_dev_fpehr,ara-aeb,12 +lre22_dev_fpiig,orm-orm,21 +lre22_dev_fqfag,ara-ayl,16 +lre22_dev_fqogo,tir-tir,13 +lre22_dev_frdqe,ara-arq,11 +lre22_dev_fremq,afr-afr,22 +lre22_dev_frjdx,zul-zul,18 +lre22_dev_fruha,ara-ayl,12 +lre22_dev_frxmu,eng-iaf,18 +lre22_dev_fsbeo,tso-tso,13 +lre22_dev_fsijy,fra-ntf,22 +lre22_dev_fsjwh,nbl-nbl,18 +lre22_dev_fspmb,tso-tso,19 +lre22_dev_ftbak,tir-tir,13 +lre22_dev_ftxuo,eng-iaf,20 +lre22_dev_fupee,ara-aeb,13 +lre22_dev_fupla,ara-aeb,11 +lre22_dev_fvmdq,fra-ntf,22 +lre22_dev_fvmjb,fra-ntf,20 +lre22_dev_fvubo,fra-ntf,22 +lre22_dev_fvwze,afr-afr,23 +lre22_dev_fvxxt,ara-arq,13 +lre22_dev_fwcye,ven-ven,21 +lre22_dev_fwkwv,orm-orm,18 +lre22_dev_fxezd,orm-orm,17 +lre22_dev_fxuir,nbl-nbl,19 +lre22_dev_fzgcm,zul-zul,14 +lre22_dev_fzncb,nbl-nbl,16 +lre22_dev_gaezu,ara-aeb,11 +lre22_dev_gawox,ara-aeb,13 +lre22_dev_gbcfq,zul-zul,14 +lre22_dev_gbdkv,orm-orm,17 +lre22_dev_gbevf,eng-iaf,20 +lre22_dev_gchke,ara-aeb,12 +lre22_dev_gcncr,ara-arq,13 +lre22_dev_gdeqd,ara-ayl,14 +lre22_dev_gdncj,eng-iaf,14 
+lre22_dev_gdobt,ven-ven,21 +lre22_dev_geeoy,xho-xho,22 +lre22_dev_geraa,afr-afr,20 +lre22_dev_gfigd,nbl-nbl,16 +lre22_dev_gfjzm,ara-ayl,12 +lre22_dev_gftlv,tir-tir,20 +lre22_dev_ggaux,xho-xho,16 +lre22_dev_ggbgc,zul-zul,15 +lre22_dev_gghhn,zul-zul,18 +lre22_dev_ggrwj,eng-iaf,17 +lre22_dev_ghdur,eng-ens,15 +lre22_dev_ghgbo,ara-ayl,14 +lre22_dev_ghhop,nbl-nbl,20 +lre22_dev_ghnwg,ara-ayl,14 +lre22_dev_ghpmd,ara-ayl,14 +lre22_dev_ghqbh,orm-orm,19 +lre22_dev_gihvo,eng-ens,16 +lre22_dev_giueq,tso-tso,19 +lre22_dev_giuix,ara-aeb,15 +lre22_dev_gjaqj,eng-iaf,20 +lre22_dev_gjgcw,xho-xho,18 +lre22_dev_gjirh,eng-iaf,16 +lre22_dev_gjvwy,nbl-nbl,22 +lre22_dev_gkeql,eng-iaf,16 +lre22_dev_gkhas,tso-tso,16 +lre22_dev_glmyp,nbl-nbl,16 +lre22_dev_glqft,eng-ens,18 +lre22_dev_glsnb,afr-afr,17 +lre22_dev_gmfcb,eng-iaf,16 +lre22_dev_gmlwo,afr-afr,16 +lre22_dev_gmpjq,tso-tso,12 +lre22_dev_gmrvk,ara-aeb,14 +lre22_dev_gmryq,ara-ayl,13 +lre22_dev_gmsds,eng-ens,16 +lre22_dev_gmztl,xho-xho,16 +lre22_dev_gnbyu,eng-iaf,15 +lre22_dev_gntym,zul-zul,17 +lre22_dev_gocpa,tso-tso,15 +lre22_dev_gpyxs,orm-orm,17 +lre22_dev_grgvb,afr-afr,16 +lre22_dev_grspj,orm-orm,19 +lre22_dev_grvjm,xho-xho,19 +lre22_dev_gsidj,eng-ens,18 +lre22_dev_gslzy,afr-afr,22 +lre22_dev_gtwcl,tir-tir,14 +lre22_dev_gulky,orm-orm,21 +lre22_dev_gvlhy,tir-tir,20 +lre22_dev_gvljx,tso-tso,15 +lre22_dev_gvmma,tso-tso,13 +lre22_dev_gvtvb,afr-afr,23 +lre22_dev_gweym,xho-xho,19 +lre22_dev_gwljh,ara-aeb,11 +lre22_dev_gwxtn,ara-ayl,14 +lre22_dev_gxdpw,fra-ntf,16 +lre22_dev_gxext,afr-afr,15 +lre22_dev_gxkqq,nbl-nbl,19 +lre22_dev_gxkxo,xho-xho,21 +lre22_dev_gxnkr,xho-xho,18 +lre22_dev_gxxbk,fra-ntf,21 +lre22_dev_gydvv,afr-afr,20 +lre22_dev_gytkt,ara-arq,12 +lre22_dev_gzmvp,afr-afr,18 +lre22_dev_gzoou,ven-ven,19 +lre22_dev_gzvza,tir-tir,15 +lre22_dev_gzwee,eng-iaf,17 +lre22_dev_haewp,tir-tir,19 +lre22_dev_haokb,fra-ntf,19 +lre22_dev_hazis,nbl-nbl,20 +lre22_dev_hbbbc,eng-ens,16 +lre22_dev_hblqa,nbl-nbl,17 +lre22_dev_hbmfy,zul-zul,15 +lre22_dev_hbndl,zul-zul,17 +lre22_dev_hcgfc,eng-ens,13 +lre22_dev_hcjnx,orm-orm,17 +lre22_dev_hcont,tir-tir,17 +lre22_dev_hcvik,tso-tso,13 +lre22_dev_hczom,zul-zul,19 +lre22_dev_hdaca,xho-xho,19 +lre22_dev_hdijt,fra-ntf,15 +lre22_dev_hdkyr,afr-afr,18 +lre22_dev_hdnoq,orm-orm,23 +lre22_dev_hdtlb,eng-iaf,16 +lre22_dev_hever,nbl-nbl,18 +lre22_dev_hfirj,nbl-nbl,17 +lre22_dev_hgbxp,xho-xho,21 +lre22_dev_hgcax,xho-xho,19 +lre22_dev_hgkwa,tso-tso,13 +lre22_dev_hgljd,ara-arq,15 +lre22_dev_hgvrh,nbl-nbl,21 +lre22_dev_hhovn,eng-iaf,16 +lre22_dev_hhpzm,fra-ntf,22 +lre22_dev_hhuab,ven-ven,20 +lre22_dev_hicev,ven-ven,18 +lre22_dev_hickz,ara-arq,12 +lre22_dev_hilii,orm-orm,23 +lre22_dev_hjenx,eng-iaf,19 +lre22_dev_hjiui,orm-orm,18 +lre22_dev_hkfts,eng-ens,18 +lre22_dev_hkhvl,zul-zul,19 +lre22_dev_hkobh,xho-xho,17 +lre22_dev_hkvay,ara-arq,13 +lre22_dev_hkvtj,orm-orm,21 +lre22_dev_hlevc,fra-ntf,17 +lre22_dev_hliut,ara-aeb,14 +lre22_dev_hlntc,zul-zul,18 +lre22_dev_hlprm,zul-zul,18 +lre22_dev_hmeav,ven-ven,17 +lre22_dev_hnelt,tir-tir,15 +lre22_dev_hniiy,ara-arq,15 +lre22_dev_hoepv,ara-aeb,13 +lre22_dev_hofkm,orm-orm,19 +lre22_dev_hoilz,tir-tir,19 +lre22_dev_hookr,ara-aeb,13 +lre22_dev_hpbhl,tir-tir,16 +lre22_dev_hpbzf,ara-aeb,11 +lre22_dev_hpizl,eng-ens,15 +lre22_dev_hplhi,ara-ayl,13 +lre22_dev_hplrq,xho-xho,20 +lre22_dev_hqdva,ven-ven,21 +lre22_dev_hqnus,xho-xho,16 +lre22_dev_hqoiz,orm-orm,18 +lre22_dev_hrerz,eng-ens,14 +lre22_dev_hrgjq,tir-tir,19 +lre22_dev_hrrhr,zul-zul,17 +lre22_dev_hsfbi,ara-ayl,14 +lre22_dev_hsjlg,tir-tir,17 
+lre22_dev_hskug,afr-afr,16 +lre22_dev_hszzt,tso-tso,19 +lre22_dev_htgrl,tso-tso,18 +lre22_dev_htxah,zul-zul,17 +lre22_dev_htxrs,xho-xho,23 +lre22_dev_hudwz,nbl-nbl,17 +lre22_dev_huuqj,fra-ntf,18 +lre22_dev_hvsds,afr-afr,21 +lre22_dev_hwbhz,orm-orm,23 +lre22_dev_hwbvs,tso-tso,13 +lre22_dev_hwdlb,tso-tso,19 +lre22_dev_hwyki,eng-iaf,16 +lre22_dev_hxcmj,eng-iaf,20 +lre22_dev_hxdly,ara-arq,11 +lre22_dev_hyeqm,xho-xho,19 +lre22_dev_hyofm,ara-arq,12 +lre22_dev_hyogg,ara-arq,13 +lre22_dev_hyouu,tso-tso,13 +lre22_dev_hzfpc,fra-ntf,16 +lre22_dev_hzkjt,ara-aeb,12 +lre22_dev_hzrgv,fra-ntf,20 +lre22_dev_hzuus,tir-tir,19 +lre22_dev_hzzbp,xho-xho,19 +lre22_dev_iautt,afr-afr,20 +lre22_dev_ibdnu,tir-tir,13 +lre22_dev_ibuww,ara-aeb,13 +lre22_dev_icbuo,ven-ven,21 +lre22_dev_icqmr,tso-tso,14 +lre22_dev_ictwj,tir-tir,14 +lre22_dev_ifumz,ven-ven,14 +lre22_dev_igcgi,tso-tso,19 +lre22_dev_igder,tir-tir,19 +lre22_dev_igexm,xho-xho,21 +lre22_dev_igfxi,fra-ntf,20 +lre22_dev_igoxr,afr-afr,15 +lre22_dev_igxyt,ven-ven,21 +lre22_dev_ihqtn,ara-aeb,11 +lre22_dev_ihxfl,tir-tir,13 +lre22_dev_ihyrb,nbl-nbl,18 +lre22_dev_iifuu,tir-tir,15 +lre22_dev_iiien,xho-xho,20 +lre22_dev_ijccu,eng-iaf,16 +lre22_dev_ijrun,afr-afr,18 +lre22_dev_ijwlx,ara-arq,14 +lre22_dev_ijydw,xho-xho,21 +lre22_dev_ikdjt,xho-xho,23 +lre22_dev_iklbv,ara-arq,13 +lre22_dev_ikyai,fra-ntf,18 +lre22_dev_ildmr,orm-orm,21 +lre22_dev_ilebo,orm-orm,19 +lre22_dev_ilptc,eng-ens,18 +lre22_dev_ilsku,fra-ntf,16 +lre22_dev_ilyti,ara-arq,11 +lre22_dev_imnqh,zul-zul,17 +lre22_dev_imxdr,eng-ens,16 +lre22_dev_indww,fra-ntf,19 +lre22_dev_iokar,eng-iaf,15 +lre22_dev_iomtu,eng-iaf,15 +lre22_dev_ioobz,tir-tir,14 +lre22_dev_iosom,zul-zul,17 +lre22_dev_iowyd,ara-arq,14 +lre22_dev_iphzy,nbl-nbl,18 +lre22_dev_ipmrc,nbl-nbl,16 +lre22_dev_ipomi,ara-aeb,12 +lre22_dev_ipour,afr-afr,15 +lre22_dev_ippjq,ara-ayl,16 +lre22_dev_ipvjc,ara-aeb,13 +lre22_dev_iqfdc,ven-ven,19 +lre22_dev_iqppw,tso-tso,15 +lre22_dev_iqtde,tso-tso,14 +lre22_dev_irlee,eng-iaf,14 +lre22_dev_irxuq,ara-aeb,14 +lre22_dev_isjzo,ara-arq,14 +lre22_dev_isnwz,ara-ayl,14 +lre22_dev_isqvk,afr-afr,15 +lre22_dev_isqww,orm-orm,19 +lre22_dev_istdz,tir-tir,18 +lre22_dev_iszhe,fra-ntf,20 +lre22_dev_itblz,ven-ven,18 +lre22_dev_itfez,ara-arq,13 +lre22_dev_itjqm,zul-zul,18 +lre22_dev_itnap,nbl-nbl,21 +lre22_dev_itrms,xho-xho,21 +lre22_dev_itroi,fra-ntf,17 +lre22_dev_ittds,zul-zul,16 +lre22_dev_iuknz,tso-tso,16 +lre22_dev_iumnm,ara-ayl,15 +lre22_dev_iunul,afr-afr,23 +lre22_dev_iverq,ven-ven,16 +lre22_dev_ivwzd,ara-ayl,14 +lre22_dev_ivzjf,tso-tso,12 +lre22_dev_iwbta,nbl-nbl,16 +lre22_dev_iwdeh,orm-orm,21 +lre22_dev_iwgel,ara-aeb,11 +lre22_dev_ixbhj,ara-aeb,11 +lre22_dev_ixbnl,fra-ntf,16 +lre22_dev_ixcef,ven-ven,20 +lre22_dev_ixfdf,orm-orm,18 +lre22_dev_ixjey,orm-orm,19 +lre22_dev_ixlve,tir-tir,17 +lre22_dev_ixutu,ara-ayl,12 +lre22_dev_ixxoj,xho-xho,23 +lre22_dev_ixyko,afr-afr,22 +lre22_dev_iylls,eng-iaf,19 +lre22_dev_izegw,orm-orm,23 +lre22_dev_izglb,ara-ayl,13 +lre22_dev_iziar,ara-arq,13 +lre22_dev_jadvz,afr-afr,18 +lre22_dev_jajtw,ara-aeb,14 +lre22_dev_janvu,tso-tso,16 +lre22_dev_japrb,xho-xho,21 +lre22_dev_jarvz,ara-aeb,12 +lre22_dev_jazcn,tso-tso,13 +lre22_dev_jbfxj,tso-tso,12 +lre22_dev_jbnfg,fra-ntf,15 +lre22_dev_jbwgd,afr-afr,20 +lre22_dev_jceug,tso-tso,15 +lre22_dev_jcqtd,eng-ens,14 +lre22_dev_jcxry,ven-ven,20 +lre22_dev_jdbli,tir-tir,20 +lre22_dev_jegmb,orm-orm,18 +lre22_dev_jegqj,ara-ayl,12 +lre22_dev_jenns,xho-xho,22 +lre22_dev_jfarf,ven-ven,14 +lre22_dev_jfcve,zul-zul,17 +lre22_dev_jfgyq,xho-xho,23 
+lre22_dev_jftnz,afr-afr,14 +lre22_dev_jftsj,afr-afr,22 +lre22_dev_jgnid,nbl-nbl,16 +lre22_dev_jgsju,eng-ens,13 +lre22_dev_jifal,orm-orm,19 +lre22_dev_jihsd,orm-orm,21 +lre22_dev_jihwf,ara-ayl,11 +lre22_dev_jiptp,eng-iaf,15 +lre22_dev_jizij,tir-tir,14 +lre22_dev_jjpzg,orm-orm,23 +lre22_dev_jkezw,fra-ntf,18 +lre22_dev_jkmux,fra-ntf,20 +lre22_dev_jkpnt,orm-orm,22 +lre22_dev_jlkfj,eng-ens,18 +lre22_dev_jlmtf,ven-ven,19 +lre22_dev_jlrfm,ara-arq,12 +lre22_dev_jmojg,orm-orm,19 +lre22_dev_jmrcv,ara-aeb,13 +lre22_dev_jmsxc,eng-iaf,16 +lre22_dev_jnjpw,tir-tir,14 +lre22_dev_jnzvu,ara-aeb,14 +lre22_dev_jocyh,xho-xho,19 +lre22_dev_joezr,tso-tso,16 +lre22_dev_jofqy,ara-arq,11 +lre22_dev_jpbyf,eng-ens,15 +lre22_dev_jppuy,ara-arq,13 +lre22_dev_jptts,ara-aeb,12 +lre22_dev_jqdyx,fra-ntf,22 +lre22_dev_jqjbq,zul-zul,17 +lre22_dev_jqpnb,ven-ven,21 +lre22_dev_jqqin,zul-zul,17 +lre22_dev_jqzkq,ara-ayl,13 +lre22_dev_jrroq,orm-orm,21 +lre22_dev_jruru,eng-ens,16 +lre22_dev_jskbr,ara-arq,11 +lre22_dev_jskdd,nbl-nbl,19 +lre22_dev_jslnc,eng-ens,12 +lre22_dev_jsmat,orm-orm,17 +lre22_dev_jsmdw,ara-aeb,11 +lre22_dev_jsvaz,afr-afr,19 +lre22_dev_jsxcy,afr-afr,21 +lre22_dev_jszgk,eng-iaf,19 +lre22_dev_jthui,ven-ven,20 +lre22_dev_jtpvz,ven-ven,17 +lre22_dev_jtwdi,ven-ven,14 +lre22_dev_jtwfh,ven-ven,18 +lre22_dev_juwid,tir-tir,20 +lre22_dev_jvdww,fra-ntf,21 +lre22_dev_jweyx,tir-tir,19 +lre22_dev_jwuto,afr-afr,19 +lre22_dev_jwwgs,afr-afr,19 +lre22_dev_jxhxf,nbl-nbl,17 +lre22_dev_jxtxk,orm-orm,20 +lre22_dev_jxzvy,eng-ens,15 +lre22_dev_jyjlm,nbl-nbl,19 +lre22_dev_jynvf,ara-ayl,13 +lre22_dev_jyzmh,nbl-nbl,19 +lre22_dev_jzivf,eng-ens,14 +lre22_dev_jzpns,tso-tso,14 +lre22_dev_kadwu,fra-ntf,18 +lre22_dev_kbnbi,tir-tir,13 +lre22_dev_kbqbd,fra-ntf,16 +lre22_dev_kbscm,tso-tso,15 +lre22_dev_kbxko,ara-aeb,12 +lre22_dev_kcegv,tso-tso,15 +lre22_dev_kcibo,afr-afr,17 +lre22_dev_kcmky,ara-ayl,14 +lre22_dev_kctrd,nbl-nbl,22 +lre22_dev_kcvbf,fra-ntf,16 +lre22_dev_kdbqy,zul-zul,15 +lre22_dev_kdgpz,ara-arq,14 +lre22_dev_kdhgq,nbl-nbl,22 +lre22_dev_kdvtu,eng-iaf,16 +lre22_dev_kdyhm,tso-tso,12 +lre22_dev_keeyz,zul-zul,18 +lre22_dev_kejvy,ven-ven,18 +lre22_dev_kerpr,ven-ven,21 +lre22_dev_keweh,ara-aeb,13 +lre22_dev_keysx,orm-orm,23 +lre22_dev_kezyv,ara-ayl,13 +lre22_dev_kgbiq,ven-ven,18 +lre22_dev_kgovz,tso-tso,15 +lre22_dev_kgxka,eng-ens,16 +lre22_dev_khkcx,fra-ntf,20 +lre22_dev_khobl,orm-orm,19 +lre22_dev_khttn,afr-afr,17 +lre22_dev_khvss,tir-tir,15 +lre22_dev_kiezl,tso-tso,16 +lre22_dev_kihlw,eng-ens,14 +lre22_dev_kipuq,ara-arq,14 +lre22_dev_kiqcx,tir-tir,16 +lre22_dev_kjiks,xho-xho,19 +lre22_dev_kjmpa,zul-zul,18 +lre22_dev_kjocf,eng-iaf,16 +lre22_dev_kkbur,ven-ven,16 +lre22_dev_kksdi,xho-xho,22 +lre22_dev_kkytv,ara-aeb,11 +lre22_dev_kmkgx,nbl-nbl,17 +lre22_dev_kmpkm,zul-zul,19 +lre22_dev_kmyzy,ara-ayl,13 +lre22_dev_knfsj,afr-afr,15 +lre22_dev_knyuq,orm-orm,19 +lre22_dev_koacp,orm-orm,19 +lre22_dev_koket,eng-ens,18 +lre22_dev_kovdn,zul-zul,15 +lre22_dev_kowqf,ven-ven,19 +lre22_dev_kozfr,nbl-nbl,21 +lre22_dev_kpmyz,orm-orm,19 +lre22_dev_kqfdc,eng-ens,17 +lre22_dev_kqumw,fra-ntf,22 +lre22_dev_kqwdi,nbl-nbl,16 +lre22_dev_krczb,ven-ven,19 +lre22_dev_kremz,nbl-nbl,16 +lre22_dev_ksruw,ven-ven,18 +lre22_dev_kszdw,eng-iaf,20 +lre22_dev_ktgvi,ara-arq,11 +lre22_dev_ktjax,fra-ntf,20 +lre22_dev_ktlvc,orm-orm,19 +lre22_dev_kvqgp,afr-afr,21 +lre22_dev_kvyoz,afr-afr,20 +lre22_dev_kvzim,afr-afr,14 +lre22_dev_kvzwc,eng-iaf,14 +lre22_dev_kwcwa,ara-arq,14 +lre22_dev_kwomo,zul-zul,19 +lre22_dev_kwxau,xho-xho,18 +lre22_dev_kxawf,tir-tir,19 
+lre22_dev_kxjhn,ara-aeb,11 +lre22_dev_kxklh,tir-tir,19 +lre22_dev_kxlgg,tir-tir,16 +lre22_dev_kyqbp,fra-ntf,21 +lre22_dev_kyzio,ven-ven,20 +lre22_dev_kzcgh,ara-ayl,13 +lre22_dev_kzeyf,ven-ven,18 +lre22_dev_kzfwf,fra-ntf,19 +lre22_dev_kzjuz,orm-orm,21 +lre22_dev_kzjwx,ara-ayl,11 +lre22_dev_lamjl,tso-tso,17 +lre22_dev_laowh,xho-xho,16 +lre22_dev_larex,ara-ayl,11 +lre22_dev_laycs,tso-tso,12 +lre22_dev_lbxfn,eng-iaf,20 +lre22_dev_lcrog,zul-zul,18 +lre22_dev_ldczz,xho-xho,17 +lre22_dev_ldkgv,ara-aeb,13 +lre22_dev_ldkst,fra-ntf,20 +lre22_dev_ldkwr,orm-orm,22 +lre22_dev_lenxf,ven-ven,14 +lre22_dev_lfbey,ara-ayl,12 +lre22_dev_lfmml,fra-ntf,18 +lre22_dev_lfmxu,ven-ven,18 +lre22_dev_lfqfj,afr-afr,17 +lre22_dev_lgetu,ara-aeb,14 +lre22_dev_lgleu,ara-ayl,11 +lre22_dev_lgoat,eng-iaf,16 +lre22_dev_lhgaj,tso-tso,15 +lre22_dev_lhqyw,nbl-nbl,17 +lre22_dev_lhrmr,eng-iaf,17 +lre22_dev_lhtsd,tir-tir,19 +lre22_dev_lhydp,fra-ntf,22 +lre22_dev_livbf,tir-tir,15 +lre22_dev_ljdrg,ara-arq,13 +lre22_dev_ljniw,tso-tso,16 +lre22_dev_ljpmq,tso-tso,12 +lre22_dev_lkjon,tso-tso,15 +lre22_dev_lkszp,nbl-nbl,19 +lre22_dev_llbim,ara-ayl,15 +lre22_dev_llkkt,fra-ntf,15 +lre22_dev_llvcc,orm-orm,22 +lre22_dev_lmbug,ara-arq,12 +lre22_dev_lmmmw,nbl-nbl,19 +lre22_dev_lmsek,ven-ven,16 +lre22_dev_lmudp,ara-ayl,10 +lre22_dev_lmzmv,eng-iaf,19 +lre22_dev_lnlae,ara-arq,14 +lre22_dev_lnlvt,zul-zul,17 +lre22_dev_lnppu,ara-ayl,13 +lre22_dev_lnpyc,tso-tso,19 +lre22_dev_lolkv,xho-xho,19 +lre22_dev_lorcx,nbl-nbl,20 +lre22_dev_lparq,xho-xho,16 +lre22_dev_lqlft,ara-arq,11 +lre22_dev_lqlyq,ara-arq,12 +lre22_dev_lqoeu,tso-tso,14 +lre22_dev_lqueh,ara-ayl,11 +lre22_dev_lquzk,ara-arq,12 +lre22_dev_lqvav,zul-zul,18 +lre22_dev_lrgpy,eng-iaf,16 +lre22_dev_lrjbn,ven-ven,21 +lre22_dev_lrtad,ara-arq,14 +lre22_dev_lrtxd,ara-aeb,11 +lre22_dev_lrvkn,ven-ven,16 +lre22_dev_lrzwy,ara-ayl,13 +lre22_dev_lsefk,ara-arq,13 +lre22_dev_ltmmt,orm-orm,22 +lre22_dev_lutgh,ara-aeb,15 +lre22_dev_lvhmd,tso-tso,14 +lre22_dev_lvqim,ara-aeb,14 +lre22_dev_lvuuo,fra-ntf,17 +lre22_dev_lvzri,ven-ven,16 +lre22_dev_lweml,ara-arq,14 +lre22_dev_lwstj,eng-iaf,16 +lre22_dev_lwzdj,afr-afr,18 +lre22_dev_lxdsk,eng-ens,16 +lre22_dev_lxlcr,ara-aeb,13 +lre22_dev_lxshv,eng-iaf,20 +lre22_dev_lxxvv,eng-ens,16 +lre22_dev_lyfhc,ven-ven,18 +lre22_dev_lyikp,zul-zul,19 +lre22_dev_lyjix,tso-tso,14 +lre22_dev_lyxyh,eng-iaf,19 +lre22_dev_lyzxd,tir-tir,17 +lre22_dev_lzguf,orm-orm,21 +lre22_dev_lzpmk,tir-tir,16 +lre22_dev_lzugv,xho-xho,19 +lre22_dev_maeeb,tir-tir,15 +lre22_dev_maemn,zul-zul,16 +lre22_dev_manpw,orm-orm,19 +lre22_dev_mavli,ara-aeb,12 +lre22_dev_mbywd,orm-orm,19 +lre22_dev_mcath,nbl-nbl,22 +lre22_dev_mcjtw,xho-xho,16 +lre22_dev_mcndd,ven-ven,15 +lre22_dev_mcxqb,tir-tir,13 +lre22_dev_mdlia,fra-ntf,16 +lre22_dev_mdxsp,eng-ens,18 +lre22_dev_menex,eng-iaf,16 +lre22_dev_merfk,orm-orm,21 +lre22_dev_mfipk,zul-zul,16 +lre22_dev_mfuqh,ara-arq,14 +lre22_dev_mgcvo,xho-xho,19 +lre22_dev_mggbx,zul-zul,18 +lre22_dev_mgghl,tso-tso,12 +lre22_dev_mgwqd,ara-arq,14 +lre22_dev_mhswt,ara-ayl,15 +lre22_dev_mhwmt,tso-tso,16 +lre22_dev_miayn,ara-aeb,12 +lre22_dev_miley,tso-tso,16 +lre22_dev_mjfmb,nbl-nbl,21 +lre22_dev_mkbyx,tir-tir,19 +lre22_dev_mlbzi,xho-xho,23 +lre22_dev_mlduq,xho-xho,16 +lre22_dev_mljnp,ara-arq,14 +lre22_dev_mljpb,orm-orm,22 +lre22_dev_mlrsm,xho-xho,17 +lre22_dev_mlwzr,eng-ens,13 +lre22_dev_mlyeo,ven-ven,15 +lre22_dev_mmaed,ara-ayl,14 +lre22_dev_mmbns,eng-ens,12 +lre22_dev_mneyt,xho-xho,17 +lre22_dev_mnhsk,ven-ven,14 +lre22_dev_mnnvk,eng-ens,15 +lre22_dev_mnswo,tso-tso,16 
+lre22_dev_mntdk,eng-ens,18 +lre22_dev_mogwl,orm-orm,22 +lre22_dev_mpbun,nbl-nbl,21 +lre22_dev_mpmuf,ara-aeb,14 +lre22_dev_mpoet,nbl-nbl,16 +lre22_dev_mptyi,afr-afr,18 +lre22_dev_mpzxy,orm-orm,18 +lre22_dev_mqxni,ara-arq,11 +lre22_dev_mqzga,tso-tso,19 +lre22_dev_mrgdh,xho-xho,17 +lre22_dev_mrgko,afr-afr,18 +lre22_dev_mrksc,tir-tir,19 +lre22_dev_mrogp,eng-iaf,15 +lre22_dev_mscwd,fra-ntf,16 +lre22_dev_mshco,ara-ayl,12 +lre22_dev_msptn,ara-ayl,16 +lre22_dev_msslk,ara-aeb,14 +lre22_dev_mtaus,fra-ntf,19 +lre22_dev_mtpgl,tso-tso,13 +lre22_dev_mttly,tir-tir,19 +lre22_dev_mubqn,fra-ntf,15 +lre22_dev_muskv,tso-tso,12 +lre22_dev_muzkp,ara-arq,14 +lre22_dev_mvdus,ven-ven,19 +lre22_dev_mvngl,xho-xho,19 +lre22_dev_mvrpq,tso-tso,12 +lre22_dev_mvtcj,afr-afr,22 +lre22_dev_mwhsu,xho-xho,21 +lre22_dev_mwkyp,nbl-nbl,20 +lre22_dev_mxcey,ara-ayl,12 +lre22_dev_mxcub,ara-aeb,12 +lre22_dev_myekh,ara-aeb,11 +lre22_dev_mzxhf,zul-zul,17 +lre22_dev_mzyru,ara-arq,12 +lre22_dev_nakax,eng-iaf,15 +lre22_dev_naymc,ara-ayl,13 +lre22_dev_nbgid,orm-orm,19 +lre22_dev_nbmnl,xho-xho,16 +lre22_dev_ncffi,zul-zul,14 +lre22_dev_ncjtj,fra-ntf,22 +lre22_dev_ncpix,ara-ayl,11 +lre22_dev_nctqc,xho-xho,16 +lre22_dev_ndkuo,orm-orm,20 +lre22_dev_ndqfw,nbl-nbl,17 +lre22_dev_nedes,ven-ven,15 +lre22_dev_neomw,zul-zul,18 +lre22_dev_neziz,tir-tir,19 +lre22_dev_nfcvg,eng-iaf,17 +lre22_dev_nfdfc,afr-afr,17 +lre22_dev_ngijv,xho-xho,21 +lre22_dev_ngrxk,ara-ayl,13 +lre22_dev_ngzja,ara-aeb,13 +lre22_dev_nhaub,tso-tso,13 +lre22_dev_nhkro,xho-xho,23 +lre22_dev_nhlvt,ara-arq,14 +lre22_dev_nhlxm,eng-ens,14 +lre22_dev_nhyjy,afr-afr,17 +lre22_dev_nifei,zul-zul,19 +lre22_dev_nikpx,ven-ven,18 +lre22_dev_njceq,afr-afr,18 +lre22_dev_njmlt,eng-ens,17 +lre22_dev_njqfj,orm-orm,18 +lre22_dev_nkdje,eng-iaf,19 +lre22_dev_nkkqo,nbl-nbl,22 +lre22_dev_nknrw,orm-orm,21 +lre22_dev_nkogd,fra-ntf,19 +lre22_dev_nksfc,tir-tir,19 +lre22_dev_nkwmm,orm-orm,22 +lre22_dev_nmhdg,ara-ayl,10 +lre22_dev_nmoux,ven-ven,20 +lre22_dev_nmrsq,ven-ven,21 +lre22_dev_nnbhc,fra-ntf,20 +lre22_dev_nnbpy,tir-tir,18 +lre22_dev_nnpwd,ara-aeb,13 +lre22_dev_nodin,ara-ayl,14 +lre22_dev_nogji,nbl-nbl,20 +lre22_dev_nonvr,afr-afr,15 +lre22_dev_notcl,eng-iaf,19 +lre22_dev_noufn,ara-aeb,11 +lre22_dev_noveb,ara-ayl,11 +lre22_dev_npajm,nbl-nbl,19 +lre22_dev_npehj,ara-ayl,14 +lre22_dev_nqdaj,tso-tso,12 +lre22_dev_nqkon,xho-xho,18 +lre22_dev_nqlhw,ara-aeb,13 +lre22_dev_nraqr,eng-ens,14 +lre22_dev_nrino,tso-tso,14 +lre22_dev_nrzgt,xho-xho,16 +lre22_dev_nscrg,orm-orm,18 +lre22_dev_nstgp,orm-orm,23 +lre22_dev_ntgqz,afr-afr,23 +lre22_dev_nthzr,eng-iaf,18 +lre22_dev_ntwzb,afr-afr,16 +lre22_dev_nudwv,eng-ens,14 +lre22_dev_nuerz,eng-iaf,18 +lre22_dev_nujfy,xho-xho,21 +lre22_dev_nurlx,eng-ens,13 +lre22_dev_nvakd,zul-zul,17 +lre22_dev_nvgkj,eng-ens,17 +lre22_dev_nvhvv,fra-ntf,20 +lre22_dev_nwbnz,ara-arq,14 +lre22_dev_nwjed,nbl-nbl,19 +lre22_dev_nwrto,ara-aeb,11 +lre22_dev_nwunl,zul-zul,14 +lre22_dev_nwvyy,tir-tir,19 +lre22_dev_nxwlo,nbl-nbl,17 +lre22_dev_nxxzy,zul-zul,16 +lre22_dev_nxzpp,nbl-nbl,20 +lre22_dev_nyhwg,ara-arq,14 +lre22_dev_nykvr,eng-ens,17 +lre22_dev_nyvkc,tir-tir,15 +lre22_dev_nyyui,ara-arq,11 +lre22_dev_nzbfh,zul-zul,19 +lre22_dev_nzxsk,xho-xho,21 +lre22_dev_oasrh,ara-arq,11 +lre22_dev_oavaf,xho-xho,21 +lre22_dev_obfrf,orm-orm,20 +lre22_dev_obocn,ara-arq,14 +lre22_dev_obumo,eng-ens,15 +lre22_dev_ocbuj,eng-ens,12 +lre22_dev_ocbxu,nbl-nbl,21 +lre22_dev_ocdvw,ara-ayl,13 +lre22_dev_ocdzj,xho-xho,19 +lre22_dev_ocveq,fra-ntf,22 +lre22_dev_odest,ara-ayl,11 +lre22_dev_odjlq,ven-ven,18 
+lre22_dev_odpoq,ara-ayl,12 +lre22_dev_odrcm,fra-ntf,21 +lre22_dev_oeavx,ara-arq,12 +lre22_dev_oefoy,ara-aeb,12 +lre22_dev_oefqy,ven-ven,16 +lre22_dev_oehxk,ara-ayl,12 +lre22_dev_oeqbo,ara-aeb,14 +lre22_dev_oeqjq,fra-ntf,20 +lre22_dev_ofdgy,ara-ayl,15 +lre22_dev_ofgkq,fra-ntf,21 +lre22_dev_ofpva,ara-arq,11 +lre22_dev_ofufy,eng-iaf,17 +lre22_dev_ogglz,ara-aeb,13 +lre22_dev_oggtr,nbl-nbl,19 +lre22_dev_ogpxk,ara-aeb,11 +lre22_dev_ogsay,tso-tso,19 +lre22_dev_ogtvj,zul-zul,19 +lre22_dev_ohqwz,ara-arq,13 +lre22_dev_ohuxo,afr-afr,20 +lre22_dev_ohweb,ven-ven,16 +lre22_dev_ohzpg,fra-ntf,21 +lre22_dev_oijcy,xho-xho,19 +lre22_dev_oijgv,tir-tir,16 +lre22_dev_oikqj,eng-iaf,17 +lre22_dev_oinvl,ven-ven,15 +lre22_dev_oiofr,fra-ntf,19 +lre22_dev_oipks,eng-ens,17 +lre22_dev_ojzos,ara-arq,14 +lre22_dev_okbnu,ara-ayl,10 +lre22_dev_okpcp,eng-iaf,18 +lre22_dev_okwpq,tso-tso,16 +lre22_dev_oleie,ara-arq,12 +lre22_dev_oljep,ven-ven,21 +lre22_dev_oljsa,fra-ntf,16 +lre22_dev_olkup,nbl-nbl,16 +lre22_dev_olqbh,ara-ayl,14 +lre22_dev_omjqo,ara-aeb,14 +lre22_dev_omwiy,ara-ayl,12 +lre22_dev_omxnk,ara-arq,13 +lre22_dev_onqke,eng-iaf,16 +lre22_dev_onzje,tir-tir,13 +lre22_dev_ooktw,afr-afr,18 +lre22_dev_oosff,ara-aeb,12 +lre22_dev_ootbi,xho-xho,21 +lre22_dev_opciz,orm-orm,23 +lre22_dev_opgny,xho-xho,19 +lre22_dev_opifd,ara-arq,12 +lre22_dev_oporo,eng-iaf,19 +lre22_dev_opryj,nbl-nbl,16 +lre22_dev_opuzh,eng-ens,12 +lre22_dev_oqbaw,ven-ven,18 +lre22_dev_oqeuj,tir-tir,14 +lre22_dev_oqmhb,xho-xho,21 +lre22_dev_oqmrs,ara-arq,14 +lre22_dev_oqqwq,tso-tso,12 +lre22_dev_oquaq,xho-xho,17 +lre22_dev_oriap,fra-ntf,20 +lre22_dev_orsjj,tir-tir,20 +lre22_dev_orvna,fra-ntf,21 +lre22_dev_oskoe,orm-orm,20 +lre22_dev_otlyk,nbl-nbl,18 +lre22_dev_oujnj,nbl-nbl,17 +lre22_dev_oumka,ven-ven,14 +lre22_dev_ouqsx,ara-arq,13 +lre22_dev_outyl,zul-zul,16 +lre22_dev_owlwt,ara-ayl,14 +lre22_dev_owvfd,orm-orm,18 +lre22_dev_oxizc,tir-tir,15 +lre22_dev_oxpht,eng-ens,18 +lre22_dev_oxqlz,afr-afr,15 +lre22_dev_oydiw,nbl-nbl,16 +lre22_dev_oyfcl,fra-ntf,22 +lre22_dev_oyhba,eng-ens,18 +lre22_dev_oyiif,afr-afr,17 +lre22_dev_oyslg,afr-afr,21 +lre22_dev_ozfpi,tir-tir,15 +lre22_dev_ozlww,ven-ven,19 +lre22_dev_paxnc,eng-ens,17 +lre22_dev_pbbgx,eng-iaf,14 +lre22_dev_pcfmw,nbl-nbl,21 +lre22_dev_pclpc,fra-ntf,15 +lre22_dev_pcmmj,afr-afr,16 +lre22_dev_pcsqz,tso-tso,18 +lre22_dev_pdcfm,ara-ayl,10 +lre22_dev_pdtuf,eng-ens,18 +lre22_dev_pdzuj,zul-zul,17 +lre22_dev_pehfu,fra-ntf,15 +lre22_dev_pewpj,orm-orm,22 +lre22_dev_pexjz,orm-orm,17 +lre22_dev_pfioj,eng-iaf,15 +lre22_dev_pfkcf,eng-iaf,16 +lre22_dev_pfknl,ara-arq,14 +lre22_dev_pfucv,ara-ayl,12 +lre22_dev_pfyha,fra-ntf,21 +lre22_dev_pgavf,ara-ayl,13 +lre22_dev_phket,nbl-nbl,22 +lre22_dev_piabk,afr-afr,19 +lre22_dev_picvg,orm-orm,17 +lre22_dev_piina,eng-ens,14 +lre22_dev_pjahm,afr-afr,20 +lre22_dev_pjcso,nbl-nbl,17 +lre22_dev_pjggp,ven-ven,16 +lre22_dev_pjohw,xho-xho,19 +lre22_dev_pkpxo,ara-ayl,11 +lre22_dev_pktgk,nbl-nbl,22 +lre22_dev_plojq,eng-ens,12 +lre22_dev_pmayg,ven-ven,21 +lre22_dev_pmjyi,xho-xho,20 +lre22_dev_pmkcp,nbl-nbl,20 +lre22_dev_pnfhk,fra-ntf,18 +lre22_dev_pnust,nbl-nbl,20 +lre22_dev_pnwey,eng-iaf,15 +lre22_dev_pnwti,ara-aeb,13 +lre22_dev_pohmm,afr-afr,14 +lre22_dev_pojvr,nbl-nbl,22 +lre22_dev_poxsw,ara-aeb,13 +lre22_dev_ppjvq,tir-tir,16 +lre22_dev_ppkfc,fra-ntf,19 +lre22_dev_ppmnu,tso-tso,12 +lre22_dev_ppzno,tso-tso,12 +lre22_dev_pqksl,afr-afr,14 +lre22_dev_pqnvh,zul-zul,19 +lre22_dev_prcus,tso-tso,15 +lre22_dev_prhoh,tir-tir,19 +lre22_dev_prkth,ara-arq,12 +lre22_dev_prnhd,xho-xho,18 
+lre22_dev_psjma,fra-ntf,18 +lre22_dev_psldq,tir-tir,19 +lre22_dev_psnvo,afr-afr,15 +lre22_dev_psnzj,zul-zul,19 +lre22_dev_pudqr,eng-ens,17 +lre22_dev_pufnl,orm-orm,19 +lre22_dev_pusxa,nbl-nbl,22 +lre22_dev_pvsqi,ara-arq,11 +lre22_dev_pvteg,fra-ntf,17 +lre22_dev_pvvay,tir-tir,14 +lre22_dev_pvxcv,ara-aeb,15 +lre22_dev_pvygc,ara-aeb,11 +lre22_dev_pwcxu,tir-tir,13 +lre22_dev_pwhdm,nbl-nbl,17 +lre22_dev_pwnkz,ven-ven,20 +lre22_dev_pwrqe,ara-aeb,14 +lre22_dev_pxbhi,afr-afr,16 +lre22_dev_pxeyk,zul-zul,18 +lre22_dev_pxkzd,ara-arq,14 +lre22_dev_pydgm,afr-afr,19 +lre22_dev_pyiju,ven-ven,20 +lre22_dev_pzhrc,tso-tso,13 +lre22_dev_pzkea,ven-ven,14 +lre22_dev_pzqka,ara-arq,11 +lre22_dev_pzuis,ara-arq,13 +lre22_dev_qabac,ven-ven,19 +lre22_dev_qahym,ara-ayl,11 +lre22_dev_qaxfr,xho-xho,17 +lre22_dev_qazyc,ara-ayl,14 +lre22_dev_qbcoz,nbl-nbl,22 +lre22_dev_qcavr,eng-iaf,20 +lre22_dev_qcbkh,fra-ntf,18 +lre22_dev_qcbtt,afr-afr,18 +lre22_dev_qclly,xho-xho,22 +lre22_dev_qcqdt,eng-iaf,18 +lre22_dev_qdqzp,zul-zul,17 +lre22_dev_qdwut,eng-ens,16 +lre22_dev_qehxr,afr-afr,22 +lre22_dev_qeqah,tir-tir,16 +lre22_dev_qeyjd,afr-afr,17 +lre22_dev_qfprv,ara-ayl,13 +lre22_dev_qfqhi,ara-ayl,15 +lre22_dev_qgoge,tso-tso,13 +lre22_dev_qgrlb,eng-iaf,16 +lre22_dev_qgrsu,zul-zul,14 +lre22_dev_qheor,xho-xho,23 +lre22_dev_qhfdz,tso-tso,14 +lre22_dev_qhlol,ven-ven,21 +lre22_dev_qhnfr,zul-zul,15 +lre22_dev_qhvuq,tso-tso,14 +lre22_dev_qibby,afr-afr,23 +lre22_dev_qicen,orm-orm,16 +lre22_dev_qiehd,eng-iaf,14 +lre22_dev_qjbfh,eng-iaf,15 +lre22_dev_qjdln,afr-afr,19 +lre22_dev_qjmro,ara-ayl,11 +lre22_dev_qkgor,zul-zul,16 +lre22_dev_qlgvf,ara-aeb,12 +lre22_dev_qlpjn,eng-iaf,16 +lre22_dev_qmoop,nbl-nbl,16 +lre22_dev_qmqhy,afr-afr,20 +lre22_dev_qmreh,ara-ayl,10 +lre22_dev_qmucf,ven-ven,18 +lre22_dev_qmvnu,fra-ntf,15 +lre22_dev_qmzke,ara-ayl,13 +lre22_dev_qmzxw,orm-orm,21 +lre22_dev_qnams,ven-ven,20 +lre22_dev_qnefv,xho-xho,23 +lre22_dev_qodht,zul-zul,19 +lre22_dev_qoqtk,eng-ens,16 +lre22_dev_qotto,fra-ntf,18 +lre22_dev_qoudd,tso-tso,18 +lre22_dev_qpego,ara-ayl,14 +lre22_dev_qphcb,fra-ntf,22 +lre22_dev_qqkiv,ara-arq,13 +lre22_dev_qqmeu,eng-ens,17 +lre22_dev_qqudk,orm-orm,21 +lre22_dev_qqvdr,orm-orm,23 +lre22_dev_qrbmq,ara-arq,12 +lre22_dev_qrfvx,fra-ntf,22 +lre22_dev_qrsqg,zul-zul,19 +lre22_dev_qrylo,eng-ens,18 +lre22_dev_qsbdh,nbl-nbl,16 +lre22_dev_qsqzo,afr-afr,14 +lre22_dev_qsudg,nbl-nbl,22 +lre22_dev_qszwt,fra-ntf,21 +lre22_dev_qtcmx,nbl-nbl,21 +lre22_dev_qtfpf,zul-zul,16 +lre22_dev_qtkhk,afr-afr,22 +lre22_dev_qtydg,afr-afr,22 +lre22_dev_qujmp,zul-zul,19 +lre22_dev_qulse,eng-ens,17 +lre22_dev_qutbz,eng-ens,18 +lre22_dev_quvqg,ara-aeb,13 +lre22_dev_qvpjs,eng-iaf,19 +lre22_dev_qvtdy,tso-tso,12 +lre22_dev_qvzol,orm-orm,19 +lre22_dev_qwvgm,ara-ayl,13 +lre22_dev_qwzxt,zul-zul,19 +lre22_dev_qxigw,tir-tir,19 +lre22_dev_qxkuu,tso-tso,13 +lre22_dev_qxtss,afr-afr,15 +lre22_dev_qxvbe,nbl-nbl,17 +lre22_dev_qxysh,afr-afr,22 +lre22_dev_qyfba,zul-zul,14 +lre22_dev_qyfov,fra-ntf,19 +lre22_dev_qyjgj,afr-afr,22 +lre22_dev_qyuwy,ara-aeb,15 +lre22_dev_qzfdr,nbl-nbl,18 +lre22_dev_qzldb,eng-iaf,19 +lre22_dev_ranrd,nbl-nbl,22 +lre22_dev_raurj,eng-ens,12 +lre22_dev_rbntq,ara-arq,11 +lre22_dev_rbssw,ara-aeb,11 +lre22_dev_rbwgx,ara-ayl,16 +lre22_dev_rcooi,fra-ntf,18 +lre22_dev_rcyom,ara-ayl,11 +lre22_dev_rdcns,zul-zul,18 +lre22_dev_rdrhv,ara-arq,11 +lre22_dev_rdyxn,eng-iaf,19 +lre22_dev_repec,tir-tir,19 +lre22_dev_rgbby,tso-tso,19 +lre22_dev_rgdvt,fra-ntf,20 +lre22_dev_rguqm,tso-tso,14 +lre22_dev_rgwjy,afr-afr,19 +lre22_dev_rijeq,orm-orm,19 
+lre22_dev_rincv,tir-tir,16 +lre22_dev_rindo,zul-zul,17 +lre22_dev_rirhy,ara-arq,11 +lre22_dev_rjikw,fra-ntf,20 +lre22_dev_rjsik,tso-tso,16 +lre22_dev_rjvvj,tso-tso,19 +lre22_dev_rksid,nbl-nbl,22 +lre22_dev_rkycg,ven-ven,21 +lre22_dev_rlamm,zul-zul,15 +lre22_dev_rllya,tso-tso,15 +lre22_dev_rlzrk,eng-ens,14 +lre22_dev_rmxbg,tir-tir,14 +lre22_dev_rnrsy,tir-tir,19 +lre22_dev_rokej,xho-xho,17 +lre22_dev_rooaf,fra-ntf,17 +lre22_dev_rorob,ven-ven,15 +lre22_dev_rowwe,nbl-nbl,17 +lre22_dev_rqcuw,ara-ayl,11 +lre22_dev_rqdte,ara-ayl,10 +lre22_dev_rqpau,tso-tso,15 +lre22_dev_rquba,ven-ven,19 +lre22_dev_rrbgv,afr-afr,20 +lre22_dev_rsvjn,fra-ntf,16 +lre22_dev_rsynm,tir-tir,19 +lre22_dev_rtezn,tir-tir,19 +lre22_dev_rtkum,orm-orm,21 +lre22_dev_rturg,zul-zul,17 +lre22_dev_runwu,tir-tir,16 +lre22_dev_rvbmf,tso-tso,12 +lre22_dev_rvfls,tso-tso,16 +lre22_dev_rvhxb,ara-aeb,11 +lre22_dev_rvufk,orm-orm,20 +lre22_dev_rvzbo,ara-ayl,14 +lre22_dev_rwhfu,xho-xho,16 +lre22_dev_rwhiz,ara-ayl,10 +lre22_dev_rwimz,ven-ven,16 +lre22_dev_rwish,eng-ens,16 +lre22_dev_rwpzp,xho-xho,19 +lre22_dev_rwqlq,tir-tir,19 +lre22_dev_rwsnw,afr-afr,15 +lre22_dev_rwzwb,tso-tso,19 +lre22_dev_rxcjq,ara-arq,13 +lre22_dev_rxcka,ara-arq,14 +lre22_dev_rxgxu,tir-tir,19 +lre22_dev_rxqxn,nbl-nbl,20 +lre22_dev_rxwip,ara-ayl,10 +lre22_dev_rycca,ven-ven,14 +lre22_dev_rydpu,eng-ens,17 +lre22_dev_ryksb,ven-ven,14 +lre22_dev_rysmu,afr-afr,23 +lre22_dev_rzisy,ara-aeb,13 +lre22_dev_rzpus,ara-arq,15 +lre22_dev_rzqyn,ara-ayl,11 +lre22_dev_rzzca,orm-orm,21 +lre22_dev_sazdy,tso-tso,15 +lre22_dev_sbkip,afr-afr,14 +lre22_dev_sbyek,ara-arq,11 +lre22_dev_scjzn,xho-xho,21 +lre22_dev_scobo,ven-ven,17 +lre22_dev_scqui,orm-orm,16 +lre22_dev_sdccf,ara-arq,14 +lre22_dev_sdcty,tso-tso,19 +lre22_dev_sdebh,ara-ayl,12 +lre22_dev_sedif,orm-orm,21 +lre22_dev_sedug,xho-xho,18 +lre22_dev_seynu,tso-tso,13 +lre22_dev_seyxt,ara-aeb,13 +lre22_dev_sezun,ara-aeb,14 +lre22_dev_sfeyl,ara-aeb,12 +lre22_dev_sfnux,afr-afr,18 +lre22_dev_sfqnk,zul-zul,15 +lre22_dev_sftvb,ara-ayl,11 +lre22_dev_sfwkd,ven-ven,17 +lre22_dev_shgbp,fra-ntf,22 +lre22_dev_shikk,tir-tir,19 +lre22_dev_shpve,afr-afr,21 +lre22_dev_sidjm,ara-ayl,10 +lre22_dev_sihvc,orm-orm,17 +lre22_dev_siiaw,ven-ven,16 +lre22_dev_sinfr,xho-xho,19 +lre22_dev_sipnk,eng-iaf,16 +lre22_dev_sjbcr,tir-tir,19 +lre22_dev_sjdzp,eng-iaf,16 +lre22_dev_sjmsx,ven-ven,19 +lre22_dev_sjsnf,afr-afr,16 +lre22_dev_sjwmd,tir-tir,19 +lre22_dev_sjxce,nbl-nbl,16 +lre22_dev_sjzcc,eng-ens,13 +lre22_dev_sjzsv,fra-ntf,22 +lre22_dev_skegk,afr-afr,18 +lre22_dev_skpib,ven-ven,14 +lre22_dev_slgub,orm-orm,18 +lre22_dev_slryu,nbl-nbl,17 +lre22_dev_slupt,ara-ayl,13 +lre22_dev_smfbl,ara-aeb,14 +lre22_dev_smfon,xho-xho,20 +lre22_dev_smvms,afr-afr,18 +lre22_dev_snegl,xho-xho,18 +lre22_dev_snvvg,tso-tso,14 +lre22_dev_sobpf,orm-orm,19 +lre22_dev_soely,eng-iaf,14 +lre22_dev_sorzd,tir-tir,19 +lre22_dev_spixz,nbl-nbl,18 +lre22_dev_spjcl,fra-ntf,17 +lre22_dev_spzra,tso-tso,17 +lre22_dev_sqaei,xho-xho,23 +lre22_dev_sqime,ven-ven,14 +lre22_dev_srgaw,eng-iaf,15 +lre22_dev_srnhq,ven-ven,16 +lre22_dev_srsng,orm-orm,21 +lre22_dev_srysc,nbl-nbl,17 +lre22_dev_srzgk,eng-ens,16 +lre22_dev_srzsi,ara-aeb,14 +lre22_dev_ssjtt,nbl-nbl,16 +lre22_dev_stajf,xho-xho,21 +lre22_dev_sttfd,ara-aeb,15 +lre22_dev_suevr,ara-aeb,15 +lre22_dev_sumum,afr-afr,18 +lre22_dev_svukm,fra-ntf,20 +lre22_dev_swkzf,tir-tir,17 +lre22_dev_sxqmv,ara-aeb,11 +lre22_dev_sxvuf,ara-aeb,11 +lre22_dev_sydqt,eng-ens,18 +lre22_dev_syooe,eng-ens,14 +lre22_dev_szpip,tir-tir,17 +lre22_dev_szsgp,fra-ntf,19 
+lre22_dev_szzuj,ara-ayl,11 +lre22_dev_tabof,orm-orm,19 +lre22_dev_tavcw,ven-ven,19 +lre22_dev_tbjal,xho-xho,22 +lre22_dev_tbxzb,fra-ntf,21 +lre22_dev_tdalr,nbl-nbl,18 +lre22_dev_tdfzf,eng-iaf,17 +lre22_dev_tdlyk,tir-tir,15 +lre22_dev_tefms,fra-ntf,15 +lre22_dev_telgo,xho-xho,19 +lre22_dev_teric,eng-ens,14 +lre22_dev_tfcgx,orm-orm,21 +lre22_dev_tgiid,xho-xho,19 +lre22_dev_tgoea,ara-ayl,13 +lre22_dev_tgrrk,eng-iaf,18 +lre22_dev_tgtyv,tso-tso,12 +lre22_dev_tgzex,tso-tso,12 +lre22_dev_thone,nbl-nbl,17 +lre22_dev_thpnk,afr-afr,18 +lre22_dev_thwls,ven-ven,17 +lre22_dev_tibov,tir-tir,14 +lre22_dev_tidld,tso-tso,16 +lre22_dev_tiezu,eng-ens,17 +lre22_dev_tioqa,nbl-nbl,16 +lre22_dev_tiuym,zul-zul,15 +lre22_dev_tjivp,afr-afr,22 +lre22_dev_tjltd,orm-orm,20 +lre22_dev_tkcqj,ara-aeb,12 +lre22_dev_tkpij,tir-tir,19 +lre22_dev_tkpwp,orm-orm,19 +lre22_dev_tkyuh,tso-tso,12 +lre22_dev_tlkrm,zul-zul,19 +lre22_dev_tlspo,zul-zul,18 +lre22_dev_tmdvx,zul-zul,17 +lre22_dev_tmynp,afr-afr,20 +lre22_dev_tntmu,xho-xho,22 +lre22_dev_tnwok,orm-orm,21 +lre22_dev_toccu,eng-iaf,16 +lre22_dev_tofur,tir-tir,14 +lre22_dev_tokhl,ven-ven,21 +lre22_dev_tonkq,zul-zul,15 +lre22_dev_topxu,zul-zul,14 +lre22_dev_touna,ara-arq,15 +lre22_dev_towvr,tso-tso,12 +lre22_dev_tpasn,tir-tir,15 +lre22_dev_tpmen,ara-ayl,10 +lre22_dev_tpuws,tir-tir,19 +lre22_dev_tqbqi,xho-xho,17 +lre22_dev_tqtfo,tso-tso,17 +lre22_dev_traqh,fra-ntf,21 +lre22_dev_trdfp,ara-ayl,15 +lre22_dev_trdml,xho-xho,23 +lre22_dev_trmpg,nbl-nbl,19 +lre22_dev_tsdyg,tso-tso,19 +lre22_dev_tsvmo,ara-ayl,11 +lre22_dev_ttcul,afr-afr,19 +lre22_dev_ttrfr,ara-arq,12 +lre22_dev_tuhrp,ven-ven,14 +lre22_dev_twaba,afr-afr,15 +lre22_dev_twcnd,tir-tir,13 +lre22_dev_twtog,ven-ven,15 +lre22_dev_twvne,tir-tir,19 +lre22_dev_txcqg,orm-orm,19 +lre22_dev_txjsy,eng-ens,18 +lre22_dev_txmpu,afr-afr,19 +lre22_dev_txqde,eng-iaf,16 +lre22_dev_tyaup,eng-ens,17 +lre22_dev_tyaym,afr-afr,17 +lre22_dev_tybrl,nbl-nbl,16 +lre22_dev_tyduc,eng-ens,17 +lre22_dev_tyhsa,fra-ntf,21 +lre22_dev_tyigo,ara-ayl,11 +lre22_dev_tykte,zul-zul,18 +lre22_dev_tymil,tir-tir,16 +lre22_dev_tyofb,ven-ven,20 +lre22_dev_tysph,fra-ntf,16 +lre22_dev_tzamn,ara-aeb,11 +lre22_dev_tzrpp,ven-ven,15 +lre22_dev_tzukm,ara-aeb,12 +lre22_dev_uabum,xho-xho,19 +lre22_dev_uankd,nbl-nbl,18 +lre22_dev_uazyk,ara-ayl,14 +lre22_dev_ubdfa,eng-iaf,15 +lre22_dev_ubugi,orm-orm,22 +lre22_dev_ucetp,ven-ven,21 +lre22_dev_ucsxt,eng-ens,12 +lre22_dev_uczke,zul-zul,14 +lre22_dev_udldh,ara-arq,11 +lre22_dev_uejdk,orm-orm,17 +lre22_dev_uekog,zul-zul,17 +lre22_dev_uemql,xho-xho,16 +lre22_dev_ueovt,eng-ens,14 +lre22_dev_uesao,zul-zul,19 +lre22_dev_ueyxm,ara-ayl,13 +lre22_dev_ufafi,tir-tir,17 +lre22_dev_ufaig,tso-tso,12 +lre22_dev_uffpc,ara-arq,14 +lre22_dev_ufrmg,ven-ven,20 +lre22_dev_ugieb,ara-aeb,12 +lre22_dev_ugoiy,ara-ayl,10 +lre22_dev_ugzkq,ara-aeb,12 +lre22_dev_uhdrj,xho-xho,18 +lre22_dev_uhjdn,ara-ayl,16 +lre22_dev_uhkcq,ara-ayl,11 +lre22_dev_uhrjo,ara-aeb,13 +lre22_dev_uhrow,afr-afr,16 +lre22_dev_uikqm,ara-arq,12 +lre22_dev_uitct,eng-ens,13 +lre22_dev_uitqu,ara-ayl,12 +lre22_dev_ujiby,eng-ens,18 +lre22_dev_ujmtl,orm-orm,22 +lre22_dev_ukdpu,ven-ven,17 +lre22_dev_ukfpb,xho-xho,19 +lre22_dev_ukklw,fra-ntf,22 +lre22_dev_ukwjy,xho-xho,17 +lre22_dev_uljbx,fra-ntf,20 +lre22_dev_uljgh,tir-tir,13 +lre22_dev_uljvo,fra-ntf,21 +lre22_dev_undfd,orm-orm,20 +lre22_dev_unmiu,ara-arq,14 +lre22_dev_updar,nbl-nbl,17 +lre22_dev_uprkv,eng-iaf,16 +lre22_dev_urkok,ara-ayl,11 +lre22_dev_urolj,orm-orm,22 +lre22_dev_uscpv,eng-ens,14 +lre22_dev_ushtk,fra-ntf,20 
+lre22_dev_usiey,ven-ven,19 +lre22_dev_usitw,ara-arq,14 +lre22_dev_utkxp,nbl-nbl,19 +lre22_dev_utnvo,tir-tir,16 +lre22_dev_utyjg,tso-tso,18 +lre22_dev_uuwaa,ara-arq,12 +lre22_dev_uuxla,eng-iaf,15 +lre22_dev_uuzuj,ara-arq,14 +lre22_dev_uvcxs,eng-ens,12 +lre22_dev_uveah,ven-ven,17 +lre22_dev_uvfqy,ara-arq,13 +lre22_dev_uvnhb,fra-ntf,20 +lre22_dev_uvqbm,afr-afr,19 +lre22_dev_uvsus,zul-zul,15 +lre22_dev_uvyev,fra-ntf,20 +lre22_dev_uwicd,tso-tso,12 +lre22_dev_uwnlz,zul-zul,18 +lre22_dev_uwwyj,afr-afr,20 +lre22_dev_uwyxc,eng-iaf,17 +lre22_dev_uxjzh,xho-xho,21 +lre22_dev_uxpyg,tso-tso,15 +lre22_dev_uxrxr,tso-tso,12 +lre22_dev_uyciz,eng-ens,14 +lre22_dev_uycza,xho-xho,17 +lre22_dev_uyvyb,eng-ens,17 +lre22_dev_uziar,zul-zul,15 +lre22_dev_uzlxd,fra-ntf,22 +lre22_dev_uznjr,tir-tir,13 +lre22_dev_vagda,ara-ayl,12 +lre22_dev_vanjm,ven-ven,18 +lre22_dev_vaqia,tir-tir,19 +lre22_dev_vasjz,ara-arq,11 +lre22_dev_vcexs,tir-tir,17 +lre22_dev_vchpm,fra-ntf,21 +lre22_dev_vctsa,nbl-nbl,19 +lre22_dev_vcxit,ven-ven,15 +lre22_dev_vcyqv,xho-xho,19 +lre22_dev_vdjlh,afr-afr,22 +lre22_dev_vdogx,ven-ven,15 +lre22_dev_veutb,eng-ens,16 +lre22_dev_vezrd,tso-tso,12 +lre22_dev_vfbfg,tso-tso,12 +lre22_dev_vffqd,orm-orm,21 +lre22_dev_vfhum,afr-afr,16 +lre22_dev_vfjtw,ara-arq,11 +lre22_dev_vfnjb,eng-ens,15 +lre22_dev_vgbbh,ara-arq,13 +lre22_dev_vgcao,eng-iaf,20 +lre22_dev_vgpnk,xho-xho,19 +lre22_dev_vityk,zul-zul,18 +lre22_dev_vjeuy,tir-tir,19 +lre22_dev_vjltt,zul-zul,17 +lre22_dev_vjqrm,tir-tir,13 +lre22_dev_vjvbs,tso-tso,18 +lre22_dev_vlcbq,tso-tso,16 +lre22_dev_vlnlb,tso-tso,13 +lre22_dev_vlscu,ara-ayl,15 +lre22_dev_vlwhz,fra-ntf,22 +lre22_dev_vlyeh,tso-tso,16 +lre22_dev_vmnps,zul-zul,14 +lre22_dev_vmqxk,tso-tso,18 +lre22_dev_vmrez,ven-ven,18 +lre22_dev_vmsnh,ara-aeb,11 +lre22_dev_vmuti,ara-aeb,14 +lre22_dev_vncre,afr-afr,22 +lre22_dev_vnkqv,afr-afr,15 +lre22_dev_vnmlt,zul-zul,18 +lre22_dev_vpkra,ara-ayl,11 +lre22_dev_vpoit,ara-arq,14 +lre22_dev_vpruu,orm-orm,23 +lre22_dev_vptiv,tir-tir,18 +lre22_dev_vqhcn,tso-tso,16 +lre22_dev_vqura,tir-tir,16 +lre22_dev_vrqfs,xho-xho,23 +lre22_dev_vrvtr,zul-zul,15 +lre22_dev_vrxvj,fra-ntf,17 +lre22_dev_vsbay,eng-iaf,19 +lre22_dev_vsbvi,fra-ntf,19 +lre22_dev_vslkb,eng-ens,12 +lre22_dev_vsrdg,tso-tso,12 +lre22_dev_vsrnz,zul-zul,14 +lre22_dev_vsryb,nbl-nbl,19 +lre22_dev_vtlab,zul-zul,19 +lre22_dev_vtrff,eng-iaf,17 +lre22_dev_vtztf,ara-aeb,11 +lre22_dev_vucth,eng-ens,14 +lre22_dev_vucug,orm-orm,21 +lre22_dev_vufuu,eng-ens,18 +lre22_dev_vujbs,zul-zul,19 +lre22_dev_vuufm,afr-afr,19 +lre22_dev_vvgdf,eng-ens,18 +lre22_dev_vvlcx,ara-aeb,12 +lre22_dev_vvvho,tir-tir,18 +lre22_dev_vwait,eng-iaf,14 +lre22_dev_vwdcw,ara-arq,14 +lre22_dev_vwyzq,ara-arq,14 +lre22_dev_vwzon,eng-ens,12 +lre22_dev_vxhoc,ara-aeb,11 +lre22_dev_vxkgz,ven-ven,18 +lre22_dev_vxlgl,tir-tir,18 +lre22_dev_vxsqt,eng-ens,15 +lre22_dev_vyqsd,nbl-nbl,17 +lre22_dev_vzcai,zul-zul,19 +lre22_dev_vzgoj,eng-iaf,14 +lre22_dev_vzlon,zul-zul,16 +lre22_dev_vznrg,nbl-nbl,16 +lre22_dev_vzqme,xho-xho,19 +lre22_dev_wabqx,ven-ven,18 +lre22_dev_wafdh,fra-ntf,21 +lre22_dev_wagmt,eng-iaf,18 +lre22_dev_waocz,ven-ven,20 +lre22_dev_wavrh,zul-zul,16 +lre22_dev_wawqg,ara-ayl,13 +lre22_dev_waznj,nbl-nbl,22 +lre22_dev_wbepu,fra-ntf,19 +lre22_dev_wbygw,eng-ens,16 +lre22_dev_wccgz,tso-tso,17 +lre22_dev_wcpwx,tir-tir,18 +lre22_dev_wczkn,eng-iaf,17 +lre22_dev_wdfmt,tir-tir,17 +lre22_dev_wdgbh,ara-arq,12 +lre22_dev_wdind,tso-tso,19 +lre22_dev_wdkit,nbl-nbl,16 +lre22_dev_wdmpt,eng-ens,17 +lre22_dev_wdpya,nbl-nbl,16 +lre22_dev_wdrxo,orm-orm,21 
+lre22_dev_wdyiy,ara-ayl,13 +lre22_dev_weccy,afr-afr,15 +lre22_dev_wfmco,ara-arq,14 +lre22_dev_wfnon,nbl-nbl,17 +lre22_dev_wgdui,eng-iaf,14 +lre22_dev_wgkmr,eng-iaf,17 +lre22_dev_wgnex,tir-tir,19 +lre22_dev_wgucy,eng-iaf,18 +lre22_dev_wgwdn,eng-iaf,17 +lre22_dev_whqhx,eng-iaf,15 +lre22_dev_whxwv,eng-ens,14 +lre22_dev_witnq,fra-ntf,17 +lre22_dev_wixzu,tso-tso,16 +lre22_dev_wjhbw,eng-iaf,16 +lre22_dev_wjist,orm-orm,16 +lre22_dev_wjnhh,zul-zul,19 +lre22_dev_wjnyo,ven-ven,20 +lre22_dev_wjtnm,orm-orm,19 +lre22_dev_wjzhz,ara-aeb,13 +lre22_dev_wkacx,eng-iaf,15 +lre22_dev_wkqey,fra-ntf,16 +lre22_dev_wldli,zul-zul,14 +lre22_dev_wlnst,nbl-nbl,16 +lre22_dev_wltvq,zul-zul,17 +lre22_dev_wlwhq,orm-orm,19 +lre22_dev_wmdan,xho-xho,21 +lre22_dev_wmfce,nbl-nbl,20 +lre22_dev_wmigl,ven-ven,20 +lre22_dev_wmwmc,eng-iaf,19 +lre22_dev_wmypk,xho-xho,19 +lre22_dev_wmzpv,eng-ens,17 +lre22_dev_wnjpz,ven-ven,19 +lre22_dev_wnmkt,orm-orm,23 +lre22_dev_wnpep,nbl-nbl,16 +lre22_dev_wnqhz,nbl-nbl,16 +lre22_dev_wnxpz,ven-ven,15 +lre22_dev_wnxrw,ven-ven,18 +lre22_dev_woawg,ven-ven,18 +lre22_dev_wobzv,eng-ens,14 +lre22_dev_wocbv,tso-tso,18 +lre22_dev_woerb,fra-ntf,21 +lre22_dev_wojrt,orm-orm,19 +lre22_dev_wosus,tir-tir,17 +lre22_dev_wozuc,xho-xho,19 +lre22_dev_wqcyu,tso-tso,15 +lre22_dev_wqfuv,eng-ens,17 +lre22_dev_wqhag,zul-zul,19 +lre22_dev_wqmsd,tir-tir,13 +lre22_dev_wqthl,ara-aeb,12 +lre22_dev_wqtvm,eng-ens,15 +lre22_dev_wrmnw,zul-zul,18 +lre22_dev_wrtec,zul-zul,17 +lre22_dev_wrvls,zul-zul,14 +lre22_dev_wscfs,nbl-nbl,16 +lre22_dev_wssqw,eng-ens,15 +lre22_dev_wtbdf,tir-tir,14 +lre22_dev_wtcpe,ara-aeb,11 +lre22_dev_wthrk,orm-orm,18 +lre22_dev_wtofd,eng-iaf,20 +lre22_dev_wtuol,tso-tso,18 +lre22_dev_wuqez,ara-aeb,11 +lre22_dev_wuquc,tir-tir,18 +lre22_dev_wvlde,tso-tso,13 +lre22_dev_wwbmg,ara-aeb,11 +lre22_dev_wwduf,fra-ntf,18 +lre22_dev_wwvuw,ara-arq,13 +lre22_dev_wxaev,orm-orm,17 +lre22_dev_wycsj,ven-ven,18 +lre22_dev_wypwj,ara-ayl,10 +lre22_dev_wytpq,fra-ntf,17 +lre22_dev_wzhqk,xho-xho,22 +lre22_dev_wzpmq,eng-ens,12 +lre22_dev_wztdj,zul-zul,19 +lre22_dev_wzxgv,ven-ven,18 +lre22_dev_xacjk,fra-ntf,18 +lre22_dev_xaevp,tir-tir,14 +lre22_dev_xaldr,eng-iaf,14 +lre22_dev_xapdy,ara-aeb,12 +lre22_dev_xaurw,nbl-nbl,16 +lre22_dev_xawdd,tir-tir,20 +lre22_dev_xbcpb,ara-arq,12 +lre22_dev_xbfrs,ven-ven,17 +lre22_dev_xbqsr,nbl-nbl,22 +lre22_dev_xbvcc,nbl-nbl,17 +lre22_dev_xbvqw,orm-orm,23 +lre22_dev_xcame,xho-xho,16 +lre22_dev_xcrnp,ara-aeb,13 +lre22_dev_xcswu,ven-ven,18 +lre22_dev_xcuok,orm-orm,21 +lre22_dev_xcvkj,tso-tso,16 +lre22_dev_xdtdp,fra-ntf,17 +lre22_dev_xdyea,ara-ayl,10 +lre22_dev_xerqi,fra-ntf,17 +lre22_dev_xetdb,eng-ens,14 +lre22_dev_xfecy,nbl-nbl,16 +lre22_dev_xfgcu,eng-iaf,19 +lre22_dev_xfing,tir-tir,20 +lre22_dev_xgaig,ara-aeb,15 +lre22_dev_xgoyq,eng-ens,18 +lre22_dev_xhdtx,eng-iaf,14 +lre22_dev_xhvkx,orm-orm,19 +lre22_dev_xiblr,tir-tir,17 +lre22_dev_xifty,ara-aeb,12 +lre22_dev_xigtx,ara-arq,14 +lre22_dev_xijus,tso-tso,14 +lre22_dev_xipox,xho-xho,20 +lre22_dev_xittq,ara-aeb,13 +lre22_dev_xjpwq,ara-ayl,15 +lre22_dev_xjrla,afr-afr,20 +lre22_dev_xkdof,ara-ayl,13 +lre22_dev_xkiba,eng-ens,18 +lre22_dev_xlcxh,fra-ntf,18 +lre22_dev_xlsxb,tso-tso,16 +lre22_dev_xmhpj,ven-ven,20 +lre22_dev_xnqct,ara-arq,11 +lre22_dev_xoayi,eng-ens,13 +lre22_dev_xohps,ara-arq,11 +lre22_dev_xokpn,zul-zul,18 +lre22_dev_xonym,eng-ens,14 +lre22_dev_xozod,afr-afr,14 +lre22_dev_xpenp,ara-arq,11 +lre22_dev_xpnti,ara-aeb,11 +lre22_dev_xpqyr,orm-orm,22 +lre22_dev_xpswt,orm-orm,23 +lre22_dev_xpumn,ven-ven,14 +lre22_dev_xpvcf,orm-orm,20 
+lre22_dev_xqhoa,ara-ayl,13 +lre22_dev_xqnpt,orm-orm,22 +lre22_dev_xqooi,xho-xho,20 +lre22_dev_xqupu,fra-ntf,21 +lre22_dev_xresy,eng-iaf,17 +lre22_dev_xrouj,ara-ayl,16 +lre22_dev_xsnxu,ara-aeb,12 +lre22_dev_xtaof,ara-ayl,13 +lre22_dev_xtbxk,orm-orm,20 +lre22_dev_xtgak,nbl-nbl,20 +lre22_dev_xuauh,ara-aeb,13 +lre22_dev_xubei,eng-iaf,17 +lre22_dev_xubol,ara-aeb,11 +lre22_dev_xuieb,orm-orm,19 +lre22_dev_xunxs,ara-ayl,14 +lre22_dev_xutjo,nbl-nbl,20 +lre22_dev_xvbos,afr-afr,22 +lre22_dev_xvcfn,eng-ens,16 +lre22_dev_xvgqo,eng-ens,12 +lre22_dev_xwemk,zul-zul,18 +lre22_dev_xwsyq,ara-ayl,14 +lre22_dev_xxdbg,tso-tso,18 +lre22_dev_xyoua,fra-ntf,22 +lre22_dev_xzoej,ara-aeb,13 +lre22_dev_xzrdl,ara-arq,13 +lre22_dev_xztsz,tso-tso,16 +lre22_dev_xzxbd,zul-zul,15 +lre22_dev_yagvv,tso-tso,13 +lre22_dev_ybqju,tso-tso,13 +lre22_dev_ybrji,ara-arq,11 +lre22_dev_ybsmy,ven-ven,21 +lre22_dev_ycbaf,ara-aeb,14 +lre22_dev_ychsm,ven-ven,14 +lre22_dev_ycrlj,xho-xho,17 +lre22_dev_ycuhc,orm-orm,21 +lre22_dev_ydhqc,ara-arq,13 +lre22_dev_ydmnb,nbl-nbl,17 +lre22_dev_yduem,xho-xho,21 +lre22_dev_yemzu,ara-aeb,11 +lre22_dev_yeoyx,eng-ens,18 +lre22_dev_yersp,ara-ayl,13 +lre22_dev_yeshv,eng-iaf,17 +lre22_dev_yexec,ven-ven,20 +lre22_dev_yeyna,ara-ayl,14 +lre22_dev_yfxmd,ara-arq,14 +lre22_dev_yfzah,ara-arq,14 +lre22_dev_ygkvo,ara-arq,11 +lre22_dev_yhgvr,ara-arq,15 +lre22_dev_yhwin,ara-arq,12 +lre22_dev_yirig,ara-ayl,16 +lre22_dev_yixgu,xho-xho,16 +lre22_dev_yjbfl,xho-xho,19 +lre22_dev_yjodc,eng-ens,14 +lre22_dev_yjoht,ara-aeb,12 +lre22_dev_yjqkb,ara-arq,14 +lre22_dev_yjrkq,ara-arq,15 +lre22_dev_yjrng,afr-afr,16 +lre22_dev_ykpzq,afr-afr,21 +lre22_dev_yktop,eng-iaf,20 +lre22_dev_ylfah,zul-zul,15 +lre22_dev_ylgex,tso-tso,14 +lre22_dev_ylkds,nbl-nbl,17 +lre22_dev_ylvyc,xho-xho,20 +lre22_dev_ylzic,eng-iaf,20 +lre22_dev_ymoon,afr-afr,17 +lre22_dev_yncqr,ara-arq,13 +lre22_dev_ynjtn,ven-ven,18 +lre22_dev_ynmzy,tso-tso,16 +lre22_dev_ynozi,fra-ntf,21 +lre22_dev_yntec,orm-orm,19 +lre22_dev_ynurl,tso-tso,14 +lre22_dev_ypdtt,ara-aeb,11 +lre22_dev_yprom,tso-tso,13 +lre22_dev_yptsk,xho-xho,23 +lre22_dev_ypyft,eng-iaf,14 +lre22_dev_yqhwt,orm-orm,23 +lre22_dev_yqtxe,eng-iaf,19 +lre22_dev_yquja,ara-ayl,10 +lre22_dev_yqxhl,eng-ens,14 +lre22_dev_yqyby,nbl-nbl,18 +lre22_dev_yqzua,fra-ntf,16 +lre22_dev_yrfxo,ven-ven,21 +lre22_dev_yrgzf,ara-aeb,13 +lre22_dev_yruqe,tso-tso,17 +lre22_dev_yrwgb,zul-zul,18 +lre22_dev_yrxsi,orm-orm,21 +lre22_dev_ysdkl,tso-tso,15 +lre22_dev_ytgav,xho-xho,16 +lre22_dev_ytoet,ara-arq,14 +lre22_dev_yuabg,eng-ens,16 +lre22_dev_yundm,tso-tso,14 +lre22_dev_yuvux,ara-ayl,13 +lre22_dev_yvdcv,fra-ntf,21 +lre22_dev_yvoli,orm-orm,23 +lre22_dev_yweox,orm-orm,21 +lre22_dev_ywgoc,eng-iaf,19 +lre22_dev_ywoyx,ven-ven,18 +lre22_dev_ywxql,zul-zul,19 +lre22_dev_yxkyl,eng-iaf,15 +lre22_dev_yxtmn,ara-aeb,14 +lre22_dev_yycsn,ara-ayl,12 +lre22_dev_yyswd,eng-iaf,16 +lre22_dev_yyugr,ven-ven,21 +lre22_dev_yzitu,orm-orm,20 +lre22_dev_yzwmi,eng-ens,16 +lre22_dev_yzzww,zul-zul,17 +lre22_dev_zabub,ara-ayl,16 +lre22_dev_zabuv,eng-iaf,14 +lre22_dev_zacuc,zul-zul,19 +lre22_dev_zavru,zul-zul,19 +lre22_dev_zbfgy,ara-arq,12 +lre22_dev_zbjez,nbl-nbl,17 +lre22_dev_zbtpo,ven-ven,18 +lre22_dev_zbzip,tso-tso,19 +lre22_dev_zcevz,nbl-nbl,16 +lre22_dev_zcnsv,afr-afr,21 +lre22_dev_zcqkl,eng-iaf,20 +lre22_dev_zczer,ven-ven,14 +lre22_dev_zdcdt,nbl-nbl,18 +lre22_dev_zddua,xho-xho,19 +lre22_dev_zdvsh,ara-arq,14 +lre22_dev_zdwxx,ara-ayl,14 +lre22_dev_zdyxi,tir-tir,14 +lre22_dev_zetju,eng-iaf,17 +lre22_dev_zfsek,ara-arq,11 +lre22_dev_zfvfa,eng-ens,18 
+lre22_dev_zggiu,zul-zul,19 +lre22_dev_zgndz,tso-tso,14 +lre22_dev_zgxth,eng-ens,16 +lre22_dev_zhlxa,ara-ayl,14 +lre22_dev_zhnsb,ara-ayl,15 +lre22_dev_zhsmo,ara-aeb,13 +lre22_dev_zhvbf,xho-xho,18 +lre22_dev_zhzrh,eng-iaf,15 +lre22_dev_ziigd,orm-orm,21 +lre22_dev_zilud,tir-tir,19 +lre22_dev_zjivp,zul-zul,19 +lre22_dev_zjleg,zul-zul,19 +lre22_dev_zjquq,orm-orm,16 +lre22_dev_zkgjo,nbl-nbl,22 +lre22_dev_zkhes,fra-ntf,16 +lre22_dev_zkioq,ara-aeb,12 +lre22_dev_zkwaw,afr-afr,21 +lre22_dev_zlapc,ara-ayl,13 +lre22_dev_zlntm,zul-zul,19 +lre22_dev_zmmyn,xho-xho,23 +lre22_dev_zmxld,ven-ven,17 +lre22_dev_znhcf,ven-ven,21 +lre22_dev_znwsk,afr-afr,22 +lre22_dev_znxvg,eng-ens,18 +lre22_dev_znycz,ara-aeb,13 +lre22_dev_zoayx,zul-zul,18 +lre22_dev_zogte,nbl-nbl,16 +lre22_dev_zoldl,ara-aeb,12 +lre22_dev_zoqzl,eng-ens,17 +lre22_dev_zorfv,eng-iaf,16 +lre22_dev_zoseh,ara-arq,12 +lre22_dev_zpotb,xho-xho,16 +lre22_dev_zptbg,tir-tir,14 +lre22_dev_zqjzi,ara-aeb,11 +lre22_dev_zqljj,ara-aeb,14 +lre22_dev_zqlri,orm-orm,18 +lre22_dev_zqoif,zul-zul,19 +lre22_dev_zqorv,ara-aeb,12 +lre22_dev_zqwgs,fra-ntf,18 +lre22_dev_zrhbt,tir-tir,19 +lre22_dev_zrqar,ara-aeb,13 +lre22_dev_zrqec,eng-iaf,17 +lre22_dev_ztdrx,fra-ntf,15 +lre22_dev_ztdwr,orm-orm,17 +lre22_dev_zthiv,ara-arq,15 +lre22_dev_ztknh,xho-xho,18 +lre22_dev_ztlcq,ara-aeb,13 +lre22_dev_ztufj,fra-ntf,19 +lre22_dev_zubjl,fra-ntf,20 +lre22_dev_zunuw,tso-tso,17 +lre22_dev_zutul,tir-tir,13 +lre22_dev_zutvv,eng-ens,12 +lre22_dev_zuugc,eng-iaf,17 +lre22_dev_zuvqx,eng-iaf,14 +lre22_dev_zvthu,orm-orm,20 +lre22_dev_zvvov,ara-aeb,11 +lre22_dev_zvyuh,ara-arq,14 +lre22_dev_zwfqq,eng-iaf,17 +lre22_dev_zwosr,xho-xho,16 +lre22_dev_zwvhw,tso-tso,12 +lre22_dev_zxihz,ven-ven,14 +lre22_dev_zydma,eng-ens,12 +lre22_dev_zyqlz,zul-zul,19 +lre22_dev_zyyie,orm-orm,23 +lre22_dev_zyywo,eng-iaf,14 +lre22_dev_zzyze,ara-ayl,12
diff --git a/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_1/test_segments.csv b/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_1/test_segments.csv
new file mode 100644
index 00000000..4d50b6a5
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_1/test_segments.csv
@@ -0,0 +1,2088 @@
+id,class_id,subclass_idx
+lre22_dev_aayck,ara-aeb,12 +lre22_dev_aayto,eng-iaf,14 +lre22_dev_abaha,zul-zul,17 +lre22_dev_abetm,fra-ntf,15 +lre22_dev_abnwz,zul-zul,19 +lre22_dev_abvjt,zul-zul,19 +lre22_dev_abwgm,ara-ayl,13 +lre22_dev_acepd,eng-iaf,19 +lre22_dev_acspt,eng-ens,12 +lre22_dev_aczdh,eng-ens,13 +lre22_dev_adkkm,tso-tso,19 +lre22_dev_adpus,tso-tso,13 +lre22_dev_adwju,ara-aeb,14 +lre22_dev_afnfn,afr-afr,20 +lre22_dev_afohq,ara-aeb,13 +lre22_dev_agnnp,afr-afr,17 +lre22_dev_agquw,fra-ntf,20 +lre22_dev_ahoow,ara-ayl,15 +lre22_dev_ahqxq,fra-ntf,22 +lre22_dev_aieqr,eng-iaf,17 +lre22_dev_ainix,eng-iaf,16 +lre22_dev_aiojl,fra-ntf,18 +lre22_dev_aiypg,nbl-nbl,17 +lre22_dev_ajcpi,orm-orm,22 +lre22_dev_ajeqv,ara-aeb,11 +lre22_dev_ajlqy,xho-xho,16 +lre22_dev_ajlyw,orm-orm,21 +lre22_dev_ajmrs,ara-aeb,11 +lre22_dev_ajzjc,eng-iaf,16 +lre22_dev_ajzyq,ara-ayl,14 +lre22_dev_akmfp,orm-orm,19 +lre22_dev_aleeu,ara-arq,14 +lre22_dev_aliba,ara-aeb,15 +lre22_dev_alkwi,eng-iaf,14 +lre22_dev_aluwk,nbl-nbl,16 +lre22_dev_alvdl,ara-arq,14 +lre22_dev_amrca,ara-aeb,11 +lre22_dev_aoanh,ara-ayl,15 +lre22_dev_aoeql,eng-ens,16 +lre22_dev_apfpk,eng-iaf,14 +lre22_dev_apufs,tir-tir,17 +lre22_dev_apvko,orm-orm,20 +lre22_dev_arefe,orm-orm,23 +lre22_dev_arvyp,ara-ayl,11 +lre22_dev_arwsc,fra-ntf,20 +lre22_dev_asqwa,ara-aeb,14 +lre22_dev_asrng,fra-ntf,18 +lre22_dev_aswjo,afr-afr,18 +lre22_dev_aulzk,ven-ven,21
+lre22_dev_aupcr,zul-zul,18 +lre22_dev_auqcy,eng-ens,18 +lre22_dev_auxdy,nbl-nbl,16 +lre22_dev_auycg,ara-ayl,11 +lre22_dev_aviiv,tso-tso,14 +lre22_dev_avrwo,tso-tso,19 +lre22_dev_avwim,ara-arq,13 +lre22_dev_avzdv,zul-zul,18 +lre22_dev_awtna,ara-arq,13 +lre22_dev_awxbj,orm-orm,23 +lre22_dev_axejc,fra-ntf,17 +lre22_dev_axtso,eng-ens,16 +lre22_dev_axwoo,ara-aeb,15 +lre22_dev_axyma,ara-arq,15 +lre22_dev_aycai,ven-ven,17 +lre22_dev_ayfjz,orm-orm,20 +lre22_dev_aylrz,eng-iaf,16 +lre22_dev_aynwz,tso-tso,18 +lre22_dev_aypyt,ara-aeb,11 +lre22_dev_ayszn,zul-zul,18 +lre22_dev_ayvge,ara-aeb,11 +lre22_dev_ayvmo,afr-afr,23 +lre22_dev_ayzdz,xho-xho,20 +lre22_dev_azbmt,xho-xho,19 +lre22_dev_azjsr,tir-tir,19 +lre22_dev_azkdh,nbl-nbl,20 +lre22_dev_azwrd,fra-ntf,15 +lre22_dev_badwe,ara-aeb,13 +lre22_dev_baiaf,zul-zul,17 +lre22_dev_baiwb,ara-aeb,13 +lre22_dev_baxuo,zul-zul,18 +lre22_dev_bbbtf,eng-ens,18 +lre22_dev_bbdws,ara-ayl,12 +lre22_dev_bbitq,eng-ens,16 +lre22_dev_bbnvu,ara-arq,13 +lre22_dev_bbunq,eng-iaf,14 +lre22_dev_bcinm,ara-aeb,14 +lre22_dev_bcrhs,zul-zul,17 +lre22_dev_bcwpu,ara-aeb,13 +lre22_dev_bcxdq,fra-ntf,21 +lre22_dev_bdgbr,ara-aeb,12 +lre22_dev_bdgrw,orm-orm,17 +lre22_dev_bdiml,ara-aeb,11 +lre22_dev_bdyue,xho-xho,21 +lre22_dev_bdzsj,tir-tir,13 +lre22_dev_beanp,tso-tso,12 +lre22_dev_beigo,ara-aeb,14 +lre22_dev_belhi,orm-orm,23 +lre22_dev_bfoej,ven-ven,20 +lre22_dev_bfznf,ara-ayl,11 +lre22_dev_bgeiq,ven-ven,15 +lre22_dev_bgeyp,ara-aeb,11 +lre22_dev_bgomt,afr-afr,14 +lre22_dev_bgrfd,nbl-nbl,19 +lre22_dev_bgwlu,tir-tir,17 +lre22_dev_bifkp,nbl-nbl,18 +lre22_dev_bipvh,nbl-nbl,17 +lre22_dev_biuyu,eng-ens,12 +lre22_dev_bixnf,ara-ayl,11 +lre22_dev_bjhdf,tso-tso,17 +lre22_dev_bjsmm,ara-ayl,10 +lre22_dev_bkhqg,eng-ens,17 +lre22_dev_bkpah,ven-ven,14 +lre22_dev_blaco,afr-afr,17 +lre22_dev_bleum,xho-xho,18 +lre22_dev_bnhvt,nbl-nbl,16 +lre22_dev_bowyn,ara-arq,14 +lre22_dev_bpeqb,xho-xho,21 +lre22_dev_bpgqs,tir-tir,13 +lre22_dev_bpzpv,afr-afr,16 +lre22_dev_bqenu,eng-ens,12 +lre22_dev_bqfxw,zul-zul,14 +lre22_dev_bqowg,tir-tir,19 +lre22_dev_bqxyq,tir-tir,19 +lre22_dev_brjud,xho-xho,21 +lre22_dev_bruwl,xho-xho,16 +lre22_dev_brzld,fra-ntf,20 +lre22_dev_bsgqz,eng-ens,13 +lre22_dev_bsocl,eng-ens,12 +lre22_dev_bszou,ara-arq,13 +lre22_dev_btapz,zul-zul,15 +lre22_dev_btjlk,ara-aeb,14 +lre22_dev_btkry,xho-xho,19 +lre22_dev_btyeu,ara-ayl,15 +lre22_dev_bvnuu,fra-ntf,19 +lre22_dev_bvqag,eng-iaf,20 +lre22_dev_bvvho,eng-ens,16 +lre22_dev_bvwaj,tir-tir,14 +lre22_dev_bvymi,eng-ens,15 +lre22_dev_bwgmj,eng-iaf,20 +lre22_dev_bwqpz,ara-arq,14 +lre22_dev_bwyrh,ara-aeb,12 +lre22_dev_bxkrj,ven-ven,18 +lre22_dev_bxkti,afr-afr,20 +lre22_dev_bxzms,nbl-nbl,17 +lre22_dev_bygrw,tso-tso,18 +lre22_dev_byjqr,ven-ven,18 +lre22_dev_bylkl,eng-iaf,16 +lre22_dev_bzmkn,fra-ntf,22 +lre22_dev_bzntz,ara-arq,13 +lre22_dev_bzwkf,eng-iaf,19 +lre22_dev_caijh,ven-ven,18 +lre22_dev_canou,tir-tir,19 +lre22_dev_caqxh,afr-afr,20 +lre22_dev_cayuc,eng-ens,12 +lre22_dev_cbruy,xho-xho,23 +lre22_dev_cbyyw,ara-arq,14 +lre22_dev_cbzbe,afr-afr,22 +lre22_dev_cclfh,ara-arq,15 +lre22_dev_ccovd,ara-arq,11 +lre22_dev_ccpns,eng-ens,17 +lre22_dev_ccsjt,eng-iaf,16 +lre22_dev_ccsql,fra-ntf,21 +lre22_dev_ccugm,eng-ens,18 +lre22_dev_ccyfn,afr-afr,23 +lre22_dev_cdmgw,tir-tir,16 +lre22_dev_cdshg,eng-iaf,17 +lre22_dev_ceccy,orm-orm,20 +lre22_dev_cecwt,fra-ntf,22 +lre22_dev_cegvk,ara-arq,11 +lre22_dev_cferi,zul-zul,15 +lre22_dev_cfojx,ara-arq,11 +lre22_dev_cfzoe,tir-tir,20 +lre22_dev_cgfna,zul-zul,18 +lre22_dev_cggzh,ara-ayl,13 +lre22_dev_cgims,tir-tir,20 
+lre22_dev_cgixe,tir-tir,19 +lre22_dev_cgjov,zul-zul,14 +lre22_dev_chhio,ara-aeb,14 +lre22_dev_chnvd,tir-tir,13 +lre22_dev_chpww,nbl-nbl,21 +lre22_dev_churq,ara-ayl,13 +lre22_dev_cifqp,zul-zul,17 +lre22_dev_cijnx,xho-xho,22 +lre22_dev_ciozp,nbl-nbl,16 +lre22_dev_citpi,ara-aeb,12 +lre22_dev_cjrav,tir-tir,15 +lre22_dev_cksrw,ara-aeb,14 +lre22_dev_cktce,tir-tir,17 +lre22_dev_ckzhf,nbl-nbl,20 +lre22_dev_cleyn,ara-aeb,11 +lre22_dev_clhmt,fra-ntf,19 +lre22_dev_clrjd,orm-orm,21 +lre22_dev_clssx,eng-iaf,14 +lre22_dev_cluxm,ara-ayl,13 +lre22_dev_clzwe,ara-aeb,14 +lre22_dev_cminq,ara-aeb,11 +lre22_dev_cmmap,afr-afr,23 +lre22_dev_cmssr,orm-orm,20 +lre22_dev_cmufu,tso-tso,16 +lre22_dev_cnapz,orm-orm,19 +lre22_dev_cndba,tso-tso,12 +lre22_dev_cnkjh,tso-tso,15 +lre22_dev_cnvfe,orm-orm,18 +lre22_dev_cobbz,ara-arq,12 +lre22_dev_coppu,nbl-nbl,21 +lre22_dev_coqoj,eng-ens,17 +lre22_dev_cotun,ven-ven,16 +lre22_dev_cowrt,xho-xho,19 +lre22_dev_cppma,afr-afr,20 +lre22_dev_cpqkz,ara-arq,14 +lre22_dev_cpraw,afr-afr,17 +lre22_dev_cpsrb,fra-ntf,20 +lre22_dev_cpuax,zul-zul,16 +lre22_dev_cpudb,nbl-nbl,16 +lre22_dev_cqqds,afr-afr,22 +lre22_dev_cquib,ven-ven,21 +lre22_dev_cqwxe,nbl-nbl,16 +lre22_dev_cqyad,eng-iaf,15 +lre22_dev_crkut,eng-ens,17 +lre22_dev_crozj,fra-ntf,17 +lre22_dev_crrro,orm-orm,16 +lre22_dev_csavn,ara-aeb,15 +lre22_dev_cschy,afr-afr,16 +lre22_dev_csegr,tso-tso,14 +lre22_dev_csgvq,fra-ntf,17 +lre22_dev_csltj,ara-aeb,14 +lre22_dev_csmtr,ara-ayl,14 +lre22_dev_csqxl,ven-ven,20 +lre22_dev_ctjqw,nbl-nbl,16 +lre22_dev_ctxxt,nbl-nbl,17 +lre22_dev_cuaoy,ara-aeb,13 +lre22_dev_cudpj,ara-arq,13 +lre22_dev_cuhdf,afr-afr,21 +lre22_dev_cuoju,ven-ven,21 +lre22_dev_cupti,nbl-nbl,21 +lre22_dev_cusej,ara-aeb,14 +lre22_dev_cvfle,tir-tir,14 +lre22_dev_cvnqu,eng-ens,14 +lre22_dev_cvvjc,zul-zul,18 +lre22_dev_cvwht,fra-ntf,18 +lre22_dev_cvwtu,fra-ntf,21 +lre22_dev_cwlvk,tso-tso,16 +lre22_dev_cwnky,xho-xho,17 +lre22_dev_cxdlr,afr-afr,14 +lre22_dev_cxfii,ara-arq,13 +lre22_dev_cxpzt,zul-zul,16 +lre22_dev_cxqri,fra-ntf,21 +lre22_dev_cyaug,xho-xho,22 +lre22_dev_czdbd,fra-ntf,15 +lre22_dev_czvoy,ven-ven,16 +lre22_dev_czzrm,afr-afr,17 +lre22_dev_dahzr,ven-ven,17 +lre22_dev_dapny,ven-ven,17 +lre22_dev_dapug,nbl-nbl,19 +lre22_dev_dcbnz,xho-xho,16 +lre22_dev_dciaf,nbl-nbl,22 +lre22_dev_dcljn,afr-afr,19 +lre22_dev_dcmrn,afr-afr,20 +lre22_dev_dcobq,xho-xho,16 +lre22_dev_dcohp,tir-tir,16 +lre22_dev_dcsep,tso-tso,12 +lre22_dev_dctlw,ara-arq,12 +lre22_dev_dctvv,ara-arq,12 +lre22_dev_dcyoy,eng-iaf,17 +lre22_dev_ddgeb,xho-xho,23 +lre22_dev_ddsab,eng-ens,18 +lre22_dev_ddtpk,eng-ens,18 +lre22_dev_debjr,xho-xho,16 +lre22_dev_defkv,eng-ens,15 +lre22_dev_dejub,ara-arq,11 +lre22_dev_delok,eng-ens,14 +lre22_dev_dezlg,nbl-nbl,17 +lre22_dev_dffbj,fra-ntf,21 +lre22_dev_dfkox,xho-xho,19 +lre22_dev_dfpcn,ara-ayl,13 +lre22_dev_dfqgl,afr-afr,18 +lre22_dev_dfras,eng-iaf,19 +lre22_dev_dftpm,eng-iaf,20 +lre22_dev_dfvta,tso-tso,17 +lre22_dev_dgarp,eng-ens,13 +lre22_dev_dgntq,zul-zul,17 +lre22_dev_dgssb,tir-tir,19 +lre22_dev_dgvtc,xho-xho,23 +lre22_dev_dhdvp,ara-ayl,10 +lre22_dev_dhmbl,fra-ntf,22 +lre22_dev_diiry,orm-orm,16 +lre22_dev_disrs,afr-afr,16 +lre22_dev_ditsk,xho-xho,21 +lre22_dev_djbbz,ara-arq,14 +lre22_dev_djevu,tso-tso,16 +lre22_dev_djlaf,tir-tir,20 +lre22_dev_djoim,zul-zul,15 +lre22_dev_djvvp,zul-zul,17 +lre22_dev_djwyo,ven-ven,18 +lre22_dev_dkbfm,ara-ayl,12 +lre22_dev_dkpcy,ara-aeb,12 +lre22_dev_dlxzj,orm-orm,19 +lre22_dev_dmnjo,ven-ven,14 +lre22_dev_dmtsm,zul-zul,16 +lre22_dev_dnaql,orm-orm,23 +lre22_dev_dnkpf,ara-aeb,15 
+lre22_dev_dnscr,tso-tso,12 +lre22_dev_dnygt,eng-ens,15 +lre22_dev_dobre,xho-xho,19 +lre22_dev_dohlp,xho-xho,23 +lre22_dev_doioo,orm-orm,19 +lre22_dev_donaq,ara-aeb,13 +lre22_dev_dooht,ara-arq,11 +lre22_dev_dpmbt,zul-zul,14 +lre22_dev_dptyy,xho-xho,17 +lre22_dev_dqmud,eng-iaf,15 +lre22_dev_dqmxb,xho-xho,20 +lre22_dev_dqopt,eng-ens,14 +lre22_dev_dqpgr,ara-aeb,14 +lre22_dev_drkux,eng-ens,14 +lre22_dev_dsfha,ven-ven,18 +lre22_dev_dsftc,tso-tso,16 +lre22_dev_dskaq,ven-ven,15 +lre22_dev_dtdmp,zul-zul,18 +lre22_dev_dtdux,afr-afr,14 +lre22_dev_dtyki,ara-arq,11 +lre22_dev_durlr,orm-orm,18 +lre22_dev_dutdz,tso-tso,12 +lre22_dev_dvbol,ara-ayl,15 +lre22_dev_dwesk,nbl-nbl,22 +lre22_dev_dwtjw,ven-ven,14 +lre22_dev_dxckb,tso-tso,12 +lre22_dev_dxizq,eng-iaf,14 +lre22_dev_dxtnq,fra-ntf,18 +lre22_dev_dxvib,zul-zul,14 +lre22_dev_dyago,eng-iaf,16 +lre22_dev_dyipl,eng-iaf,18 +lre22_dev_dyqlo,ara-arq,13 +lre22_dev_dyvml,eng-iaf,15 +lre22_dev_dzkui,tso-tso,12 +lre22_dev_dzqta,ven-ven,20 +lre22_dev_dzxio,eng-ens,18 +lre22_dev_eachn,tir-tir,16 +lre22_dev_eapvu,eng-iaf,20 +lre22_dev_ebfdv,ara-ayl,10 +lre22_dev_ebgbd,eng-ens,17 +lre22_dev_eblhy,eng-iaf,20 +lre22_dev_ebtrq,ara-aeb,13 +lre22_dev_ebymv,tir-tir,14 +lre22_dev_ebzhg,nbl-nbl,21 +lre22_dev_ecbwo,ven-ven,21 +lre22_dev_ecllm,fra-ntf,21 +lre22_dev_eclpf,ven-ven,16 +lre22_dev_ecmhd,ara-aeb,14 +lre22_dev_ecnqi,eng-ens,14 +lre22_dev_ecpdc,ara-ayl,10 +lre22_dev_ecslx,afr-afr,22 +lre22_dev_ecuyo,xho-xho,23 +lre22_dev_edgur,tso-tso,16 +lre22_dev_edjtb,nbl-nbl,22 +lre22_dev_edsls,tso-tso,16 +lre22_dev_edssc,orm-orm,23 +lre22_dev_edvab,zul-zul,19 +lre22_dev_eehzu,zul-zul,18 +lre22_dev_eekci,afr-afr,15 +lre22_dev_eekcw,zul-zul,17 +lre22_dev_efihg,nbl-nbl,16 +lre22_dev_efsxw,tso-tso,16 +lre22_dev_efxjv,ara-aeb,14 +lre22_dev_efymf,ara-aeb,14 +lre22_dev_ehcvr,tir-tir,19 +lre22_dev_ehehw,xho-xho,20 +lre22_dev_ehewh,eng-ens,18 +lre22_dev_ehvyp,zul-zul,14 +lre22_dev_eifqv,zul-zul,19 +lre22_dev_eifxu,ara-ayl,10 +lre22_dev_ejcvy,fra-ntf,18 +lre22_dev_ejeek,eng-ens,16 +lre22_dev_ejfyn,fra-ntf,22 +lre22_dev_ejjqg,tso-tso,12 +lre22_dev_ejtox,ven-ven,19 +lre22_dev_ejwch,fra-ntf,21 +lre22_dev_ejzhx,xho-xho,17 +lre22_dev_ekbkm,afr-afr,21 +lre22_dev_ekzhk,ara-ayl,10 +lre22_dev_elanj,tso-tso,18 +lre22_dev_elvvn,tir-tir,16 +lre22_dev_emadg,xho-xho,22 +lre22_dev_emkzr,afr-afr,21 +lre22_dev_emmck,ara-arq,15 +lre22_dev_enwfu,afr-afr,15 +lre22_dev_eodro,ara-arq,15 +lre22_dev_eoisu,ven-ven,18 +lre22_dev_eomzr,xho-xho,23 +lre22_dev_eorva,xho-xho,21 +lre22_dev_epbwh,nbl-nbl,17 +lre22_dev_epeou,xho-xho,20 +lre22_dev_epifq,nbl-nbl,22 +lre22_dev_epqqo,ara-ayl,14 +lre22_dev_epsld,tso-tso,12 +lre22_dev_epsza,ara-ayl,12 +lre22_dev_eqmgm,ara-aeb,12 +lre22_dev_eqrhr,afr-afr,22 +lre22_dev_eqvan,ara-ayl,13 +lre22_dev_ersgd,orm-orm,22 +lre22_dev_erxig,zul-zul,15 +lre22_dev_esbrw,fra-ntf,19 +lre22_dev_esuug,nbl-nbl,20 +lre22_dev_etczk,tir-tir,14 +lre22_dev_etelz,fra-ntf,21 +lre22_dev_ettsh,fra-ntf,20 +lre22_dev_etuwp,ven-ven,19 +lre22_dev_eubgy,fra-ntf,18 +lre22_dev_euewj,orm-orm,18 +lre22_dev_euzyb,ara-aeb,13 +lre22_dev_ewatn,zul-zul,18 +lre22_dev_ewehs,orm-orm,17 +lre22_dev_ewexz,fra-ntf,18 +lre22_dev_ewgop,tir-tir,20 +lre22_dev_ewmgd,fra-ntf,21 +lre22_dev_ewzma,orm-orm,18 +lre22_dev_expvn,xho-xho,17 +lre22_dev_eyoqu,tir-tir,16 +lre22_dev_eyylz,nbl-nbl,16 +lre22_dev_eyzqu,tir-tir,18 +lre22_dev_ezdty,afr-afr,18 +lre22_dev_ezgcl,ara-aeb,13 +lre22_dev_eznzd,zul-zul,19 +lre22_dev_ezzwj,eng-iaf,18 +lre22_dev_facyr,zul-zul,18 +lre22_dev_faejb,tso-tso,16 +lre22_dev_famjw,orm-orm,18 
+lre22_dev_favzh,ara-arq,11 +lre22_dev_fbsre,orm-orm,23 +lre22_dev_fbtkl,fra-ntf,22 +lre22_dev_fbvxh,ara-ayl,14 +lre22_dev_fbyhp,nbl-nbl,20 +lre22_dev_fbysf,nbl-nbl,17 +lre22_dev_fcckx,ara-arq,12 +lre22_dev_fczba,eng-iaf,17 +lre22_dev_fdouw,eng-ens,14 +lre22_dev_fdtmf,tso-tso,13 +lre22_dev_fdtnc,fra-ntf,20 +lre22_dev_fdwme,afr-afr,19 +lre22_dev_fdyhr,eng-ens,18 +lre22_dev_feanh,fra-ntf,22 +lre22_dev_femmc,ara-arq,12 +lre22_dev_fevab,orm-orm,19 +lre22_dev_fexsi,orm-orm,17 +lre22_dev_fflai,ara-aeb,14 +lre22_dev_fgblw,tso-tso,14 +lre22_dev_fglhf,nbl-nbl,22 +lre22_dev_fhucm,ara-ayl,14 +lre22_dev_fhzwp,nbl-nbl,17 +lre22_dev_fifon,eng-iaf,14 +lre22_dev_fipff,orm-orm,19 +lre22_dev_fipyx,zul-zul,14 +lre22_dev_firtn,zul-zul,18 +lre22_dev_fjdqb,nbl-nbl,16 +lre22_dev_fjdxl,tir-tir,14 +lre22_dev_fjocp,ara-ayl,12 +lre22_dev_fjudb,ara-aeb,15 +lre22_dev_fkbjz,afr-afr,22 +lre22_dev_fkwaq,afr-afr,19 +lre22_dev_flbgp,afr-afr,16 +lre22_dev_flgxs,tir-tir,13 +lre22_dev_fljfm,tir-tir,19 +lre22_dev_fmauu,tso-tso,18 +lre22_dev_fmbvf,fra-ntf,19 +lre22_dev_fmhfa,ara-arq,12 +lre22_dev_fmije,ara-ayl,13 +lre22_dev_fnafq,tir-tir,20 +lre22_dev_fofmo,eng-ens,15 +lre22_dev_foikm,tir-tir,16 +lre22_dev_fosfi,eng-iaf,19 +lre22_dev_fotti,eng-ens,13 +lre22_dev_fozzx,zul-zul,15 +lre22_dev_fpehr,ara-aeb,12 +lre22_dev_fpiig,orm-orm,21 +lre22_dev_fqfag,ara-ayl,16 +lre22_dev_fqogo,tir-tir,13 +lre22_dev_frdqe,ara-arq,11 +lre22_dev_fremq,afr-afr,22 +lre22_dev_frjdx,zul-zul,18 +lre22_dev_fruha,ara-ayl,12 +lre22_dev_frxmu,eng-iaf,18 +lre22_dev_fsbeo,tso-tso,13 +lre22_dev_fsijy,fra-ntf,22 +lre22_dev_fsjwh,nbl-nbl,18 +lre22_dev_fspmb,tso-tso,19 +lre22_dev_ftbak,tir-tir,13 +lre22_dev_ftxuo,eng-iaf,20 +lre22_dev_fupee,ara-aeb,13 +lre22_dev_fupla,ara-aeb,11 +lre22_dev_fvmdq,fra-ntf,22 +lre22_dev_fvmjb,fra-ntf,20 +lre22_dev_fvubo,fra-ntf,22 +lre22_dev_fvwze,afr-afr,23 +lre22_dev_fvxxt,ara-arq,13 +lre22_dev_fwcye,ven-ven,21 +lre22_dev_fwkwv,orm-orm,18 +lre22_dev_fxezd,orm-orm,17 +lre22_dev_fxuir,nbl-nbl,19 +lre22_dev_fzgcm,zul-zul,14 +lre22_dev_fzncb,nbl-nbl,16 +lre22_dev_gaezu,ara-aeb,11 +lre22_dev_gawox,ara-aeb,13 +lre22_dev_gbcfq,zul-zul,14 +lre22_dev_gbdkv,orm-orm,17 +lre22_dev_gbevf,eng-iaf,20 +lre22_dev_gchke,ara-aeb,12 +lre22_dev_gcncr,ara-arq,13 +lre22_dev_gdeqd,ara-ayl,14 +lre22_dev_gdncj,eng-iaf,14 +lre22_dev_gdobt,ven-ven,21 +lre22_dev_geeoy,xho-xho,22 +lre22_dev_geraa,afr-afr,20 +lre22_dev_gfigd,nbl-nbl,16 +lre22_dev_gfjzm,ara-ayl,12 +lre22_dev_gftlv,tir-tir,20 +lre22_dev_ggaux,xho-xho,16 +lre22_dev_ggbgc,zul-zul,15 +lre22_dev_gghhn,zul-zul,18 +lre22_dev_ggrwj,eng-iaf,17 +lre22_dev_ghdur,eng-ens,15 +lre22_dev_ghgbo,ara-ayl,14 +lre22_dev_ghhop,nbl-nbl,20 +lre22_dev_ghnwg,ara-ayl,14 +lre22_dev_ghpmd,ara-ayl,14 +lre22_dev_ghqbh,orm-orm,19 +lre22_dev_gihvo,eng-ens,16 +lre22_dev_giueq,tso-tso,19 +lre22_dev_giuix,ara-aeb,15 +lre22_dev_gjaqj,eng-iaf,20 +lre22_dev_gjgcw,xho-xho,18 +lre22_dev_gjirh,eng-iaf,16 +lre22_dev_gjvwy,nbl-nbl,22 +lre22_dev_gkeql,eng-iaf,16 +lre22_dev_gkhas,tso-tso,16 +lre22_dev_glmyp,nbl-nbl,16 +lre22_dev_glqft,eng-ens,18 +lre22_dev_glsnb,afr-afr,17 +lre22_dev_gmfcb,eng-iaf,16 +lre22_dev_gmlwo,afr-afr,16 +lre22_dev_gmpjq,tso-tso,12 +lre22_dev_gmrvk,ara-aeb,14 +lre22_dev_gmryq,ara-ayl,13 +lre22_dev_gmsds,eng-ens,16 +lre22_dev_gmztl,xho-xho,16 +lre22_dev_gnbyu,eng-iaf,15 +lre22_dev_gntym,zul-zul,17 +lre22_dev_gocpa,tso-tso,15 +lre22_dev_gpyxs,orm-orm,17 +lre22_dev_grgvb,afr-afr,16 +lre22_dev_grspj,orm-orm,19 +lre22_dev_grvjm,xho-xho,19 +lre22_dev_gsidj,eng-ens,18 +lre22_dev_gslzy,afr-afr,22 
+lre22_dev_gtwcl,tir-tir,14 +lre22_dev_gulky,orm-orm,21 +lre22_dev_gvlhy,tir-tir,20 +lre22_dev_gvljx,tso-tso,15 +lre22_dev_gvmma,tso-tso,13 +lre22_dev_gvtvb,afr-afr,23 +lre22_dev_gweym,xho-xho,19 +lre22_dev_gwljh,ara-aeb,11 +lre22_dev_gwxtn,ara-ayl,14 +lre22_dev_gxdpw,fra-ntf,16 +lre22_dev_gxext,afr-afr,15 +lre22_dev_gxkqq,nbl-nbl,19 +lre22_dev_gxkxo,xho-xho,21 +lre22_dev_gxnkr,xho-xho,18 +lre22_dev_gxxbk,fra-ntf,21 +lre22_dev_gydvv,afr-afr,20 +lre22_dev_gytkt,ara-arq,12 +lre22_dev_gzmvp,afr-afr,18 +lre22_dev_gzoou,ven-ven,19 +lre22_dev_gzvza,tir-tir,15 +lre22_dev_gzwee,eng-iaf,17 +lre22_dev_haewp,tir-tir,19 +lre22_dev_haokb,fra-ntf,19 +lre22_dev_hazis,nbl-nbl,20 +lre22_dev_hbbbc,eng-ens,16 +lre22_dev_hblqa,nbl-nbl,17 +lre22_dev_hbmfy,zul-zul,15 +lre22_dev_hbndl,zul-zul,17 +lre22_dev_hcgfc,eng-ens,13 +lre22_dev_hcjnx,orm-orm,17 +lre22_dev_hcont,tir-tir,17 +lre22_dev_hcvik,tso-tso,13 +lre22_dev_hczom,zul-zul,19 +lre22_dev_hdaca,xho-xho,19 +lre22_dev_hdijt,fra-ntf,15 +lre22_dev_hdkyr,afr-afr,18 +lre22_dev_hdnoq,orm-orm,23 +lre22_dev_hdtlb,eng-iaf,16 +lre22_dev_hever,nbl-nbl,18 +lre22_dev_hfirj,nbl-nbl,17 +lre22_dev_hgbxp,xho-xho,21 +lre22_dev_hgcax,xho-xho,19 +lre22_dev_hgkwa,tso-tso,13 +lre22_dev_hgljd,ara-arq,15 +lre22_dev_hgvrh,nbl-nbl,21 +lre22_dev_hhovn,eng-iaf,16 +lre22_dev_hhpzm,fra-ntf,22 +lre22_dev_hhuab,ven-ven,20 +lre22_dev_hicev,ven-ven,18 +lre22_dev_hickz,ara-arq,12 +lre22_dev_hilii,orm-orm,23 +lre22_dev_hjenx,eng-iaf,19 +lre22_dev_hjiui,orm-orm,18 +lre22_dev_hkfts,eng-ens,18 +lre22_dev_hkhvl,zul-zul,19 +lre22_dev_hkobh,xho-xho,17 +lre22_dev_hkvay,ara-arq,13 +lre22_dev_hkvtj,orm-orm,21 +lre22_dev_hlevc,fra-ntf,17 +lre22_dev_hliut,ara-aeb,14 +lre22_dev_hlntc,zul-zul,18 +lre22_dev_hlprm,zul-zul,18 +lre22_dev_hmeav,ven-ven,17 +lre22_dev_hnelt,tir-tir,15 +lre22_dev_hniiy,ara-arq,15 +lre22_dev_hoepv,ara-aeb,13 +lre22_dev_hofkm,orm-orm,19 +lre22_dev_hoilz,tir-tir,19 +lre22_dev_hookr,ara-aeb,13 +lre22_dev_hpbhl,tir-tir,16 +lre22_dev_hpbzf,ara-aeb,11 +lre22_dev_hpizl,eng-ens,15 +lre22_dev_hplhi,ara-ayl,13 +lre22_dev_hplrq,xho-xho,20 +lre22_dev_hqdva,ven-ven,21 +lre22_dev_hqnus,xho-xho,16 +lre22_dev_hqoiz,orm-orm,18 +lre22_dev_hrerz,eng-ens,14 +lre22_dev_hrgjq,tir-tir,19 +lre22_dev_hrrhr,zul-zul,17 +lre22_dev_hsfbi,ara-ayl,14 +lre22_dev_hsjlg,tir-tir,17 +lre22_dev_hskug,afr-afr,16 +lre22_dev_hszzt,tso-tso,19 +lre22_dev_htgrl,tso-tso,18 +lre22_dev_htxah,zul-zul,17 +lre22_dev_htxrs,xho-xho,23 +lre22_dev_hudwz,nbl-nbl,17 +lre22_dev_huuqj,fra-ntf,18 +lre22_dev_hvsds,afr-afr,21 +lre22_dev_hwbhz,orm-orm,23 +lre22_dev_hwbvs,tso-tso,13 +lre22_dev_hwdlb,tso-tso,19 +lre22_dev_hwyki,eng-iaf,16 +lre22_dev_hxcmj,eng-iaf,20 +lre22_dev_hxdly,ara-arq,11 +lre22_dev_hyeqm,xho-xho,19 +lre22_dev_hyofm,ara-arq,12 +lre22_dev_hyogg,ara-arq,13 +lre22_dev_hyouu,tso-tso,13 +lre22_dev_hzfpc,fra-ntf,16 +lre22_dev_hzkjt,ara-aeb,12 +lre22_dev_hzrgv,fra-ntf,20 +lre22_dev_hzuus,tir-tir,19 +lre22_dev_hzzbp,xho-xho,19 +lre22_dev_iautt,afr-afr,20 +lre22_dev_ibdnu,tir-tir,13 +lre22_dev_ibuww,ara-aeb,13 +lre22_dev_icbuo,ven-ven,21 +lre22_dev_icqmr,tso-tso,14 +lre22_dev_ictwj,tir-tir,14 +lre22_dev_ifumz,ven-ven,14 +lre22_dev_igcgi,tso-tso,19 +lre22_dev_igder,tir-tir,19 +lre22_dev_igexm,xho-xho,21 +lre22_dev_igfxi,fra-ntf,20 +lre22_dev_igoxr,afr-afr,15 +lre22_dev_igxyt,ven-ven,21 +lre22_dev_ihqtn,ara-aeb,11 +lre22_dev_ihxfl,tir-tir,13 +lre22_dev_ihyrb,nbl-nbl,18 +lre22_dev_iifuu,tir-tir,15 +lre22_dev_iiien,xho-xho,20 +lre22_dev_ijccu,eng-iaf,16 +lre22_dev_ijrun,afr-afr,18 +lre22_dev_ijwlx,ara-arq,14 
+lre22_dev_ijydw,xho-xho,21 +lre22_dev_ikdjt,xho-xho,23 +lre22_dev_iklbv,ara-arq,13 +lre22_dev_ikyai,fra-ntf,18 +lre22_dev_ildmr,orm-orm,21 +lre22_dev_ilebo,orm-orm,19 +lre22_dev_ilptc,eng-ens,18 +lre22_dev_ilsku,fra-ntf,16 +lre22_dev_ilyti,ara-arq,11 +lre22_dev_imnqh,zul-zul,17 +lre22_dev_imxdr,eng-ens,16 +lre22_dev_indww,fra-ntf,19 +lre22_dev_iokar,eng-iaf,15 +lre22_dev_iomtu,eng-iaf,15 +lre22_dev_ioobz,tir-tir,14 +lre22_dev_iosom,zul-zul,17 +lre22_dev_iowyd,ara-arq,14 +lre22_dev_iphzy,nbl-nbl,18 +lre22_dev_ipmrc,nbl-nbl,16 +lre22_dev_ipomi,ara-aeb,12 +lre22_dev_ipour,afr-afr,15 +lre22_dev_ippjq,ara-ayl,16 +lre22_dev_ipvjc,ara-aeb,13 +lre22_dev_iqfdc,ven-ven,19 +lre22_dev_iqppw,tso-tso,15 +lre22_dev_iqtde,tso-tso,14 +lre22_dev_irlee,eng-iaf,14 +lre22_dev_irxuq,ara-aeb,14 +lre22_dev_isjzo,ara-arq,14 +lre22_dev_isnwz,ara-ayl,14 +lre22_dev_isqvk,afr-afr,15 +lre22_dev_isqww,orm-orm,19 +lre22_dev_istdz,tir-tir,18 +lre22_dev_iszhe,fra-ntf,20 +lre22_dev_itblz,ven-ven,18 +lre22_dev_itfez,ara-arq,13 +lre22_dev_itjqm,zul-zul,18 +lre22_dev_itnap,nbl-nbl,21 +lre22_dev_itrms,xho-xho,21 +lre22_dev_itroi,fra-ntf,17 +lre22_dev_ittds,zul-zul,16 +lre22_dev_iuknz,tso-tso,16 +lre22_dev_iumnm,ara-ayl,15 +lre22_dev_iunul,afr-afr,23 +lre22_dev_iverq,ven-ven,16 +lre22_dev_ivwzd,ara-ayl,14 +lre22_dev_ivzjf,tso-tso,12 +lre22_dev_iwbta,nbl-nbl,16 +lre22_dev_iwdeh,orm-orm,21 +lre22_dev_iwgel,ara-aeb,11 +lre22_dev_ixbhj,ara-aeb,11 +lre22_dev_ixbnl,fra-ntf,16 +lre22_dev_ixcef,ven-ven,20 +lre22_dev_ixfdf,orm-orm,18 +lre22_dev_ixjey,orm-orm,19 +lre22_dev_ixlve,tir-tir,17 +lre22_dev_ixutu,ara-ayl,12 +lre22_dev_ixxoj,xho-xho,23 +lre22_dev_ixyko,afr-afr,22 +lre22_dev_iylls,eng-iaf,19 +lre22_dev_izegw,orm-orm,23 +lre22_dev_izglb,ara-ayl,13 +lre22_dev_iziar,ara-arq,13 +lre22_dev_jadvz,afr-afr,18 +lre22_dev_jajtw,ara-aeb,14 +lre22_dev_janvu,tso-tso,16 +lre22_dev_japrb,xho-xho,21 +lre22_dev_jarvz,ara-aeb,12 +lre22_dev_jazcn,tso-tso,13 +lre22_dev_jbfxj,tso-tso,12 +lre22_dev_jbnfg,fra-ntf,15 +lre22_dev_jbwgd,afr-afr,20 +lre22_dev_jceug,tso-tso,15 +lre22_dev_jcqtd,eng-ens,14 +lre22_dev_jcxry,ven-ven,20 +lre22_dev_jdbli,tir-tir,20 +lre22_dev_jegmb,orm-orm,18 +lre22_dev_jegqj,ara-ayl,12 +lre22_dev_jenns,xho-xho,22 +lre22_dev_jfarf,ven-ven,14 +lre22_dev_jfcve,zul-zul,17 +lre22_dev_jfgyq,xho-xho,23 +lre22_dev_jftnz,afr-afr,14 +lre22_dev_jftsj,afr-afr,22 +lre22_dev_jgnid,nbl-nbl,16 +lre22_dev_jgsju,eng-ens,13 +lre22_dev_jifal,orm-orm,19 +lre22_dev_jihsd,orm-orm,21 +lre22_dev_jihwf,ara-ayl,11 +lre22_dev_jiptp,eng-iaf,15 +lre22_dev_jizij,tir-tir,14 +lre22_dev_jjpzg,orm-orm,23 +lre22_dev_jkezw,fra-ntf,18 +lre22_dev_jkmux,fra-ntf,20 +lre22_dev_jkpnt,orm-orm,22 +lre22_dev_jlkfj,eng-ens,18 +lre22_dev_jlmtf,ven-ven,19 +lre22_dev_jlrfm,ara-arq,12 +lre22_dev_jmojg,orm-orm,19 +lre22_dev_jmrcv,ara-aeb,13 +lre22_dev_jmsxc,eng-iaf,16 +lre22_dev_jnjpw,tir-tir,14 +lre22_dev_jnzvu,ara-aeb,14 +lre22_dev_jocyh,xho-xho,19 +lre22_dev_joezr,tso-tso,16 +lre22_dev_jofqy,ara-arq,11 +lre22_dev_jpbyf,eng-ens,15 +lre22_dev_jppuy,ara-arq,13 +lre22_dev_jptts,ara-aeb,12 +lre22_dev_jqdyx,fra-ntf,22 +lre22_dev_jqjbq,zul-zul,17 +lre22_dev_jqpnb,ven-ven,21 +lre22_dev_jqqin,zul-zul,17 +lre22_dev_jqzkq,ara-ayl,13 +lre22_dev_jrroq,orm-orm,21 +lre22_dev_jruru,eng-ens,16 +lre22_dev_jskbr,ara-arq,11 +lre22_dev_jskdd,nbl-nbl,19 +lre22_dev_jslnc,eng-ens,12 +lre22_dev_jsmat,orm-orm,17 +lre22_dev_jsmdw,ara-aeb,11 +lre22_dev_jsvaz,afr-afr,19 +lre22_dev_jsxcy,afr-afr,21 +lre22_dev_jszgk,eng-iaf,19 +lre22_dev_jthui,ven-ven,20 +lre22_dev_jtpvz,ven-ven,17 
+lre22_dev_jtwdi,ven-ven,14 +lre22_dev_jtwfh,ven-ven,18 +lre22_dev_juwid,tir-tir,20 +lre22_dev_jvdww,fra-ntf,21 +lre22_dev_jweyx,tir-tir,19 +lre22_dev_jwuto,afr-afr,19 +lre22_dev_jwwgs,afr-afr,19 +lre22_dev_jxhxf,nbl-nbl,17 +lre22_dev_jxtxk,orm-orm,20 +lre22_dev_jxzvy,eng-ens,15 +lre22_dev_jyjlm,nbl-nbl,19 +lre22_dev_jynvf,ara-ayl,13 +lre22_dev_jyzmh,nbl-nbl,19 +lre22_dev_jzivf,eng-ens,14 +lre22_dev_jzpns,tso-tso,14 +lre22_dev_kadwu,fra-ntf,18 +lre22_dev_kbnbi,tir-tir,13 +lre22_dev_kbqbd,fra-ntf,16 +lre22_dev_kbscm,tso-tso,15 +lre22_dev_kbxko,ara-aeb,12 +lre22_dev_kcegv,tso-tso,15 +lre22_dev_kcibo,afr-afr,17 +lre22_dev_kcmky,ara-ayl,14 +lre22_dev_kctrd,nbl-nbl,22 +lre22_dev_kcvbf,fra-ntf,16 +lre22_dev_kdbqy,zul-zul,15 +lre22_dev_kdgpz,ara-arq,14 +lre22_dev_kdhgq,nbl-nbl,22 +lre22_dev_kdvtu,eng-iaf,16 +lre22_dev_kdyhm,tso-tso,12 +lre22_dev_keeyz,zul-zul,18 +lre22_dev_kejvy,ven-ven,18 +lre22_dev_kerpr,ven-ven,21 +lre22_dev_keweh,ara-aeb,13 +lre22_dev_keysx,orm-orm,23 +lre22_dev_kezyv,ara-ayl,13 +lre22_dev_kgbiq,ven-ven,18 +lre22_dev_kgovz,tso-tso,15 +lre22_dev_kgxka,eng-ens,16 +lre22_dev_khkcx,fra-ntf,20 +lre22_dev_khobl,orm-orm,19 +lre22_dev_khttn,afr-afr,17 +lre22_dev_khvss,tir-tir,15 +lre22_dev_kiezl,tso-tso,16 +lre22_dev_kihlw,eng-ens,14 +lre22_dev_kipuq,ara-arq,14 +lre22_dev_kiqcx,tir-tir,16 +lre22_dev_kjiks,xho-xho,19 +lre22_dev_kjmpa,zul-zul,18 +lre22_dev_kjocf,eng-iaf,16 +lre22_dev_kkbur,ven-ven,16 +lre22_dev_kksdi,xho-xho,22 +lre22_dev_kkytv,ara-aeb,11 +lre22_dev_kmkgx,nbl-nbl,17 +lre22_dev_kmpkm,zul-zul,19 +lre22_dev_kmyzy,ara-ayl,13 +lre22_dev_knfsj,afr-afr,15 +lre22_dev_knyuq,orm-orm,19 +lre22_dev_koacp,orm-orm,19 +lre22_dev_koket,eng-ens,18 +lre22_dev_kovdn,zul-zul,15 +lre22_dev_kowqf,ven-ven,19 +lre22_dev_kozfr,nbl-nbl,21 +lre22_dev_kpmyz,orm-orm,19 +lre22_dev_kqfdc,eng-ens,17 +lre22_dev_kqumw,fra-ntf,22 +lre22_dev_kqwdi,nbl-nbl,16 +lre22_dev_krczb,ven-ven,19 +lre22_dev_kremz,nbl-nbl,16 +lre22_dev_ksruw,ven-ven,18 +lre22_dev_kszdw,eng-iaf,20 +lre22_dev_ktgvi,ara-arq,11 +lre22_dev_ktjax,fra-ntf,20 +lre22_dev_ktlvc,orm-orm,19 +lre22_dev_kvqgp,afr-afr,21 +lre22_dev_kvyoz,afr-afr,20 +lre22_dev_kvzim,afr-afr,14 +lre22_dev_kvzwc,eng-iaf,14 +lre22_dev_kwcwa,ara-arq,14 +lre22_dev_kwomo,zul-zul,19 +lre22_dev_kwxau,xho-xho,18 +lre22_dev_kxawf,tir-tir,19 +lre22_dev_kxjhn,ara-aeb,11 +lre22_dev_kxklh,tir-tir,19 +lre22_dev_kxlgg,tir-tir,16 +lre22_dev_kyqbp,fra-ntf,21 +lre22_dev_kyzio,ven-ven,20 +lre22_dev_kzcgh,ara-ayl,13 +lre22_dev_kzeyf,ven-ven,18 +lre22_dev_kzfwf,fra-ntf,19 +lre22_dev_kzjuz,orm-orm,21 +lre22_dev_kzjwx,ara-ayl,11 +lre22_dev_lamjl,tso-tso,17 +lre22_dev_laowh,xho-xho,16 +lre22_dev_larex,ara-ayl,11 +lre22_dev_laycs,tso-tso,12 +lre22_dev_lbxfn,eng-iaf,20 +lre22_dev_lcrog,zul-zul,18 +lre22_dev_ldczz,xho-xho,17 +lre22_dev_ldkgv,ara-aeb,13 +lre22_dev_ldkst,fra-ntf,20 +lre22_dev_ldkwr,orm-orm,22 +lre22_dev_lenxf,ven-ven,14 +lre22_dev_lfbey,ara-ayl,12 +lre22_dev_lfmml,fra-ntf,18 +lre22_dev_lfmxu,ven-ven,18 +lre22_dev_lfqfj,afr-afr,17 +lre22_dev_lgetu,ara-aeb,14 +lre22_dev_lgleu,ara-ayl,11 +lre22_dev_lgoat,eng-iaf,16 +lre22_dev_lhgaj,tso-tso,15 +lre22_dev_lhqyw,nbl-nbl,17 +lre22_dev_lhrmr,eng-iaf,17 +lre22_dev_lhtsd,tir-tir,19 +lre22_dev_lhydp,fra-ntf,22 +lre22_dev_livbf,tir-tir,15 +lre22_dev_ljdrg,ara-arq,13 +lre22_dev_ljniw,tso-tso,16 +lre22_dev_ljpmq,tso-tso,12 +lre22_dev_lkjon,tso-tso,15 +lre22_dev_lkszp,nbl-nbl,19 +lre22_dev_llbim,ara-ayl,15 +lre22_dev_llkkt,fra-ntf,15 +lre22_dev_llvcc,orm-orm,22 +lre22_dev_lmbug,ara-arq,12 +lre22_dev_lmmmw,nbl-nbl,19 
+lre22_dev_lmsek,ven-ven,16 +lre22_dev_lmudp,ara-ayl,10 +lre22_dev_lmzmv,eng-iaf,19 +lre22_dev_lnlae,ara-arq,14 +lre22_dev_lnlvt,zul-zul,17 +lre22_dev_lnppu,ara-ayl,13 +lre22_dev_lnpyc,tso-tso,19 +lre22_dev_lolkv,xho-xho,19 +lre22_dev_lorcx,nbl-nbl,20 +lre22_dev_lparq,xho-xho,16 +lre22_dev_lqlft,ara-arq,11 +lre22_dev_lqlyq,ara-arq,12 +lre22_dev_lqoeu,tso-tso,14 +lre22_dev_lqueh,ara-ayl,11 +lre22_dev_lquzk,ara-arq,12 +lre22_dev_lqvav,zul-zul,18 +lre22_dev_lrgpy,eng-iaf,16 +lre22_dev_lrjbn,ven-ven,21 +lre22_dev_lrtad,ara-arq,14 +lre22_dev_lrtxd,ara-aeb,11 +lre22_dev_lrvkn,ven-ven,16 +lre22_dev_lrzwy,ara-ayl,13 +lre22_dev_lsefk,ara-arq,13 +lre22_dev_ltmmt,orm-orm,22 +lre22_dev_lutgh,ara-aeb,15 +lre22_dev_lvhmd,tso-tso,14 +lre22_dev_lvqim,ara-aeb,14 +lre22_dev_lvuuo,fra-ntf,17 +lre22_dev_lvzri,ven-ven,16 +lre22_dev_lweml,ara-arq,14 +lre22_dev_lwstj,eng-iaf,16 +lre22_dev_lwzdj,afr-afr,18 +lre22_dev_lxdsk,eng-ens,16 +lre22_dev_lxlcr,ara-aeb,13 +lre22_dev_lxshv,eng-iaf,20 +lre22_dev_lxxvv,eng-ens,16 +lre22_dev_lyfhc,ven-ven,18 +lre22_dev_lyikp,zul-zul,19 +lre22_dev_lyjix,tso-tso,14 +lre22_dev_lyxyh,eng-iaf,19 +lre22_dev_lyzxd,tir-tir,17 +lre22_dev_lzguf,orm-orm,21 +lre22_dev_lzpmk,tir-tir,16 +lre22_dev_lzugv,xho-xho,19 +lre22_dev_maeeb,tir-tir,15 +lre22_dev_maemn,zul-zul,16 +lre22_dev_manpw,orm-orm,19 +lre22_dev_mavli,ara-aeb,12 +lre22_dev_mbywd,orm-orm,19 +lre22_dev_mcath,nbl-nbl,22 +lre22_dev_mcjtw,xho-xho,16 +lre22_dev_mcndd,ven-ven,15 +lre22_dev_mcxqb,tir-tir,13 +lre22_dev_mdlia,fra-ntf,16 +lre22_dev_mdxsp,eng-ens,18 +lre22_dev_menex,eng-iaf,16 +lre22_dev_merfk,orm-orm,21 +lre22_dev_mfipk,zul-zul,16 +lre22_dev_mfuqh,ara-arq,14 +lre22_dev_mgcvo,xho-xho,19 +lre22_dev_mggbx,zul-zul,18 +lre22_dev_mgghl,tso-tso,12 +lre22_dev_mgwqd,ara-arq,14 +lre22_dev_mhswt,ara-ayl,15 +lre22_dev_mhwmt,tso-tso,16 +lre22_dev_miayn,ara-aeb,12 +lre22_dev_miley,tso-tso,16 +lre22_dev_mjfmb,nbl-nbl,21 +lre22_dev_mkbyx,tir-tir,19 +lre22_dev_mlbzi,xho-xho,23 +lre22_dev_mlduq,xho-xho,16 +lre22_dev_mljnp,ara-arq,14 +lre22_dev_mljpb,orm-orm,22 +lre22_dev_mlrsm,xho-xho,17 +lre22_dev_mlwzr,eng-ens,13 +lre22_dev_mlyeo,ven-ven,15 +lre22_dev_mmaed,ara-ayl,14 +lre22_dev_mmbns,eng-ens,12 +lre22_dev_mneyt,xho-xho,17 +lre22_dev_mnhsk,ven-ven,14 +lre22_dev_mnnvk,eng-ens,15 +lre22_dev_mnswo,tso-tso,16 +lre22_dev_mntdk,eng-ens,18 +lre22_dev_mogwl,orm-orm,22 +lre22_dev_mpbun,nbl-nbl,21 +lre22_dev_mpmuf,ara-aeb,14 +lre22_dev_mpoet,nbl-nbl,16 +lre22_dev_mptyi,afr-afr,18 +lre22_dev_mpzxy,orm-orm,18 +lre22_dev_mqxni,ara-arq,11 +lre22_dev_mqzga,tso-tso,19 +lre22_dev_mrgdh,xho-xho,17 +lre22_dev_mrgko,afr-afr,18 +lre22_dev_mrksc,tir-tir,19 +lre22_dev_mrogp,eng-iaf,15 +lre22_dev_mscwd,fra-ntf,16 +lre22_dev_mshco,ara-ayl,12 +lre22_dev_msptn,ara-ayl,16 +lre22_dev_msslk,ara-aeb,14 +lre22_dev_mtaus,fra-ntf,19 +lre22_dev_mtpgl,tso-tso,13 +lre22_dev_mttly,tir-tir,19 +lre22_dev_mubqn,fra-ntf,15 +lre22_dev_muskv,tso-tso,12 +lre22_dev_muzkp,ara-arq,14 +lre22_dev_mvdus,ven-ven,19 +lre22_dev_mvngl,xho-xho,19 +lre22_dev_mvrpq,tso-tso,12 +lre22_dev_mvtcj,afr-afr,22 +lre22_dev_mwhsu,xho-xho,21 +lre22_dev_mwkyp,nbl-nbl,20 +lre22_dev_mxcey,ara-ayl,12 +lre22_dev_mxcub,ara-aeb,12 +lre22_dev_myekh,ara-aeb,11 +lre22_dev_mzxhf,zul-zul,17 +lre22_dev_mzyru,ara-arq,12 +lre22_dev_nakax,eng-iaf,15 +lre22_dev_naymc,ara-ayl,13 +lre22_dev_nbgid,orm-orm,19 +lre22_dev_nbmnl,xho-xho,16 +lre22_dev_ncffi,zul-zul,14 +lre22_dev_ncjtj,fra-ntf,22 +lre22_dev_ncpix,ara-ayl,11 +lre22_dev_nctqc,xho-xho,16 +lre22_dev_ndkuo,orm-orm,20 +lre22_dev_ndqfw,nbl-nbl,17 
+lre22_dev_nedes,ven-ven,15 +lre22_dev_neomw,zul-zul,18 +lre22_dev_neziz,tir-tir,19 +lre22_dev_nfcvg,eng-iaf,17 +lre22_dev_nfdfc,afr-afr,17 +lre22_dev_ngijv,xho-xho,21 +lre22_dev_ngrxk,ara-ayl,13 +lre22_dev_ngzja,ara-aeb,13 +lre22_dev_nhaub,tso-tso,13 +lre22_dev_nhkro,xho-xho,23 +lre22_dev_nhlvt,ara-arq,14 +lre22_dev_nhlxm,eng-ens,14 +lre22_dev_nhyjy,afr-afr,17 +lre22_dev_nifei,zul-zul,19 +lre22_dev_nikpx,ven-ven,18 +lre22_dev_njceq,afr-afr,18 +lre22_dev_njmlt,eng-ens,17 +lre22_dev_njqfj,orm-orm,18 +lre22_dev_nkdje,eng-iaf,19 +lre22_dev_nkkqo,nbl-nbl,22 +lre22_dev_nknrw,orm-orm,21 +lre22_dev_nkogd,fra-ntf,19 +lre22_dev_nksfc,tir-tir,19 +lre22_dev_nkwmm,orm-orm,22 +lre22_dev_nmhdg,ara-ayl,10 +lre22_dev_nmoux,ven-ven,20 +lre22_dev_nmrsq,ven-ven,21 +lre22_dev_nnbhc,fra-ntf,20 +lre22_dev_nnbpy,tir-tir,18 +lre22_dev_nnpwd,ara-aeb,13 +lre22_dev_nodin,ara-ayl,14 +lre22_dev_nogji,nbl-nbl,20 +lre22_dev_nonvr,afr-afr,15 +lre22_dev_notcl,eng-iaf,19 +lre22_dev_noufn,ara-aeb,11 +lre22_dev_noveb,ara-ayl,11 +lre22_dev_npajm,nbl-nbl,19 +lre22_dev_npehj,ara-ayl,14 +lre22_dev_nqdaj,tso-tso,12 +lre22_dev_nqkon,xho-xho,18 +lre22_dev_nqlhw,ara-aeb,13 +lre22_dev_nraqr,eng-ens,14 +lre22_dev_nrino,tso-tso,14 +lre22_dev_nrzgt,xho-xho,16 +lre22_dev_nscrg,orm-orm,18 +lre22_dev_nstgp,orm-orm,23 +lre22_dev_ntgqz,afr-afr,23 +lre22_dev_nthzr,eng-iaf,18 +lre22_dev_ntwzb,afr-afr,16 +lre22_dev_nudwv,eng-ens,14 +lre22_dev_nuerz,eng-iaf,18 +lre22_dev_nujfy,xho-xho,21 +lre22_dev_nurlx,eng-ens,13 +lre22_dev_nvakd,zul-zul,17 +lre22_dev_nvgkj,eng-ens,17 +lre22_dev_nvhvv,fra-ntf,20 +lre22_dev_nwbnz,ara-arq,14 +lre22_dev_nwjed,nbl-nbl,19 +lre22_dev_nwrto,ara-aeb,11 +lre22_dev_nwunl,zul-zul,14 +lre22_dev_nwvyy,tir-tir,19 +lre22_dev_nxwlo,nbl-nbl,17 +lre22_dev_nxxzy,zul-zul,16 +lre22_dev_nxzpp,nbl-nbl,20 +lre22_dev_nyhwg,ara-arq,14 +lre22_dev_nykvr,eng-ens,17 +lre22_dev_nyvkc,tir-tir,15 +lre22_dev_nyyui,ara-arq,11 +lre22_dev_nzbfh,zul-zul,19 +lre22_dev_nzxsk,xho-xho,21 +lre22_dev_oasrh,ara-arq,11 +lre22_dev_oavaf,xho-xho,21 +lre22_dev_obfrf,orm-orm,20 +lre22_dev_obocn,ara-arq,14 +lre22_dev_obumo,eng-ens,15 +lre22_dev_ocbuj,eng-ens,12 +lre22_dev_ocbxu,nbl-nbl,21 +lre22_dev_ocdvw,ara-ayl,13 +lre22_dev_ocdzj,xho-xho,19 +lre22_dev_ocveq,fra-ntf,22 +lre22_dev_odest,ara-ayl,11 +lre22_dev_odjlq,ven-ven,18 +lre22_dev_odpoq,ara-ayl,12 +lre22_dev_odrcm,fra-ntf,21 +lre22_dev_oeavx,ara-arq,12 +lre22_dev_oefoy,ara-aeb,12 +lre22_dev_oefqy,ven-ven,16 +lre22_dev_oehxk,ara-ayl,12 +lre22_dev_oeqbo,ara-aeb,14 +lre22_dev_oeqjq,fra-ntf,20 +lre22_dev_ofdgy,ara-ayl,15 +lre22_dev_ofgkq,fra-ntf,21 +lre22_dev_ofpva,ara-arq,11 +lre22_dev_ofufy,eng-iaf,17 +lre22_dev_ogglz,ara-aeb,13 +lre22_dev_oggtr,nbl-nbl,19 +lre22_dev_ogpxk,ara-aeb,11 +lre22_dev_ogsay,tso-tso,19 +lre22_dev_ogtvj,zul-zul,19 +lre22_dev_ohqwz,ara-arq,13 +lre22_dev_ohuxo,afr-afr,20 +lre22_dev_ohweb,ven-ven,16 +lre22_dev_ohzpg,fra-ntf,21 +lre22_dev_oijcy,xho-xho,19 +lre22_dev_oijgv,tir-tir,16 +lre22_dev_oikqj,eng-iaf,17 +lre22_dev_oinvl,ven-ven,15 +lre22_dev_oiofr,fra-ntf,19 +lre22_dev_oipks,eng-ens,17 +lre22_dev_ojzos,ara-arq,14 +lre22_dev_okbnu,ara-ayl,10 +lre22_dev_okpcp,eng-iaf,18 +lre22_dev_okwpq,tso-tso,16 +lre22_dev_oleie,ara-arq,12 +lre22_dev_oljep,ven-ven,21 +lre22_dev_oljsa,fra-ntf,16 +lre22_dev_olkup,nbl-nbl,16 +lre22_dev_olqbh,ara-ayl,14 +lre22_dev_omjqo,ara-aeb,14 +lre22_dev_omwiy,ara-ayl,12 +lre22_dev_omxnk,ara-arq,13 +lre22_dev_onqke,eng-iaf,16 +lre22_dev_onzje,tir-tir,13 +lre22_dev_ooktw,afr-afr,18 +lre22_dev_oosff,ara-aeb,12 +lre22_dev_ootbi,xho-xho,21 
+lre22_dev_opciz,orm-orm,23 +lre22_dev_opgny,xho-xho,19 +lre22_dev_opifd,ara-arq,12 +lre22_dev_oporo,eng-iaf,19 +lre22_dev_opryj,nbl-nbl,16 +lre22_dev_opuzh,eng-ens,12 +lre22_dev_oqbaw,ven-ven,18 +lre22_dev_oqeuj,tir-tir,14 +lre22_dev_oqmhb,xho-xho,21 +lre22_dev_oqmrs,ara-arq,14 +lre22_dev_oqqwq,tso-tso,12 +lre22_dev_oquaq,xho-xho,17 +lre22_dev_oriap,fra-ntf,20 +lre22_dev_orsjj,tir-tir,20 +lre22_dev_orvna,fra-ntf,21 +lre22_dev_oskoe,orm-orm,20 +lre22_dev_otlyk,nbl-nbl,18 +lre22_dev_oujnj,nbl-nbl,17 +lre22_dev_oumka,ven-ven,14 +lre22_dev_ouqsx,ara-arq,13 +lre22_dev_outyl,zul-zul,16 +lre22_dev_owlwt,ara-ayl,14 +lre22_dev_owvfd,orm-orm,18 +lre22_dev_oxizc,tir-tir,15 +lre22_dev_oxpht,eng-ens,18 +lre22_dev_oxqlz,afr-afr,15 +lre22_dev_oydiw,nbl-nbl,16 +lre22_dev_oyfcl,fra-ntf,22 +lre22_dev_oyhba,eng-ens,18 +lre22_dev_oyiif,afr-afr,17 +lre22_dev_oyslg,afr-afr,21 +lre22_dev_ozfpi,tir-tir,15 +lre22_dev_ozlww,ven-ven,19 +lre22_dev_paxnc,eng-ens,17 +lre22_dev_pbbgx,eng-iaf,14 +lre22_dev_pcfmw,nbl-nbl,21 +lre22_dev_pclpc,fra-ntf,15 +lre22_dev_pcmmj,afr-afr,16 +lre22_dev_pcsqz,tso-tso,18 +lre22_dev_pdcfm,ara-ayl,10 +lre22_dev_pdtuf,eng-ens,18 +lre22_dev_pdzuj,zul-zul,17 +lre22_dev_pehfu,fra-ntf,15 +lre22_dev_pewpj,orm-orm,22 +lre22_dev_pexjz,orm-orm,17 +lre22_dev_pfioj,eng-iaf,15 +lre22_dev_pfkcf,eng-iaf,16 +lre22_dev_pfknl,ara-arq,14 +lre22_dev_pfucv,ara-ayl,12 +lre22_dev_pfyha,fra-ntf,21 +lre22_dev_pgavf,ara-ayl,13 +lre22_dev_phket,nbl-nbl,22 +lre22_dev_piabk,afr-afr,19 +lre22_dev_picvg,orm-orm,17 +lre22_dev_piina,eng-ens,14 +lre22_dev_pjahm,afr-afr,20 +lre22_dev_pjcso,nbl-nbl,17 +lre22_dev_pjggp,ven-ven,16 +lre22_dev_pjohw,xho-xho,19 +lre22_dev_pkpxo,ara-ayl,11 +lre22_dev_pktgk,nbl-nbl,22 +lre22_dev_plojq,eng-ens,12 +lre22_dev_pmayg,ven-ven,21 +lre22_dev_pmjyi,xho-xho,20 +lre22_dev_pmkcp,nbl-nbl,20 +lre22_dev_pnfhk,fra-ntf,18 +lre22_dev_pnust,nbl-nbl,20 +lre22_dev_pnwey,eng-iaf,15 +lre22_dev_pnwti,ara-aeb,13 +lre22_dev_pohmm,afr-afr,14 +lre22_dev_pojvr,nbl-nbl,22 +lre22_dev_poxsw,ara-aeb,13 +lre22_dev_ppjvq,tir-tir,16 +lre22_dev_ppkfc,fra-ntf,19 +lre22_dev_ppmnu,tso-tso,12 +lre22_dev_ppzno,tso-tso,12 +lre22_dev_pqksl,afr-afr,14 +lre22_dev_pqnvh,zul-zul,19 +lre22_dev_prcus,tso-tso,15 +lre22_dev_prhoh,tir-tir,19 +lre22_dev_prkth,ara-arq,12 +lre22_dev_prnhd,xho-xho,18 +lre22_dev_psjma,fra-ntf,18 +lre22_dev_psldq,tir-tir,19 +lre22_dev_psnvo,afr-afr,15 +lre22_dev_psnzj,zul-zul,19 +lre22_dev_pudqr,eng-ens,17 +lre22_dev_pufnl,orm-orm,19 +lre22_dev_pusxa,nbl-nbl,22 +lre22_dev_pvsqi,ara-arq,11 +lre22_dev_pvteg,fra-ntf,17 +lre22_dev_pvvay,tir-tir,14 +lre22_dev_pvxcv,ara-aeb,15 +lre22_dev_pvygc,ara-aeb,11 +lre22_dev_pwcxu,tir-tir,13 +lre22_dev_pwhdm,nbl-nbl,17 +lre22_dev_pwnkz,ven-ven,20 +lre22_dev_pwrqe,ara-aeb,14 +lre22_dev_pxbhi,afr-afr,16 +lre22_dev_pxeyk,zul-zul,18 +lre22_dev_pxkzd,ara-arq,14 +lre22_dev_pydgm,afr-afr,19 +lre22_dev_pyiju,ven-ven,20 +lre22_dev_pzhrc,tso-tso,13 +lre22_dev_pzkea,ven-ven,14 +lre22_dev_pzqka,ara-arq,11 +lre22_dev_pzuis,ara-arq,13 +lre22_dev_qabac,ven-ven,19 +lre22_dev_qahym,ara-ayl,11 +lre22_dev_qaxfr,xho-xho,17 +lre22_dev_qazyc,ara-ayl,14 +lre22_dev_qbcoz,nbl-nbl,22 +lre22_dev_qcavr,eng-iaf,20 +lre22_dev_qcbkh,fra-ntf,18 +lre22_dev_qcbtt,afr-afr,18 +lre22_dev_qclly,xho-xho,22 +lre22_dev_qcqdt,eng-iaf,18 +lre22_dev_qdqzp,zul-zul,17 +lre22_dev_qdwut,eng-ens,16 +lre22_dev_qehxr,afr-afr,22 +lre22_dev_qeqah,tir-tir,16 +lre22_dev_qeyjd,afr-afr,17 +lre22_dev_qfprv,ara-ayl,13 +lre22_dev_qfqhi,ara-ayl,15 +lre22_dev_qgoge,tso-tso,13 +lre22_dev_qgrlb,eng-iaf,16 
+lre22_dev_qgrsu,zul-zul,14 +lre22_dev_qheor,xho-xho,23 +lre22_dev_qhfdz,tso-tso,14 +lre22_dev_qhlol,ven-ven,21 +lre22_dev_qhnfr,zul-zul,15 +lre22_dev_qhvuq,tso-tso,14 +lre22_dev_qibby,afr-afr,23 +lre22_dev_qicen,orm-orm,16 +lre22_dev_qiehd,eng-iaf,14 +lre22_dev_qjbfh,eng-iaf,15 +lre22_dev_qjdln,afr-afr,19 +lre22_dev_qjmro,ara-ayl,11 +lre22_dev_qkgor,zul-zul,16 +lre22_dev_qlgvf,ara-aeb,12 +lre22_dev_qlpjn,eng-iaf,16 +lre22_dev_qmoop,nbl-nbl,16 +lre22_dev_qmqhy,afr-afr,20 +lre22_dev_qmreh,ara-ayl,10 +lre22_dev_qmucf,ven-ven,18 +lre22_dev_qmvnu,fra-ntf,15 +lre22_dev_qmzke,ara-ayl,13 +lre22_dev_qmzxw,orm-orm,21 +lre22_dev_qnams,ven-ven,20 +lre22_dev_qnefv,xho-xho,23 +lre22_dev_qodht,zul-zul,19 +lre22_dev_qoqtk,eng-ens,16 +lre22_dev_qotto,fra-ntf,18 +lre22_dev_qoudd,tso-tso,18 +lre22_dev_qpego,ara-ayl,14 +lre22_dev_qphcb,fra-ntf,22 +lre22_dev_qqkiv,ara-arq,13 +lre22_dev_qqmeu,eng-ens,17 +lre22_dev_qqudk,orm-orm,21 +lre22_dev_qqvdr,orm-orm,23 +lre22_dev_qrbmq,ara-arq,12 +lre22_dev_qrfvx,fra-ntf,22 +lre22_dev_qrsqg,zul-zul,19 +lre22_dev_qrylo,eng-ens,18 +lre22_dev_qsbdh,nbl-nbl,16 +lre22_dev_qsqzo,afr-afr,14 +lre22_dev_qsudg,nbl-nbl,22 +lre22_dev_qszwt,fra-ntf,21 +lre22_dev_qtcmx,nbl-nbl,21 +lre22_dev_qtfpf,zul-zul,16 +lre22_dev_qtkhk,afr-afr,22 +lre22_dev_qtydg,afr-afr,22 +lre22_dev_qujmp,zul-zul,19 +lre22_dev_qulse,eng-ens,17 +lre22_dev_qutbz,eng-ens,18 +lre22_dev_quvqg,ara-aeb,13 +lre22_dev_qvpjs,eng-iaf,19 +lre22_dev_qvtdy,tso-tso,12 +lre22_dev_qvzol,orm-orm,19 +lre22_dev_qwvgm,ara-ayl,13 +lre22_dev_qwzxt,zul-zul,19 +lre22_dev_qxigw,tir-tir,19 +lre22_dev_qxkuu,tso-tso,13 +lre22_dev_qxtss,afr-afr,15 +lre22_dev_qxvbe,nbl-nbl,17 +lre22_dev_qxysh,afr-afr,22 +lre22_dev_qyfba,zul-zul,14 +lre22_dev_qyfov,fra-ntf,19 +lre22_dev_qyjgj,afr-afr,22 +lre22_dev_qyuwy,ara-aeb,15 +lre22_dev_qzfdr,nbl-nbl,18 +lre22_dev_qzldb,eng-iaf,19 +lre22_dev_ranrd,nbl-nbl,22 +lre22_dev_raurj,eng-ens,12 +lre22_dev_rbntq,ara-arq,11 +lre22_dev_rbssw,ara-aeb,11 +lre22_dev_rbwgx,ara-ayl,16 +lre22_dev_rcooi,fra-ntf,18 +lre22_dev_rcyom,ara-ayl,11 +lre22_dev_rdcns,zul-zul,18 +lre22_dev_rdrhv,ara-arq,11 +lre22_dev_rdyxn,eng-iaf,19 +lre22_dev_repec,tir-tir,19 +lre22_dev_rgbby,tso-tso,19 +lre22_dev_rgdvt,fra-ntf,20 +lre22_dev_rguqm,tso-tso,14 +lre22_dev_rgwjy,afr-afr,19 +lre22_dev_rijeq,orm-orm,19 +lre22_dev_rincv,tir-tir,16 +lre22_dev_rindo,zul-zul,17 +lre22_dev_rirhy,ara-arq,11 +lre22_dev_rjikw,fra-ntf,20 +lre22_dev_rjsik,tso-tso,16 +lre22_dev_rjvvj,tso-tso,19 +lre22_dev_rksid,nbl-nbl,22 +lre22_dev_rkycg,ven-ven,21 +lre22_dev_rlamm,zul-zul,15 +lre22_dev_rllya,tso-tso,15 +lre22_dev_rlzrk,eng-ens,14 +lre22_dev_rmxbg,tir-tir,14 +lre22_dev_rnrsy,tir-tir,19 +lre22_dev_rokej,xho-xho,17 +lre22_dev_rooaf,fra-ntf,17 +lre22_dev_rorob,ven-ven,15 +lre22_dev_rowwe,nbl-nbl,17 +lre22_dev_rqcuw,ara-ayl,11 +lre22_dev_rqdte,ara-ayl,10 +lre22_dev_rqpau,tso-tso,15 +lre22_dev_rquba,ven-ven,19 +lre22_dev_rrbgv,afr-afr,20 +lre22_dev_rsvjn,fra-ntf,16 +lre22_dev_rsynm,tir-tir,19 +lre22_dev_rtezn,tir-tir,19 +lre22_dev_rtkum,orm-orm,21 +lre22_dev_rturg,zul-zul,17 +lre22_dev_runwu,tir-tir,16 +lre22_dev_rvbmf,tso-tso,12 +lre22_dev_rvfls,tso-tso,16 +lre22_dev_rvhxb,ara-aeb,11 +lre22_dev_rvufk,orm-orm,20 +lre22_dev_rvzbo,ara-ayl,14 +lre22_dev_rwhfu,xho-xho,16 +lre22_dev_rwhiz,ara-ayl,10 +lre22_dev_rwimz,ven-ven,16 +lre22_dev_rwish,eng-ens,16 +lre22_dev_rwpzp,xho-xho,19 +lre22_dev_rwqlq,tir-tir,19 +lre22_dev_rwsnw,afr-afr,15 +lre22_dev_rwzwb,tso-tso,19 +lre22_dev_rxcjq,ara-arq,13 +lre22_dev_rxcka,ara-arq,14 +lre22_dev_rxgxu,tir-tir,19 
+lre22_dev_rxqxn,nbl-nbl,20 +lre22_dev_rxwip,ara-ayl,10 +lre22_dev_rycca,ven-ven,14 +lre22_dev_rydpu,eng-ens,17 +lre22_dev_ryksb,ven-ven,14 +lre22_dev_rysmu,afr-afr,23 +lre22_dev_rzisy,ara-aeb,13 +lre22_dev_rzpus,ara-arq,15 +lre22_dev_rzqyn,ara-ayl,11 +lre22_dev_rzzca,orm-orm,21 +lre22_dev_sazdy,tso-tso,15 +lre22_dev_sbkip,afr-afr,14 +lre22_dev_sbyek,ara-arq,11 +lre22_dev_scjzn,xho-xho,21 +lre22_dev_scobo,ven-ven,17 +lre22_dev_scqui,orm-orm,16 +lre22_dev_sdccf,ara-arq,14 +lre22_dev_sdcty,tso-tso,19 +lre22_dev_sdebh,ara-ayl,12 +lre22_dev_sedif,orm-orm,21 +lre22_dev_sedug,xho-xho,18 +lre22_dev_seynu,tso-tso,13 +lre22_dev_seyxt,ara-aeb,13 +lre22_dev_sezun,ara-aeb,14 +lre22_dev_sfeyl,ara-aeb,12 +lre22_dev_sfnux,afr-afr,18 +lre22_dev_sfqnk,zul-zul,15 +lre22_dev_sftvb,ara-ayl,11 +lre22_dev_sfwkd,ven-ven,17 +lre22_dev_shgbp,fra-ntf,22 +lre22_dev_shikk,tir-tir,19 +lre22_dev_shpve,afr-afr,21 +lre22_dev_sidjm,ara-ayl,10 +lre22_dev_sihvc,orm-orm,17 +lre22_dev_siiaw,ven-ven,16 +lre22_dev_sinfr,xho-xho,19 +lre22_dev_sipnk,eng-iaf,16 +lre22_dev_sjbcr,tir-tir,19 +lre22_dev_sjdzp,eng-iaf,16 +lre22_dev_sjmsx,ven-ven,19 +lre22_dev_sjsnf,afr-afr,16 +lre22_dev_sjwmd,tir-tir,19 +lre22_dev_sjxce,nbl-nbl,16 +lre22_dev_sjzcc,eng-ens,13 +lre22_dev_sjzsv,fra-ntf,22 +lre22_dev_skegk,afr-afr,18 +lre22_dev_skpib,ven-ven,14 +lre22_dev_slgub,orm-orm,18 +lre22_dev_slryu,nbl-nbl,17 +lre22_dev_slupt,ara-ayl,13 +lre22_dev_smfbl,ara-aeb,14 +lre22_dev_smfon,xho-xho,20 +lre22_dev_smvms,afr-afr,18 +lre22_dev_snegl,xho-xho,18 +lre22_dev_snvvg,tso-tso,14 +lre22_dev_sobpf,orm-orm,19 +lre22_dev_soely,eng-iaf,14 +lre22_dev_sorzd,tir-tir,19 +lre22_dev_spixz,nbl-nbl,18 +lre22_dev_spjcl,fra-ntf,17 +lre22_dev_spzra,tso-tso,17 +lre22_dev_sqaei,xho-xho,23 +lre22_dev_sqime,ven-ven,14 +lre22_dev_srgaw,eng-iaf,15 +lre22_dev_srnhq,ven-ven,16 +lre22_dev_srsng,orm-orm,21 +lre22_dev_srysc,nbl-nbl,17 +lre22_dev_srzgk,eng-ens,16 +lre22_dev_srzsi,ara-aeb,14 +lre22_dev_ssjtt,nbl-nbl,16 +lre22_dev_stajf,xho-xho,21 +lre22_dev_sttfd,ara-aeb,15 +lre22_dev_suevr,ara-aeb,15 +lre22_dev_sumum,afr-afr,18 +lre22_dev_svukm,fra-ntf,20 +lre22_dev_swkzf,tir-tir,17 +lre22_dev_sxqmv,ara-aeb,11 +lre22_dev_sxvuf,ara-aeb,11 +lre22_dev_sydqt,eng-ens,18 +lre22_dev_syooe,eng-ens,14 +lre22_dev_szpip,tir-tir,17 +lre22_dev_szsgp,fra-ntf,19 +lre22_dev_szzuj,ara-ayl,11 +lre22_dev_tabof,orm-orm,19 +lre22_dev_tavcw,ven-ven,19 +lre22_dev_tbjal,xho-xho,22 +lre22_dev_tbxzb,fra-ntf,21 +lre22_dev_tdalr,nbl-nbl,18 +lre22_dev_tdfzf,eng-iaf,17 +lre22_dev_tdlyk,tir-tir,15 +lre22_dev_tefms,fra-ntf,15 +lre22_dev_telgo,xho-xho,19 +lre22_dev_teric,eng-ens,14 +lre22_dev_tfcgx,orm-orm,21 +lre22_dev_tgiid,xho-xho,19 +lre22_dev_tgoea,ara-ayl,13 +lre22_dev_tgrrk,eng-iaf,18 +lre22_dev_tgtyv,tso-tso,12 +lre22_dev_tgzex,tso-tso,12 +lre22_dev_thone,nbl-nbl,17 +lre22_dev_thpnk,afr-afr,18 +lre22_dev_thwls,ven-ven,17 +lre22_dev_tibov,tir-tir,14 +lre22_dev_tidld,tso-tso,16 +lre22_dev_tiezu,eng-ens,17 +lre22_dev_tioqa,nbl-nbl,16 +lre22_dev_tiuym,zul-zul,15 +lre22_dev_tjivp,afr-afr,22 +lre22_dev_tjltd,orm-orm,20 +lre22_dev_tkcqj,ara-aeb,12 +lre22_dev_tkpij,tir-tir,19 +lre22_dev_tkpwp,orm-orm,19 +lre22_dev_tkyuh,tso-tso,12 +lre22_dev_tlkrm,zul-zul,19 +lre22_dev_tlspo,zul-zul,18 +lre22_dev_tmdvx,zul-zul,17 +lre22_dev_tmynp,afr-afr,20 +lre22_dev_tntmu,xho-xho,22 +lre22_dev_tnwok,orm-orm,21 +lre22_dev_toccu,eng-iaf,16 +lre22_dev_tofur,tir-tir,14 +lre22_dev_tokhl,ven-ven,21 +lre22_dev_tonkq,zul-zul,15 +lre22_dev_topxu,zul-zul,14 +lre22_dev_touna,ara-arq,15 +lre22_dev_towvr,tso-tso,12 
+lre22_dev_tpasn,tir-tir,15 +lre22_dev_tpmen,ara-ayl,10 +lre22_dev_tpuws,tir-tir,19 +lre22_dev_tqbqi,xho-xho,17 +lre22_dev_tqtfo,tso-tso,17 +lre22_dev_traqh,fra-ntf,21 +lre22_dev_trdfp,ara-ayl,15 +lre22_dev_trdml,xho-xho,23 +lre22_dev_trmpg,nbl-nbl,19 +lre22_dev_tsdyg,tso-tso,19 +lre22_dev_tsvmo,ara-ayl,11 +lre22_dev_ttcul,afr-afr,19 +lre22_dev_ttrfr,ara-arq,12 +lre22_dev_tuhrp,ven-ven,14 +lre22_dev_twaba,afr-afr,15 +lre22_dev_twcnd,tir-tir,13 +lre22_dev_twtog,ven-ven,15 +lre22_dev_twvne,tir-tir,19 +lre22_dev_txcqg,orm-orm,19 +lre22_dev_txjsy,eng-ens,18 +lre22_dev_txmpu,afr-afr,19 +lre22_dev_txqde,eng-iaf,16 +lre22_dev_tyaup,eng-ens,17 +lre22_dev_tyaym,afr-afr,17 +lre22_dev_tybrl,nbl-nbl,16 +lre22_dev_tyduc,eng-ens,17 +lre22_dev_tyhsa,fra-ntf,21 +lre22_dev_tyigo,ara-ayl,11 +lre22_dev_tykte,zul-zul,18 +lre22_dev_tymil,tir-tir,16 +lre22_dev_tyofb,ven-ven,20 +lre22_dev_tysph,fra-ntf,16 +lre22_dev_tzamn,ara-aeb,11 +lre22_dev_tzrpp,ven-ven,15 +lre22_dev_tzukm,ara-aeb,12 +lre22_dev_uabum,xho-xho,19 +lre22_dev_uankd,nbl-nbl,18 +lre22_dev_uazyk,ara-ayl,14 +lre22_dev_ubdfa,eng-iaf,15 +lre22_dev_ubugi,orm-orm,22 +lre22_dev_ucetp,ven-ven,21 +lre22_dev_ucsxt,eng-ens,12 +lre22_dev_uczke,zul-zul,14 +lre22_dev_udldh,ara-arq,11 +lre22_dev_uejdk,orm-orm,17 +lre22_dev_uekog,zul-zul,17 +lre22_dev_uemql,xho-xho,16 +lre22_dev_ueovt,eng-ens,14 +lre22_dev_uesao,zul-zul,19 +lre22_dev_ueyxm,ara-ayl,13 +lre22_dev_ufafi,tir-tir,17 +lre22_dev_ufaig,tso-tso,12 +lre22_dev_uffpc,ara-arq,14 +lre22_dev_ufrmg,ven-ven,20 +lre22_dev_ugieb,ara-aeb,12 +lre22_dev_ugoiy,ara-ayl,10 +lre22_dev_ugzkq,ara-aeb,12 +lre22_dev_uhdrj,xho-xho,18 +lre22_dev_uhjdn,ara-ayl,16 +lre22_dev_uhkcq,ara-ayl,11 +lre22_dev_uhrjo,ara-aeb,13 +lre22_dev_uhrow,afr-afr,16 +lre22_dev_uikqm,ara-arq,12 +lre22_dev_uitct,eng-ens,13 +lre22_dev_uitqu,ara-ayl,12 +lre22_dev_ujiby,eng-ens,18 +lre22_dev_ujmtl,orm-orm,22 +lre22_dev_ukdpu,ven-ven,17 +lre22_dev_ukfpb,xho-xho,19 +lre22_dev_ukklw,fra-ntf,22 +lre22_dev_ukwjy,xho-xho,17 +lre22_dev_uljbx,fra-ntf,20 +lre22_dev_uljgh,tir-tir,13 +lre22_dev_uljvo,fra-ntf,21 +lre22_dev_undfd,orm-orm,20 +lre22_dev_unmiu,ara-arq,14 +lre22_dev_updar,nbl-nbl,17 +lre22_dev_uprkv,eng-iaf,16 +lre22_dev_urkok,ara-ayl,11 +lre22_dev_urolj,orm-orm,22 +lre22_dev_uscpv,eng-ens,14 +lre22_dev_ushtk,fra-ntf,20 +lre22_dev_usiey,ven-ven,19 +lre22_dev_usitw,ara-arq,14 +lre22_dev_utkxp,nbl-nbl,19 +lre22_dev_utnvo,tir-tir,16 +lre22_dev_utyjg,tso-tso,18 +lre22_dev_uuwaa,ara-arq,12 +lre22_dev_uuxla,eng-iaf,15 +lre22_dev_uuzuj,ara-arq,14 +lre22_dev_uvcxs,eng-ens,12 +lre22_dev_uveah,ven-ven,17 +lre22_dev_uvfqy,ara-arq,13 +lre22_dev_uvnhb,fra-ntf,20 +lre22_dev_uvqbm,afr-afr,19 +lre22_dev_uvsus,zul-zul,15 +lre22_dev_uvyev,fra-ntf,20 +lre22_dev_uwicd,tso-tso,12 +lre22_dev_uwnlz,zul-zul,18 +lre22_dev_uwwyj,afr-afr,20 +lre22_dev_uwyxc,eng-iaf,17 +lre22_dev_uxjzh,xho-xho,21 +lre22_dev_uxpyg,tso-tso,15 +lre22_dev_uxrxr,tso-tso,12 +lre22_dev_uyciz,eng-ens,14 +lre22_dev_uycza,xho-xho,17 +lre22_dev_uyvyb,eng-ens,17 +lre22_dev_uziar,zul-zul,15 +lre22_dev_uzlxd,fra-ntf,22 +lre22_dev_uznjr,tir-tir,13 +lre22_dev_vagda,ara-ayl,12 +lre22_dev_vanjm,ven-ven,18 +lre22_dev_vaqia,tir-tir,19 +lre22_dev_vasjz,ara-arq,11 +lre22_dev_vcexs,tir-tir,17 +lre22_dev_vchpm,fra-ntf,21 +lre22_dev_vctsa,nbl-nbl,19 +lre22_dev_vcxit,ven-ven,15 +lre22_dev_vcyqv,xho-xho,19 +lre22_dev_vdjlh,afr-afr,22 +lre22_dev_vdogx,ven-ven,15 +lre22_dev_veutb,eng-ens,16 +lre22_dev_vezrd,tso-tso,12 +lre22_dev_vfbfg,tso-tso,12 +lre22_dev_vffqd,orm-orm,21 +lre22_dev_vfhum,afr-afr,16 
+lre22_dev_vfjtw,ara-arq,11 +lre22_dev_vfnjb,eng-ens,15 +lre22_dev_vgbbh,ara-arq,13 +lre22_dev_vgcao,eng-iaf,20 +lre22_dev_vgpnk,xho-xho,19 +lre22_dev_vityk,zul-zul,18 +lre22_dev_vjeuy,tir-tir,19 +lre22_dev_vjltt,zul-zul,17 +lre22_dev_vjqrm,tir-tir,13 +lre22_dev_vjvbs,tso-tso,18 +lre22_dev_vlcbq,tso-tso,16 +lre22_dev_vlnlb,tso-tso,13 +lre22_dev_vlscu,ara-ayl,15 +lre22_dev_vlwhz,fra-ntf,22 +lre22_dev_vlyeh,tso-tso,16 +lre22_dev_vmnps,zul-zul,14 +lre22_dev_vmqxk,tso-tso,18 +lre22_dev_vmrez,ven-ven,18 +lre22_dev_vmsnh,ara-aeb,11 +lre22_dev_vmuti,ara-aeb,14 +lre22_dev_vncre,afr-afr,22 +lre22_dev_vnkqv,afr-afr,15 +lre22_dev_vnmlt,zul-zul,18 +lre22_dev_vpkra,ara-ayl,11 +lre22_dev_vpoit,ara-arq,14 +lre22_dev_vpruu,orm-orm,23 +lre22_dev_vptiv,tir-tir,18 +lre22_dev_vqhcn,tso-tso,16 +lre22_dev_vqura,tir-tir,16 +lre22_dev_vrqfs,xho-xho,23 +lre22_dev_vrvtr,zul-zul,15 +lre22_dev_vrxvj,fra-ntf,17 +lre22_dev_vsbay,eng-iaf,19 +lre22_dev_vsbvi,fra-ntf,19 +lre22_dev_vslkb,eng-ens,12 +lre22_dev_vsrdg,tso-tso,12 +lre22_dev_vsrnz,zul-zul,14 +lre22_dev_vsryb,nbl-nbl,19 +lre22_dev_vtlab,zul-zul,19 +lre22_dev_vtrff,eng-iaf,17 +lre22_dev_vtztf,ara-aeb,11 +lre22_dev_vucth,eng-ens,14 +lre22_dev_vucug,orm-orm,21 +lre22_dev_vufuu,eng-ens,18 +lre22_dev_vujbs,zul-zul,19 +lre22_dev_vuufm,afr-afr,19 +lre22_dev_vvgdf,eng-ens,18 +lre22_dev_vvlcx,ara-aeb,12 +lre22_dev_vvvho,tir-tir,18 +lre22_dev_vwait,eng-iaf,14 +lre22_dev_vwdcw,ara-arq,14 +lre22_dev_vwyzq,ara-arq,14 +lre22_dev_vwzon,eng-ens,12 +lre22_dev_vxhoc,ara-aeb,11 +lre22_dev_vxkgz,ven-ven,18 +lre22_dev_vxlgl,tir-tir,18 +lre22_dev_vxsqt,eng-ens,15 +lre22_dev_vyqsd,nbl-nbl,17 +lre22_dev_vzcai,zul-zul,19 +lre22_dev_vzgoj,eng-iaf,14 +lre22_dev_vzlon,zul-zul,16 +lre22_dev_vznrg,nbl-nbl,16 +lre22_dev_vzqme,xho-xho,19 +lre22_dev_wabqx,ven-ven,18 +lre22_dev_wafdh,fra-ntf,21 +lre22_dev_wagmt,eng-iaf,18 +lre22_dev_waocz,ven-ven,20 +lre22_dev_wavrh,zul-zul,16 +lre22_dev_wawqg,ara-ayl,13 +lre22_dev_waznj,nbl-nbl,22 +lre22_dev_wbepu,fra-ntf,19 +lre22_dev_wbygw,eng-ens,16 +lre22_dev_wccgz,tso-tso,17 +lre22_dev_wcpwx,tir-tir,18 +lre22_dev_wczkn,eng-iaf,17 +lre22_dev_wdfmt,tir-tir,17 +lre22_dev_wdgbh,ara-arq,12 +lre22_dev_wdind,tso-tso,19 +lre22_dev_wdkit,nbl-nbl,16 +lre22_dev_wdmpt,eng-ens,17 +lre22_dev_wdpya,nbl-nbl,16 +lre22_dev_wdrxo,orm-orm,21 +lre22_dev_wdyiy,ara-ayl,13 +lre22_dev_weccy,afr-afr,15 +lre22_dev_wfmco,ara-arq,14 +lre22_dev_wfnon,nbl-nbl,17 +lre22_dev_wgdui,eng-iaf,14 +lre22_dev_wgkmr,eng-iaf,17 +lre22_dev_wgnex,tir-tir,19 +lre22_dev_wgucy,eng-iaf,18 +lre22_dev_wgwdn,eng-iaf,17 +lre22_dev_whqhx,eng-iaf,15 +lre22_dev_whxwv,eng-ens,14 +lre22_dev_witnq,fra-ntf,17 +lre22_dev_wixzu,tso-tso,16 +lre22_dev_wjhbw,eng-iaf,16 +lre22_dev_wjist,orm-orm,16 +lre22_dev_wjnhh,zul-zul,19 +lre22_dev_wjnyo,ven-ven,20 +lre22_dev_wjtnm,orm-orm,19 +lre22_dev_wjzhz,ara-aeb,13 +lre22_dev_wkacx,eng-iaf,15 +lre22_dev_wkqey,fra-ntf,16 +lre22_dev_wldli,zul-zul,14 +lre22_dev_wlnst,nbl-nbl,16 +lre22_dev_wltvq,zul-zul,17 +lre22_dev_wlwhq,orm-orm,19 +lre22_dev_wmdan,xho-xho,21 +lre22_dev_wmfce,nbl-nbl,20 +lre22_dev_wmigl,ven-ven,20 +lre22_dev_wmwmc,eng-iaf,19 +lre22_dev_wmypk,xho-xho,19 +lre22_dev_wmzpv,eng-ens,17 +lre22_dev_wnjpz,ven-ven,19 +lre22_dev_wnmkt,orm-orm,23 +lre22_dev_wnpep,nbl-nbl,16 +lre22_dev_wnqhz,nbl-nbl,16 +lre22_dev_wnxpz,ven-ven,15 +lre22_dev_wnxrw,ven-ven,18 +lre22_dev_woawg,ven-ven,18 +lre22_dev_wobzv,eng-ens,14 +lre22_dev_wocbv,tso-tso,18 +lre22_dev_woerb,fra-ntf,21 +lre22_dev_wojrt,orm-orm,19 +lre22_dev_wosus,tir-tir,17 +lre22_dev_wozuc,xho-xho,19 
+lre22_dev_wqcyu,tso-tso,15 +lre22_dev_wqfuv,eng-ens,17 +lre22_dev_wqhag,zul-zul,19 +lre22_dev_wqmsd,tir-tir,13 +lre22_dev_wqthl,ara-aeb,12 +lre22_dev_wqtvm,eng-ens,15 +lre22_dev_wrmnw,zul-zul,18 +lre22_dev_wrtec,zul-zul,17 +lre22_dev_wrvls,zul-zul,14 +lre22_dev_wscfs,nbl-nbl,16 +lre22_dev_wssqw,eng-ens,15 +lre22_dev_wtbdf,tir-tir,14 +lre22_dev_wtcpe,ara-aeb,11 +lre22_dev_wthrk,orm-orm,18 +lre22_dev_wtofd,eng-iaf,20 +lre22_dev_wtuol,tso-tso,18 +lre22_dev_wuqez,ara-aeb,11 +lre22_dev_wuquc,tir-tir,18 +lre22_dev_wvlde,tso-tso,13 +lre22_dev_wwbmg,ara-aeb,11 +lre22_dev_wwduf,fra-ntf,18 +lre22_dev_wwvuw,ara-arq,13 +lre22_dev_wxaev,orm-orm,17 +lre22_dev_wycsj,ven-ven,18 +lre22_dev_wypwj,ara-ayl,10 +lre22_dev_wytpq,fra-ntf,17 +lre22_dev_wzhqk,xho-xho,22 +lre22_dev_wzpmq,eng-ens,12 +lre22_dev_wztdj,zul-zul,19 +lre22_dev_wzxgv,ven-ven,18 +lre22_dev_xacjk,fra-ntf,18 +lre22_dev_xaevp,tir-tir,14 +lre22_dev_xaldr,eng-iaf,14 +lre22_dev_xapdy,ara-aeb,12 +lre22_dev_xaurw,nbl-nbl,16 +lre22_dev_xawdd,tir-tir,20 +lre22_dev_xbcpb,ara-arq,12 +lre22_dev_xbfrs,ven-ven,17 +lre22_dev_xbqsr,nbl-nbl,22 +lre22_dev_xbvcc,nbl-nbl,17 +lre22_dev_xbvqw,orm-orm,23 +lre22_dev_xcame,xho-xho,16 +lre22_dev_xcrnp,ara-aeb,13 +lre22_dev_xcswu,ven-ven,18 +lre22_dev_xcuok,orm-orm,21 +lre22_dev_xcvkj,tso-tso,16 +lre22_dev_xdtdp,fra-ntf,17 +lre22_dev_xdyea,ara-ayl,10 +lre22_dev_xerqi,fra-ntf,17 +lre22_dev_xetdb,eng-ens,14 +lre22_dev_xfecy,nbl-nbl,16 +lre22_dev_xfgcu,eng-iaf,19 +lre22_dev_xfing,tir-tir,20 +lre22_dev_xgaig,ara-aeb,15 +lre22_dev_xgoyq,eng-ens,18 +lre22_dev_xhdtx,eng-iaf,14 +lre22_dev_xhvkx,orm-orm,19 +lre22_dev_xiblr,tir-tir,17 +lre22_dev_xifty,ara-aeb,12 +lre22_dev_xigtx,ara-arq,14 +lre22_dev_xijus,tso-tso,14 +lre22_dev_xipox,xho-xho,20 +lre22_dev_xittq,ara-aeb,13 +lre22_dev_xjpwq,ara-ayl,15 +lre22_dev_xjrla,afr-afr,20 +lre22_dev_xkdof,ara-ayl,13 +lre22_dev_xkiba,eng-ens,18 +lre22_dev_xlcxh,fra-ntf,18 +lre22_dev_xlsxb,tso-tso,16 +lre22_dev_xmhpj,ven-ven,20 +lre22_dev_xnqct,ara-arq,11 +lre22_dev_xoayi,eng-ens,13 +lre22_dev_xohps,ara-arq,11 +lre22_dev_xokpn,zul-zul,18 +lre22_dev_xonym,eng-ens,14 +lre22_dev_xozod,afr-afr,14 +lre22_dev_xpenp,ara-arq,11 +lre22_dev_xpnti,ara-aeb,11 +lre22_dev_xpqyr,orm-orm,22 +lre22_dev_xpswt,orm-orm,23 +lre22_dev_xpumn,ven-ven,14 +lre22_dev_xpvcf,orm-orm,20 +lre22_dev_xqhoa,ara-ayl,13 +lre22_dev_xqnpt,orm-orm,22 +lre22_dev_xqooi,xho-xho,20 +lre22_dev_xqupu,fra-ntf,21 +lre22_dev_xresy,eng-iaf,17 +lre22_dev_xrouj,ara-ayl,16 +lre22_dev_xsnxu,ara-aeb,12 +lre22_dev_xtaof,ara-ayl,13 +lre22_dev_xtbxk,orm-orm,20 +lre22_dev_xtgak,nbl-nbl,20 +lre22_dev_xuauh,ara-aeb,13 +lre22_dev_xubei,eng-iaf,17 +lre22_dev_xubol,ara-aeb,11 +lre22_dev_xuieb,orm-orm,19 +lre22_dev_xunxs,ara-ayl,14 +lre22_dev_xutjo,nbl-nbl,20 +lre22_dev_xvbos,afr-afr,22 +lre22_dev_xvcfn,eng-ens,16 +lre22_dev_xvgqo,eng-ens,12 +lre22_dev_xwemk,zul-zul,18 +lre22_dev_xwsyq,ara-ayl,14 +lre22_dev_xxdbg,tso-tso,18 +lre22_dev_xyoua,fra-ntf,22 +lre22_dev_xzoej,ara-aeb,13 +lre22_dev_xzrdl,ara-arq,13 +lre22_dev_xztsz,tso-tso,16 +lre22_dev_xzxbd,zul-zul,15 +lre22_dev_yagvv,tso-tso,13 +lre22_dev_ybqju,tso-tso,13 +lre22_dev_ybrji,ara-arq,11 +lre22_dev_ybsmy,ven-ven,21 +lre22_dev_ycbaf,ara-aeb,14 +lre22_dev_ychsm,ven-ven,14 +lre22_dev_ycrlj,xho-xho,17 +lre22_dev_ycuhc,orm-orm,21 +lre22_dev_ydhqc,ara-arq,13 +lre22_dev_ydmnb,nbl-nbl,17 +lre22_dev_yduem,xho-xho,21 +lre22_dev_yemzu,ara-aeb,11 +lre22_dev_yeoyx,eng-ens,18 +lre22_dev_yersp,ara-ayl,13 +lre22_dev_yeshv,eng-iaf,17 +lre22_dev_yexec,ven-ven,20 +lre22_dev_yeyna,ara-ayl,14 
+lre22_dev_yfxmd,ara-arq,14 +lre22_dev_yfzah,ara-arq,14 +lre22_dev_ygkvo,ara-arq,11 +lre22_dev_yhgvr,ara-arq,15 +lre22_dev_yhwin,ara-arq,12 +lre22_dev_yirig,ara-ayl,16 +lre22_dev_yixgu,xho-xho,16 +lre22_dev_yjbfl,xho-xho,19 +lre22_dev_yjodc,eng-ens,14 +lre22_dev_yjoht,ara-aeb,12 +lre22_dev_yjqkb,ara-arq,14 +lre22_dev_yjrkq,ara-arq,15 +lre22_dev_yjrng,afr-afr,16 +lre22_dev_ykpzq,afr-afr,21 +lre22_dev_yktop,eng-iaf,20 +lre22_dev_ylfah,zul-zul,15 +lre22_dev_ylgex,tso-tso,14 +lre22_dev_ylkds,nbl-nbl,17 +lre22_dev_ylvyc,xho-xho,20 +lre22_dev_ylzic,eng-iaf,20 +lre22_dev_ymoon,afr-afr,17 +lre22_dev_yncqr,ara-arq,13 +lre22_dev_ynjtn,ven-ven,18 +lre22_dev_ynmzy,tso-tso,16 +lre22_dev_ynozi,fra-ntf,21 +lre22_dev_yntec,orm-orm,19 +lre22_dev_ynurl,tso-tso,14 +lre22_dev_ypdtt,ara-aeb,11 +lre22_dev_yprom,tso-tso,13 +lre22_dev_yptsk,xho-xho,23 +lre22_dev_ypyft,eng-iaf,14 +lre22_dev_yqhwt,orm-orm,23 +lre22_dev_yqtxe,eng-iaf,19 +lre22_dev_yquja,ara-ayl,10 +lre22_dev_yqxhl,eng-ens,14 +lre22_dev_yqyby,nbl-nbl,18 +lre22_dev_yqzua,fra-ntf,16 +lre22_dev_yrfxo,ven-ven,21 +lre22_dev_yrgzf,ara-aeb,13 +lre22_dev_yruqe,tso-tso,17 +lre22_dev_yrwgb,zul-zul,18 +lre22_dev_yrxsi,orm-orm,21 +lre22_dev_ysdkl,tso-tso,15 +lre22_dev_ytgav,xho-xho,16 +lre22_dev_ytoet,ara-arq,14 +lre22_dev_yuabg,eng-ens,16 +lre22_dev_yundm,tso-tso,14 +lre22_dev_yuvux,ara-ayl,13 +lre22_dev_yvdcv,fra-ntf,21 +lre22_dev_yvoli,orm-orm,23 +lre22_dev_yweox,orm-orm,21 +lre22_dev_ywgoc,eng-iaf,19 +lre22_dev_ywoyx,ven-ven,18 +lre22_dev_ywxql,zul-zul,19 +lre22_dev_yxkyl,eng-iaf,15 +lre22_dev_yxtmn,ara-aeb,14 +lre22_dev_yycsn,ara-ayl,12 +lre22_dev_yyswd,eng-iaf,16 +lre22_dev_yyugr,ven-ven,21 +lre22_dev_yzitu,orm-orm,20 +lre22_dev_yzwmi,eng-ens,16 +lre22_dev_yzzww,zul-zul,17 +lre22_dev_zabub,ara-ayl,16 +lre22_dev_zabuv,eng-iaf,14 +lre22_dev_zacuc,zul-zul,19 +lre22_dev_zavru,zul-zul,19 +lre22_dev_zbfgy,ara-arq,12 +lre22_dev_zbjez,nbl-nbl,17 +lre22_dev_zbtpo,ven-ven,18 +lre22_dev_zbzip,tso-tso,19 +lre22_dev_zcevz,nbl-nbl,16 +lre22_dev_zcnsv,afr-afr,21 +lre22_dev_zcqkl,eng-iaf,20 +lre22_dev_zczer,ven-ven,14 +lre22_dev_zdcdt,nbl-nbl,18 +lre22_dev_zddua,xho-xho,19 +lre22_dev_zdvsh,ara-arq,14 +lre22_dev_zdwxx,ara-ayl,14 +lre22_dev_zdyxi,tir-tir,14 +lre22_dev_zetju,eng-iaf,17 +lre22_dev_zfsek,ara-arq,11 +lre22_dev_zfvfa,eng-ens,18 +lre22_dev_zggiu,zul-zul,19 +lre22_dev_zgndz,tso-tso,14 +lre22_dev_zgxth,eng-ens,16 +lre22_dev_zhlxa,ara-ayl,14 +lre22_dev_zhnsb,ara-ayl,15 +lre22_dev_zhsmo,ara-aeb,13 +lre22_dev_zhvbf,xho-xho,18 +lre22_dev_zhzrh,eng-iaf,15 +lre22_dev_ziigd,orm-orm,21 +lre22_dev_zilud,tir-tir,19 +lre22_dev_zjivp,zul-zul,19 +lre22_dev_zjleg,zul-zul,19 +lre22_dev_zjquq,orm-orm,16 +lre22_dev_zkgjo,nbl-nbl,22 +lre22_dev_zkhes,fra-ntf,16 +lre22_dev_zkioq,ara-aeb,12 +lre22_dev_zkwaw,afr-afr,21 +lre22_dev_zlapc,ara-ayl,13 +lre22_dev_zlntm,zul-zul,19 +lre22_dev_zmmyn,xho-xho,23 +lre22_dev_zmxld,ven-ven,17 +lre22_dev_znhcf,ven-ven,21 +lre22_dev_znwsk,afr-afr,22 +lre22_dev_znxvg,eng-ens,18 +lre22_dev_znycz,ara-aeb,13 +lre22_dev_zoayx,zul-zul,18 +lre22_dev_zogte,nbl-nbl,16 +lre22_dev_zoldl,ara-aeb,12 +lre22_dev_zoqzl,eng-ens,17 +lre22_dev_zorfv,eng-iaf,16 +lre22_dev_zoseh,ara-arq,12 +lre22_dev_zpotb,xho-xho,16 +lre22_dev_zptbg,tir-tir,14 +lre22_dev_zqjzi,ara-aeb,11 +lre22_dev_zqljj,ara-aeb,14 +lre22_dev_zqlri,orm-orm,18 +lre22_dev_zqoif,zul-zul,19 +lre22_dev_zqorv,ara-aeb,12 +lre22_dev_zqwgs,fra-ntf,18 +lre22_dev_zrhbt,tir-tir,19 +lre22_dev_zrqar,ara-aeb,13 +lre22_dev_zrqec,eng-iaf,17 +lre22_dev_ztdrx,fra-ntf,15 +lre22_dev_ztdwr,orm-orm,17 
+lre22_dev_zthiv,ara-arq,15 +lre22_dev_ztknh,xho-xho,18 +lre22_dev_ztlcq,ara-aeb,13 +lre22_dev_ztufj,fra-ntf,19 +lre22_dev_zubjl,fra-ntf,20 +lre22_dev_zunuw,tso-tso,17 +lre22_dev_zutul,tir-tir,13 +lre22_dev_zutvv,eng-ens,12 +lre22_dev_zuugc,eng-iaf,17 +lre22_dev_zuvqx,eng-iaf,14 +lre22_dev_zvthu,orm-orm,20 +lre22_dev_zvvov,ara-aeb,11 +lre22_dev_zvyuh,ara-arq,14 +lre22_dev_zwfqq,eng-iaf,17 +lre22_dev_zwosr,xho-xho,16 +lre22_dev_zwvhw,tso-tso,12 +lre22_dev_zxihz,ven-ven,14 +lre22_dev_zydma,eng-ens,12 +lre22_dev_zyqlz,zul-zul,19 +lre22_dev_zyyie,orm-orm,23 +lre22_dev_zyywo,eng-iaf,14 +lre22_dev_zzyze,ara-ayl,12
diff --git a/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_1/train_segments.csv b/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_1/train_segments.csv
new file mode 100644
index 00000000..6518f24e
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_1/train_segments.csv
@@ -0,0 +1,2114 @@
+id,class_id,subclass_idx
+lre22_dev_aadaq,afr-afr,5 +lre22_dev_aaxdt,xho-xho,14 +lre22_dev_abujj,xho-xho,15 +lre22_dev_acgiu,zul-zul,6 +lre22_dev_acnyv,ven-ven,7 +lre22_dev_adbku,ara-ayl,4 +lre22_dev_ademr,orm-orm,3 +lre22_dev_adgoy,xho-xho,4 +lre22_dev_adnpi,eng-ens,1 +lre22_dev_adqaa,ven-ven,10 +lre22_dev_adwzf,zul-zul,2 +lre22_dev_aeiuj,afr-afr,4 +lre22_dev_afhui,eng-ens,4 +lre22_dev_afuav,nbl-nbl,15 +lre22_dev_afvvg,ven-ven,10 +lre22_dev_afxjf,eng-iaf,10 +lre22_dev_agmwb,ara-aeb,10 +lre22_dev_agnik,eng-ens,3 +lre22_dev_ahcja,orm-orm,14 +lre22_dev_ahobp,afr-afr,13 +lre22_dev_ahupk,eng-ens,11 +lre22_dev_aicjg,xho-xho,12 +lre22_dev_aikrz,eng-ens,9 +lre22_dev_ailwo,orm-orm,7 +lre22_dev_aiqhl,tir-tir,10 +lre22_dev_aiuwf,ara-ayl,5 +lre22_dev_aizyr,ara-arq,0 +lre22_dev_ajbui,zul-zul,12 +lre22_dev_ajigk,ara-aeb,10 +lre22_dev_ajuwq,ara-ayl,3 +lre22_dev_akbly,nbl-nbl,3 +lre22_dev_akhwr,xho-xho,6 +lre22_dev_aksxd,nbl-nbl,6 +lre22_dev_aktcg,afr-afr,1 +lre22_dev_aktzw,eng-ens,11 +lre22_dev_akulq,orm-orm,14 +lre22_dev_alcie,orm-orm,11 +lre22_dev_alunz,xho-xho,6 +lre22_dev_amaec,tir-tir,10 +lre22_dev_amnvo,ara-arq,6 +lre22_dev_amxrk,zul-zul,9 +lre22_dev_anmuv,tso-tso,11 +lre22_dev_aomcz,ara-aeb,7 +lre22_dev_aooht,fra-ntf,11 +lre22_dev_aprbe,ara-arq,3 +lre22_dev_apxxx,orm-orm,12 +lre22_dev_aqdwu,ven-ven,6 +lre22_dev_aqejl,xho-xho,5 +lre22_dev_aqnyy,tso-tso,5 +lre22_dev_arjuc,afr-afr,5 +lre22_dev_arrkp,tir-tir,1 +lre22_dev_atdgp,zul-zul,13 +lre22_dev_atoxn,eng-ens,10 +lre22_dev_audls,afr-afr,6 +lre22_dev_auilj,ven-ven,11 +lre22_dev_auqgt,eng-iaf,3 +lre22_dev_autlo,zul-zul,7 +lre22_dev_avait,zul-zul,3 +lre22_dev_avvik,nbl-nbl,14 +lre22_dev_awgem,ara-ayl,3 +lre22_dev_awgnb,fra-ntf,14 +lre22_dev_awvym,ara-ayl,9 +lre22_dev_axhbz,tir-tir,12 +lre22_dev_axici,tir-tir,8 +lre22_dev_axtpv,xho-xho,6 +lre22_dev_aygsz,ara-aeb,4 +lre22_dev_ayiif,ven-ven,7 +lre22_dev_azqvo,zul-zul,3 +lre22_dev_basml,eng-ens,11 +lre22_dev_bawje,tir-tir,6 +lre22_dev_bbana,zul-zul,7 +lre22_dev_bbtpz,ven-ven,5 +lre22_dev_bcbrw,eng-iaf,2 +lre22_dev_bchvx,zul-zul,9 +lre22_dev_bcllp,afr-afr,13 +lre22_dev_bcsmi,fra-ntf,6 +lre22_dev_bdqaw,ven-ven,6 +lre22_dev_bdwle,ara-arq,6 +lre22_dev_behbh,ara-ayl,4 +lre22_dev_bexda,ara-arq,6 +lre22_dev_bfbyn,ara-aeb,9 +lre22_dev_bfjgx,ara-ayl,7 +lre22_dev_bgbjo,nbl-nbl,1 +lre22_dev_bgebs,ara-ayl,5 +lre22_dev_bgnod,fra-ntf,3 +lre22_dev_bhezb,ara-ayl,7 +lre22_dev_bhyuy,afr-afr,13 +lre22_dev_bidge,tir-tir,12 +lre22_dev_bimnd,eng-ens,7 +lre22_dev_biyaj,ara-ayl,5 +lre22_dev_bjsgu,afr-afr,10 +lre22_dev_blmfp,eng-iaf,5 +lre22_dev_blohd,ven-ven,4 +lre22_dev_bmebz,ara-arq,4 +lre22_dev_bmjuo,ara-aeb,6
+lre22_dev_bmkrm,fra-ntf,10 +lre22_dev_bmzym,zul-zul,5 +lre22_dev_bnfuu,orm-orm,13 +lre22_dev_bnilb,zul-zul,8 +lre22_dev_bnxna,eng-ens,1 +lre22_dev_boikl,orm-orm,7 +lre22_dev_boisz,ven-ven,2 +lre22_dev_boqxy,zul-zul,13 +lre22_dev_bpqhd,tso-tso,2 +lre22_dev_briiw,ara-aeb,8 +lre22_dev_brohj,fra-ntf,1 +lre22_dev_brqdv,nbl-nbl,3 +lre22_dev_brwcj,afr-afr,6 +lre22_dev_bsclv,orm-orm,8 +lre22_dev_bsdbb,ara-arq,4 +lre22_dev_bstjt,nbl-nbl,10 +lre22_dev_btbke,ara-aeb,0 +lre22_dev_btcfj,ven-ven,12 +lre22_dev_btomw,ven-ven,6 +lre22_dev_btpvy,afr-afr,1 +lre22_dev_btrtb,ara-arq,4 +lre22_dev_btruf,zul-zul,8 +lre22_dev_btsll,ara-ayl,7 +lre22_dev_butrw,ara-ayl,6 +lre22_dev_buwrj,ara-ayl,2 +lre22_dev_bvlhb,fra-ntf,8 +lre22_dev_bvmql,xho-xho,10 +lre22_dev_bvnsc,tir-tir,10 +lre22_dev_bwrej,ven-ven,9 +lre22_dev_bxial,eng-ens,2 +lre22_dev_bxnbf,fra-ntf,9 +lre22_dev_bybim,afr-afr,6 +lre22_dev_byegp,orm-orm,15 +lre22_dev_byngq,ven-ven,9 +lre22_dev_byytf,fra-ntf,6 +lre22_dev_bzies,tso-tso,3 +lre22_dev_bzipd,afr-afr,7 +lre22_dev_cacop,nbl-nbl,5 +lre22_dev_caent,afr-afr,12 +lre22_dev_capsb,ven-ven,0 +lre22_dev_cawbw,orm-orm,12 +lre22_dev_cblep,ven-ven,3 +lre22_dev_cblig,fra-ntf,6 +lre22_dev_ccexy,ven-ven,7 +lre22_dev_ccsye,ara-aeb,8 +lre22_dev_cctyt,eng-iaf,11 +lre22_dev_ccuie,eng-ens,7 +lre22_dev_ccvzf,eng-iaf,1 +lre22_dev_cdlkq,tso-tso,8 +lre22_dev_cdtiu,ara-ayl,9 +lre22_dev_cemyb,tir-tir,12 +lre22_dev_ceprg,eng-iaf,9 +lre22_dev_ceqow,nbl-nbl,15 +lre22_dev_cfdsu,fra-ntf,7 +lre22_dev_cfhbm,ven-ven,3 +lre22_dev_cfsew,afr-afr,12 +lre22_dev_cgges,eng-iaf,11 +lre22_dev_cgjnr,eng-iaf,10 +lre22_dev_cgotg,eng-ens,11 +lre22_dev_cgovb,nbl-nbl,15 +lre22_dev_cgssg,tir-tir,7 +lre22_dev_chhsl,tir-tir,7 +lre22_dev_chjuh,nbl-nbl,9 +lre22_dev_chpoe,nbl-nbl,11 +lre22_dev_chtgu,ara-aeb,10 +lre22_dev_chtlt,eng-iaf,10 +lre22_dev_cigir,eng-ens,9 +lre22_dev_ciyeh,ara-ayl,2 +lre22_dev_cjswm,orm-orm,12 +lre22_dev_cjtdl,ven-ven,13 +lre22_dev_ckzie,ara-aeb,10 +lre22_dev_cldfc,ara-ayl,8 +lre22_dev_clxqz,ara-arq,9 +lre22_dev_cmahj,afr-afr,13 +lre22_dev_cmqxm,tir-tir,6 +lre22_dev_cmrdt,afr-afr,5 +lre22_dev_cmvpq,ara-ayl,2 +lre22_dev_cnbfw,eng-iaf,5 +lre22_dev_cnbvd,afr-afr,9 +lre22_dev_cnomp,orm-orm,15 +lre22_dev_cnrvj,xho-xho,11 +lre22_dev_cnszu,ara-ayl,4 +lre22_dev_cnudd,xho-xho,14 +lre22_dev_cnuoi,orm-orm,14 +lre22_dev_cnxjs,orm-orm,8 +lre22_dev_coarm,xho-xho,4 +lre22_dev_cocyn,zul-zul,6 +lre22_dev_colxc,zul-zul,13 +lre22_dev_cosfn,ara-aeb,10 +lre22_dev_cosgu,ara-ayl,7 +lre22_dev_cpjab,ara-aeb,10 +lre22_dev_cpple,tso-tso,6 +lre22_dev_cqhjy,ara-ayl,3 +lre22_dev_cqkmy,ara-aeb,10 +lre22_dev_cqukb,tso-tso,9 +lre22_dev_cqusc,orm-orm,6 +lre22_dev_cqyzf,fra-ntf,13 +lre22_dev_crcwu,xho-xho,12 +lre22_dev_crqjz,nbl-nbl,10 +lre22_dev_crtpm,ara-arq,5 +lre22_dev_crucu,tir-tir,6 +lre22_dev_crvby,eng-iaf,12 +lre22_dev_crvoh,eng-ens,7 +lre22_dev_csjxv,ara-arq,3 +lre22_dev_ctfiv,ara-aeb,5 +lre22_dev_ctgpr,ven-ven,12 +lre22_dev_ctlrz,tir-tir,8 +lre22_dev_ctzhm,zul-zul,6 +lre22_dev_cudew,ven-ven,8 +lre22_dev_cusin,ara-arq,10 +lre22_dev_cvaad,eng-iaf,5 +lre22_dev_cvedm,zul-zul,12 +lre22_dev_cvgfx,eng-iaf,8 +lre22_dev_cvujh,ara-ayl,2 +lre22_dev_cweil,ara-aeb,10 +lre22_dev_cweuh,eng-ens,7 +lre22_dev_cwiro,afr-afr,6 +lre22_dev_cwtby,ara-arq,7 +lre22_dev_cxggy,afr-afr,4 +lre22_dev_cxnqr,tso-tso,7 +lre22_dev_cxpan,nbl-nbl,14 +lre22_dev_cxsxl,ara-aeb,10 +lre22_dev_cxyti,tso-tso,8 +lre22_dev_cypcg,zul-zul,12 +lre22_dev_czcmz,zul-zul,10 +lre22_dev_czdzw,orm-orm,7 +lre22_dev_czppj,zul-zul,10 +lre22_dev_czxff,zul-zul,9 +lre22_dev_czxld,fra-ntf,9 
+lre22_dev_dajnt,zul-zul,12 +lre22_dev_dbcxi,orm-orm,9 +lre22_dev_dbdbv,tso-tso,8 +lre22_dev_dbdwv,orm-orm,5 +lre22_dev_dbgof,nbl-nbl,15 +lre22_dev_dblhh,eng-iaf,0 +lre22_dev_dbljb,xho-xho,12 +lre22_dev_dcibg,eng-iaf,4 +lre22_dev_dcobk,ara-arq,8 +lre22_dev_dcvcu,afr-afr,4 +lre22_dev_dcvyc,fra-ntf,14 +lre22_dev_ddfeo,ara-ayl,5 +lre22_dev_ddhaq,zul-zul,10 +lre22_dev_ddhes,afr-afr,6 +lre22_dev_ddsds,afr-afr,12 +lre22_dev_ddxvn,ven-ven,5 +lre22_dev_dfdrs,ven-ven,7 +lre22_dev_dfifl,ara-ayl,9 +lre22_dev_dfjek,ven-ven,4 +lre22_dev_dflco,zul-zul,12 +lre22_dev_dftta,tso-tso,6 +lre22_dev_dfxnq,eng-ens,11 +lre22_dev_dgjdi,orm-orm,8 +lre22_dev_dgqwo,tir-tir,9 +lre22_dev_dhapq,ara-aeb,8 +lre22_dev_dhdfk,eng-ens,8 +lre22_dev_dhfjj,ara-arq,4 +lre22_dev_dhlxh,ara-aeb,4 +lre22_dev_dhnne,eng-ens,10 +lre22_dev_dhtlz,eng-ens,6 +lre22_dev_diarz,ara-ayl,2 +lre22_dev_diggg,tir-tir,9 +lre22_dev_diqtw,ara-aeb,8 +lre22_dev_dixuw,orm-orm,9 +lre22_dev_diypf,orm-orm,13 +lre22_dev_djzsk,nbl-nbl,13 +lre22_dev_dksey,nbl-nbl,11 +lre22_dev_dlzwh,fra-ntf,12 +lre22_dev_dmdpv,eng-ens,2 +lre22_dev_dmeea,orm-orm,14 +lre22_dev_dmhdv,xho-xho,10 +lre22_dev_dmics,fra-ntf,14 +lre22_dev_dmiiu,ara-aeb,6 +lre22_dev_dmjxr,xho-xho,10 +lre22_dev_dmzxn,afr-afr,4 +lre22_dev_dngtw,ara-ayl,3 +lre22_dev_dnjdq,eng-ens,7 +lre22_dev_dnprz,zul-zul,12 +lre22_dev_dobdj,fra-ntf,0 +lre22_dev_dobwk,orm-orm,8 +lre22_dev_donqm,ara-arq,3 +lre22_dev_dpbyt,tso-tso,6 +lre22_dev_dpfns,ara-aeb,4 +lre22_dev_dpjjp,fra-ntf,7 +lre22_dev_dpomx,eng-iaf,5 +lre22_dev_dpwhs,eng-ens,8 +lre22_dev_dpygj,eng-iaf,8 +lre22_dev_dqzex,xho-xho,3 +lre22_dev_drcqx,eng-iaf,7 +lre22_dev_drfhb,ara-aeb,10 +lre22_dev_drfte,ara-arq,8 +lre22_dev_driks,eng-ens,11 +lre22_dev_drofs,fra-ntf,1 +lre22_dev_dslxl,ara-ayl,7 +lre22_dev_dsmwd,ven-ven,13 +lre22_dev_dsyyk,tir-tir,9 +lre22_dev_dthcb,zul-zul,12 +lre22_dev_dtumd,fra-ntf,5 +lre22_dev_dtwmj,afr-afr,7 +lre22_dev_duegm,tso-tso,9 +lre22_dev_dvirs,afr-afr,6 +lre22_dev_dvtzf,eng-iaf,7 +lre22_dev_dwcfi,ven-ven,7 +lre22_dev_dwfle,fra-ntf,7 +lre22_dev_dwgsv,tir-tir,6 +lre22_dev_dwlay,ara-arq,3 +lre22_dev_dwnit,xho-xho,15 +lre22_dev_dwvoh,tso-tso,6 +lre22_dev_dxgpq,afr-afr,12 +lre22_dev_dxhpf,ara-ayl,9 +lre22_dev_dxlhq,ara-arq,5 +lre22_dev_dxrcj,zul-zul,5 +lre22_dev_dywox,tir-tir,9 +lre22_dev_dzjrv,eng-iaf,8 +lre22_dev_dzsql,tso-tso,6 +lre22_dev_dzxkv,orm-orm,13 +lre22_dev_eabne,xho-xho,2 +lre22_dev_eacdl,fra-ntf,14 +lre22_dev_eaupg,eng-iaf,11 +lre22_dev_eawug,eng-iaf,6 +lre22_dev_ebbgx,nbl-nbl,15 +lre22_dev_ecber,afr-afr,10 +lre22_dev_ecdgv,ara-arq,5 +lre22_dev_ecneb,afr-afr,6 +lre22_dev_ecxrr,tir-tir,9 +lre22_dev_edldw,tir-tir,10 +lre22_dev_edofc,afr-afr,6 +lre22_dev_edvaf,xho-xho,13 +lre22_dev_edydw,eng-ens,5 +lre22_dev_eejtn,zul-zul,4 +lre22_dev_eekzc,fra-ntf,4 +lre22_dev_eenhx,eng-iaf,9 +lre22_dev_efcgi,fra-ntf,0 +lre22_dev_efdoz,ven-ven,8 +lre22_dev_efioy,tso-tso,9 +lre22_dev_efiwx,eng-ens,9 +lre22_dev_efrlw,ven-ven,2 +lre22_dev_eghmh,eng-ens,11 +lre22_dev_ehhyu,nbl-nbl,10 +lre22_dev_eiomi,ven-ven,12 +lre22_dev_eisiy,orm-orm,8 +lre22_dev_ejaiq,ara-aeb,1 +lre22_dev_ejkmr,eng-iaf,5 +lre22_dev_ejthv,ven-ven,12 +lre22_dev_ejtyd,fra-ntf,14 +lre22_dev_ekfzq,ara-ayl,6 +lre22_dev_ekgjp,zul-zul,3 +lre22_dev_ekixu,nbl-nbl,2 +lre22_dev_ekjxx,ara-arq,6 +lre22_dev_ekvxc,eng-iaf,4 +lre22_dev_eldrg,orm-orm,11 +lre22_dev_elitc,ara-arq,3 +lre22_dev_emdtf,xho-xho,0 +lre22_dev_emhqx,tir-tir,4 +lre22_dev_emxnm,afr-afr,4 +lre22_dev_emzaa,xho-xho,3 +lre22_dev_engqe,xho-xho,15 +lre22_dev_ennjl,tso-tso,10 +lre22_dev_eokyg,nbl-nbl,2 +lre22_dev_epkwr,tir-tir,5 
+lre22_dev_epojj,tir-tir,7 +lre22_dev_epsdk,nbl-nbl,12 +lre22_dev_epsfl,xho-xho,14 +lre22_dev_epuno,eng-ens,1 +lre22_dev_epylu,eng-iaf,10 +lre22_dev_ereen,ara-arq,10 +lre22_dev_eriaf,eng-ens,4 +lre22_dev_ermqx,ara-arq,2 +lre22_dev_escob,fra-ntf,9 +lre22_dev_esjsk,ara-ayl,7 +lre22_dev_esqti,xho-xho,9 +lre22_dev_etaln,zul-zul,12 +lre22_dev_etarn,nbl-nbl,6 +lre22_dev_etndu,ven-ven,13 +lre22_dev_etpdc,afr-afr,3 +lre22_dev_etsam,zul-zul,7 +lre22_dev_etwge,eng-ens,6 +lre22_dev_etxyc,orm-orm,12 +lre22_dev_eumsq,zul-zul,10 +lre22_dev_eusfl,orm-orm,8 +lre22_dev_eutkk,tso-tso,0 +lre22_dev_euxuy,orm-orm,13 +lre22_dev_evaon,ara-aeb,4 +lre22_dev_evkaz,eng-iaf,8 +lre22_dev_evret,fra-ntf,8 +lre22_dev_evvep,tso-tso,9 +lre22_dev_evvvd,tir-tir,10 +lre22_dev_ewems,ven-ven,7 +lre22_dev_ewijw,orm-orm,11 +lre22_dev_ewqpv,eng-iaf,6 +lre22_dev_ewywf,nbl-nbl,10 +lre22_dev_exaia,afr-afr,3 +lre22_dev_exbum,afr-afr,4 +lre22_dev_exhhd,ara-aeb,5 +lre22_dev_exkkf,afr-afr,3 +lre22_dev_extrh,zul-zul,6 +lre22_dev_exzyo,xho-xho,15 +lre22_dev_eyrzt,ara-ayl,1 +lre22_dev_eysdu,zul-zul,4 +lre22_dev_eyshz,xho-xho,1 +lre22_dev_eyuyq,ara-ayl,7 +lre22_dev_ezsyu,ven-ven,3 +lre22_dev_faahr,afr-afr,9 +lre22_dev_fabli,ven-ven,6 +lre22_dev_fatah,zul-zul,12 +lre22_dev_fccpw,orm-orm,12 +lre22_dev_fcpbu,xho-xho,8 +lre22_dev_fcqbx,tso-tso,3 +lre22_dev_fcwnw,fra-ntf,8 +lre22_dev_fdgia,orm-orm,10 +lre22_dev_febnk,eng-ens,5 +lre22_dev_fedau,eng-iaf,5 +lre22_dev_fehxn,xho-xho,8 +lre22_dev_fejsd,ven-ven,8 +lre22_dev_feqjc,eng-iaf,12 +lre22_dev_fesss,nbl-nbl,15 +lre22_dev_feuww,fra-ntf,8 +lre22_dev_fevex,zul-zul,2 +lre22_dev_ffban,ara-arq,6 +lre22_dev_ffefw,orm-orm,13 +lre22_dev_ffsps,fra-ntf,8 +lre22_dev_ffwid,tso-tso,11 +lre22_dev_fgbtr,nbl-nbl,15 +lre22_dev_fgmbr,ara-arq,6 +lre22_dev_fgmxd,eng-ens,9 +lre22_dev_fgnfs,tir-tir,12 +lre22_dev_fgrze,eng-ens,11 +lre22_dev_fhlhy,ara-aeb,7 +lre22_dev_fihvr,eng-iaf,7 +lre22_dev_fiizm,xho-xho,14 +lre22_dev_fiksd,fra-ntf,12 +lre22_dev_fitjt,tso-tso,6 +lre22_dev_fiuun,eng-ens,7 +lre22_dev_fjdul,ara-ayl,3 +lre22_dev_fjgrh,ven-ven,8 +lre22_dev_fkaqj,nbl-nbl,13 +lre22_dev_flfgv,ara-aeb,9 +lre22_dev_flirl,fra-ntf,13 +lre22_dev_fljab,fra-ntf,14 +lre22_dev_flnzm,tir-tir,11 +lre22_dev_flsmp,orm-orm,15 +lre22_dev_fmjvq,ven-ven,2 +lre22_dev_fmmxd,afr-afr,4 +lre22_dev_fnglh,afr-afr,13 +lre22_dev_fnsax,xho-xho,6 +lre22_dev_fojyn,eng-ens,5 +lre22_dev_foqgk,ven-ven,2 +lre22_dev_fovba,ara-arq,4 +lre22_dev_fozyj,ara-arq,2 +lre22_dev_fpavw,ara-aeb,8 +lre22_dev_fptba,eng-ens,3 +lre22_dev_fqdfc,tso-tso,11 +lre22_dev_fqdhm,eng-iaf,8 +lre22_dev_fqfet,nbl-nbl,7 +lre22_dev_fqgty,fra-ntf,4 +lre22_dev_fqgyd,zul-zul,10 +lre22_dev_fqvup,tso-tso,2 +lre22_dev_frviu,ara-aeb,10 +lre22_dev_frwfk,nbl-nbl,9 +lre22_dev_fsygm,eng-iaf,5 +lre22_dev_ftfjv,orm-orm,11 +lre22_dev_ftjvg,afr-afr,12 +lre22_dev_ftmnu,ara-aeb,10 +lre22_dev_ftrcl,eng-ens,3 +lre22_dev_ftygz,eng-ens,8 +lre22_dev_fughv,eng-iaf,3 +lre22_dev_fuhuk,ara-ayl,5 +lre22_dev_fusyr,ven-ven,13 +lre22_dev_futhm,zul-zul,5 +lre22_dev_fvbzh,ara-ayl,7 +lre22_dev_fvecf,ven-ven,9 +lre22_dev_fvktn,fra-ntf,8 +lre22_dev_fvpts,orm-orm,6 +lre22_dev_fvsmm,eng-iaf,12 +lre22_dev_fvvgc,ara-arq,5 +lre22_dev_fwvzh,zul-zul,2 +lre22_dev_fwwsy,xho-xho,5 +lre22_dev_fxggn,fra-ntf,1 +lre22_dev_fxqfi,orm-orm,10 +lre22_dev_fxuqw,ara-ayl,3 +lre22_dev_fxwfc,eng-iaf,12 +lre22_dev_fymdc,tso-tso,4 +lre22_dev_fywir,tso-tso,10 +lre22_dev_fzjzu,xho-xho,14 +lre22_dev_fzpeh,ara-aeb,10 +lre22_dev_fztdi,tir-tir,9 +lre22_dev_gcced,ven-ven,6 +lre22_dev_gchqj,zul-zul,10 +lre22_dev_gctmk,xho-xho,12 
+lre22_dev_gcupw,ven-ven,7 +lre22_dev_gdfdn,tir-tir,5 +lre22_dev_gdlpg,tir-tir,3 +lre22_dev_gdrwq,fra-ntf,14 +lre22_dev_gdvjh,afr-afr,5 +lre22_dev_gdvtc,eng-iaf,13 +lre22_dev_gdxck,orm-orm,4 +lre22_dev_gecgq,afr-afr,12 +lre22_dev_gevbs,nbl-nbl,13 +lre22_dev_gfqxw,tir-tir,11 +lre22_dev_gfujh,eng-ens,8 +lre22_dev_gfwqx,fra-ntf,10 +lre22_dev_ggchj,tir-tir,10 +lre22_dev_ggeie,ara-arq,8 +lre22_dev_ggqob,ara-aeb,9 +lre22_dev_ghllb,eng-ens,8 +lre22_dev_ghlqh,afr-afr,12 +lre22_dev_ghmuk,afr-afr,13 +lre22_dev_ghskg,tso-tso,4 +lre22_dev_ghwmw,ara-arq,2 +lre22_dev_giijn,ven-ven,6 +lre22_dev_gised,xho-xho,9 +lre22_dev_gisrt,tir-tir,9 +lre22_dev_gjptx,nbl-nbl,4 +lre22_dev_gjvkc,ara-arq,7 +lre22_dev_gjxkc,eng-iaf,13 +lre22_dev_gkywh,ara-aeb,7 +lre22_dev_glhtl,eng-iaf,3 +lre22_dev_glulw,ara-aeb,8 +lre22_dev_gmpja,nbl-nbl,3 +lre22_dev_gmpjm,nbl-nbl,12 +lre22_dev_gnkvz,eng-iaf,13 +lre22_dev_gnmcz,nbl-nbl,4 +lre22_dev_goggr,afr-afr,5 +lre22_dev_goqov,ara-aeb,8 +lre22_dev_gpzgq,tso-tso,9 +lre22_dev_gpzuz,fra-ntf,5 +lre22_dev_gqpul,ara-arq,10 +lre22_dev_gratu,tir-tir,7 +lre22_dev_grewx,afr-afr,9 +lre22_dev_grizt,eng-ens,2 +lre22_dev_grsam,afr-afr,11 +lre22_dev_grsyr,zul-zul,1 +lre22_dev_grxus,nbl-nbl,15 +lre22_dev_gsanj,ven-ven,13 +lre22_dev_gsbwz,nbl-nbl,9 +lre22_dev_gtwjj,tso-tso,4 +lre22_dev_gtxwq,orm-orm,12 +lre22_dev_gubts,ara-ayl,0 +lre22_dev_gvawh,xho-xho,11 +lre22_dev_gvfsb,ara-aeb,10 +lre22_dev_gvhgg,afr-afr,9 +lre22_dev_gvnaj,fra-ntf,8 +lre22_dev_gvysc,ara-aeb,10 +lre22_dev_gwfkz,xho-xho,2 +lre22_dev_gwnqp,xho-xho,7 +lre22_dev_gwumi,tso-tso,3 +lre22_dev_gwvcw,xho-xho,11 +lre22_dev_gwwxz,eng-iaf,1 +lre22_dev_gwzrc,eng-ens,11 +lre22_dev_gxtlx,fra-ntf,13 +lre22_dev_gxygl,tso-tso,9 +lre22_dev_gycld,orm-orm,4 +lre22_dev_gzakl,nbl-nbl,15 +lre22_dev_gzrgo,ara-arq,9 +lre22_dev_hbkul,orm-orm,6 +lre22_dev_hbodn,eng-ens,10 +lre22_dev_hbwgy,ara-arq,6 +lre22_dev_hbwyc,nbl-nbl,5 +lre22_dev_hczek,fra-ntf,7 +lre22_dev_hdpsb,nbl-nbl,6 +lre22_dev_hdvsb,ara-aeb,8 +lre22_dev_hetsy,xho-xho,10 +lre22_dev_hfgrm,ven-ven,12 +lre22_dev_hfurz,afr-afr,13 +lre22_dev_hfwyw,nbl-nbl,11 +lre22_dev_hgdqx,tso-tso,3 +lre22_dev_hgwdk,eng-ens,8 +lre22_dev_hgxqf,eng-iaf,8 +lre22_dev_hgyuk,ven-ven,11 +lre22_dev_hhetm,fra-ntf,14 +lre22_dev_hhjki,ara-arq,8 +lre22_dev_hhvtc,ara-arq,10 +lre22_dev_hhxqv,tso-tso,5 +lre22_dev_hiisb,nbl-nbl,15 +lre22_dev_hioxp,tso-tso,3 +lre22_dev_hjqaf,ara-aeb,9 +lre22_dev_hjqid,orm-orm,6 +lre22_dev_hjzwc,eng-iaf,3 +lre22_dev_hkdzu,ara-arq,9 +lre22_dev_hlatl,eng-iaf,12 +lre22_dev_hlywv,nbl-nbl,2 +lre22_dev_hlzxa,ven-ven,7 +lre22_dev_hmvzg,ara-ayl,3 +lre22_dev_hnjgb,eng-ens,9 +lre22_dev_hntdv,eng-ens,11 +lre22_dev_hoish,tir-tir,2 +lre22_dev_hokbg,ara-ayl,6 +lre22_dev_hondp,eng-iaf,8 +lre22_dev_hpbve,tir-tir,11 +lre22_dev_hpdvc,fra-ntf,8 +lre22_dev_hpgst,orm-orm,5 +lre22_dev_hqbjb,xho-xho,5 +lre22_dev_hqdev,tso-tso,2 +lre22_dev_hqidg,tir-tir,1 +lre22_dev_hqids,afr-afr,9 +lre22_dev_hqltr,tir-tir,4 +lre22_dev_hqqhq,eng-ens,11 +lre22_dev_hrmcg,zul-zul,13 +lre22_dev_hrrcp,afr-afr,8 +lre22_dev_hstgi,xho-xho,9 +lre22_dev_hsvpq,ara-ayl,9 +lre22_dev_hswsy,ara-aeb,4 +lre22_dev_htcgm,eng-iaf,6 +lre22_dev_htedo,xho-xho,13 +lre22_dev_hthkx,eng-iaf,7 +lre22_dev_htohd,afr-afr,6 +lre22_dev_htxik,fra-ntf,0 +lre22_dev_huqbr,xho-xho,10 +lre22_dev_hvdom,afr-afr,8 +lre22_dev_hvkoa,afr-afr,13 +lre22_dev_hvnkg,tir-tir,9 +lre22_dev_hvocp,nbl-nbl,12 +lre22_dev_hvqzj,zul-zul,12 +lre22_dev_hvwph,afr-afr,3 +lre22_dev_hwaqg,zul-zul,8 +lre22_dev_hwgvu,ara-aeb,6 +lre22_dev_hwhlz,ven-ven,11 +lre22_dev_hwkes,fra-ntf,12 +lre22_dev_hwvna,eng-ens,2 
+lre22_dev_hxfim,eng-iaf,12 +lre22_dev_hxmdw,afr-afr,10 +lre22_dev_hxrnp,zul-zul,6 +lre22_dev_hxvie,tir-tir,9 +lre22_dev_hxvju,zul-zul,3 +lre22_dev_hxzxm,zul-zul,6 +lre22_dev_hybef,nbl-nbl,14 +lre22_dev_hyfok,eng-ens,2 +lre22_dev_hyscv,ara-arq,4 +lre22_dev_hyzod,eng-iaf,6 +lre22_dev_hzdpb,tso-tso,7 +lre22_dev_hzjwn,ara-aeb,5 +lre22_dev_hzljv,tir-tir,8 +lre22_dev_hzomy,tso-tso,9 +lre22_dev_iaaar,tso-tso,9 +lre22_dev_iaimu,afr-afr,13 +lre22_dev_iakmg,orm-orm,15 +lre22_dev_iarxv,ara-aeb,9 +lre22_dev_iaywv,ara-ayl,6 +lre22_dev_ibcne,eng-ens,11 +lre22_dev_ibeth,zul-zul,2 +lre22_dev_ibwbi,tir-tir,9 +lre22_dev_ibyqr,tso-tso,7 +lre22_dev_iccwp,eng-iaf,6 +lre22_dev_ichmi,afr-afr,12 +lre22_dev_idjrt,zul-zul,8 +lre22_dev_iegng,afr-afr,8 +lre22_dev_iezrr,ara-ayl,7 +lre22_dev_ifaib,ara-ayl,5 +lre22_dev_ifhil,tso-tso,9 +lre22_dev_ifptd,ven-ven,12 +lre22_dev_ifriu,ara-aeb,6 +lre22_dev_ignvp,zul-zul,13 +lre22_dev_igxzy,eng-iaf,12 +lre22_dev_ihdva,fra-ntf,10 +lre22_dev_iiydv,eng-iaf,5 +lre22_dev_ijoyg,ara-ayl,9 +lre22_dev_ikghg,eng-iaf,7 +lre22_dev_ikijv,ven-ven,2 +lre22_dev_ilawb,ara-aeb,8 +lre22_dev_ilgnm,orm-orm,6 +lre22_dev_ilqhp,orm-orm,13 +lre22_dev_imrsx,tso-tso,8 +lre22_dev_inrfz,ara-arq,1 +lre22_dev_inrlw,eng-ens,1 +lre22_dev_inttm,tso-tso,8 +lre22_dev_iorip,ven-ven,13 +lre22_dev_ioryq,ara-aeb,8 +lre22_dev_iosse,afr-afr,1 +lre22_dev_ipahz,tir-tir,12 +lre22_dev_ipaup,tir-tir,10 +lre22_dev_ipllz,tir-tir,12 +lre22_dev_iprih,ara-aeb,4 +lre22_dev_iqkpj,tir-tir,6 +lre22_dev_iqowb,ara-aeb,0 +lre22_dev_iqzfp,orm-orm,15 +lre22_dev_irhue,tso-tso,8 +lre22_dev_irkvo,orm-orm,15 +lre22_dev_irnie,ara-aeb,8 +lre22_dev_irnxg,zul-zul,9 +lre22_dev_irsgt,ven-ven,2 +lre22_dev_isavf,nbl-nbl,0 +lre22_dev_isfpd,nbl-nbl,11 +lre22_dev_iskfd,ara-arq,4 +lre22_dev_isndz,ara-arq,6 +lre22_dev_istwz,nbl-nbl,15 +lre22_dev_isxpy,orm-orm,5 +lre22_dev_iszkk,tir-tir,9 +lre22_dev_itdot,ara-ayl,9 +lre22_dev_itfgh,eng-iaf,9 +lre22_dev_itlqd,tir-tir,12 +lre22_dev_itmbo,ara-aeb,10 +lre22_dev_itznp,ara-aeb,3 +lre22_dev_iucwv,zul-zul,5 +lre22_dev_iuowb,ara-aeb,8 +lre22_dev_iupes,zul-zul,4 +lre22_dev_iurgk,fra-ntf,4 +lre22_dev_ivcpr,nbl-nbl,12 +lre22_dev_ivrwa,ven-ven,3 +lre22_dev_ivvlb,afr-afr,11 +lre22_dev_ivwhm,tir-tir,6 +lre22_dev_iwoya,ara-aeb,4 +lre22_dev_iwpvu,orm-orm,5 +lre22_dev_ixpuq,ara-ayl,5 +lre22_dev_ixpyb,tso-tso,11 +lre22_dev_iyfiz,eng-iaf,5 +lre22_dev_iylyu,xho-xho,12 +lre22_dev_iyuli,zul-zul,13 +lre22_dev_iyupt,orm-orm,5 +lre22_dev_iyxjf,zul-zul,12 +lre22_dev_iyzgz,tso-tso,10 +lre22_dev_izepb,ara-arq,4 +lre22_dev_izkix,ven-ven,10 +lre22_dev_izknz,ven-ven,12 +lre22_dev_jadfl,ara-arq,9 +lre22_dev_jafja,zul-zul,9 +lre22_dev_jamvn,ven-ven,1 +lre22_dev_jbach,eng-iaf,2 +lre22_dev_jbqcq,ara-aeb,6 +lre22_dev_jcxgo,afr-afr,6 +lre22_dev_jddrh,fra-ntf,13 +lre22_dev_jdjpg,tir-tir,12 +lre22_dev_jdtrb,eng-iaf,11 +lre22_dev_jdwjj,zul-zul,7 +lre22_dev_jdzqw,tir-tir,3 +lre22_dev_jeaev,nbl-nbl,8 +lre22_dev_jeobs,ara-aeb,9 +lre22_dev_jesxq,eng-ens,10 +lre22_dev_jgcla,ara-arq,2 +lre22_dev_jggxv,fra-ntf,3 +lre22_dev_jgntz,orm-orm,5 +lre22_dev_jhcao,ven-ven,7 +lre22_dev_jhgik,eng-ens,11 +lre22_dev_jhpkj,ara-arq,4 +lre22_dev_jhuof,orm-orm,15 +lre22_dev_jignq,ara-ayl,9 +lre22_dev_jjffc,ven-ven,13 +lre22_dev_jjkfe,eng-ens,9 +lre22_dev_jjqxi,ara-aeb,8 +lre22_dev_jjrgq,eng-iaf,4 +lre22_dev_jkacy,tso-tso,3 +lre22_dev_jkmin,orm-orm,15 +lre22_dev_jkobe,xho-xho,7 +lre22_dev_jkosd,zul-zul,10 +lre22_dev_jkovc,tso-tso,3 +lre22_dev_jktcq,zul-zul,7 +lre22_dev_jlodp,eng-ens,9 +lre22_dev_jmbjo,nbl-nbl,9 +lre22_dev_jmccw,ara-arq,3 +lre22_dev_jminj,fra-ntf,5 
+lre22_dev_jmmyw,afr-afr,3 +lre22_dev_jobae,fra-ntf,13 +lre22_dev_jobsv,nbl-nbl,14 +lre22_dev_jobxi,ara-arq,5 +lre22_dev_joghi,ara-arq,6 +lre22_dev_johkj,xho-xho,7 +lre22_dev_jolqw,ara-ayl,5 +lre22_dev_jplye,fra-ntf,11 +lre22_dev_jpsmt,ara-arq,9 +lre22_dev_jqdnf,eng-iaf,13 +lre22_dev_jqqpg,orm-orm,5 +lre22_dev_jqqrs,nbl-nbl,11 +lre22_dev_jrmnp,tir-tir,9 +lre22_dev_jsahe,fra-ntf,12 +lre22_dev_jsciw,eng-ens,5 +lre22_dev_jsisu,eng-iaf,4 +lre22_dev_jstjq,zul-zul,4 +lre22_dev_jsxuw,eng-iaf,8 +lre22_dev_jtaxh,ven-ven,4 +lre22_dev_jtgjo,ara-arq,9 +lre22_dev_jtxor,orm-orm,3 +lre22_dev_junyj,orm-orm,5 +lre22_dev_juykt,ara-ayl,7 +lre22_dev_jvqzf,fra-ntf,9 +lre22_dev_jvvxl,afr-afr,7 +lre22_dev_jvxpt,nbl-nbl,1 +lre22_dev_jwfeb,eng-iaf,4 +lre22_dev_jwmmp,eng-ens,3 +lre22_dev_jwyiq,tso-tso,10 +lre22_dev_jxcmp,ara-aeb,10 +lre22_dev_jxfsy,ara-ayl,9 +lre22_dev_jxjar,tso-tso,10 +lre22_dev_jylrr,ara-aeb,9 +lre22_dev_jzciw,orm-orm,5 +lre22_dev_jzcyt,tso-tso,5 +lre22_dev_jzhpf,tso-tso,4 +lre22_dev_jzidh,afr-afr,11 +lre22_dev_jznzw,eng-iaf,6 +lre22_dev_jzoqd,afr-afr,7 +lre22_dev_jzwnu,ven-ven,11 +lre22_dev_kaoyk,afr-afr,6 +lre22_dev_kasoe,zul-zul,12 +lre22_dev_kaygq,eng-ens,9 +lre22_dev_kayqh,fra-ntf,8 +lre22_dev_kbpcw,eng-iaf,3 +lre22_dev_kbtrx,orm-orm,10 +lre22_dev_kcebk,ven-ven,7 +lre22_dev_kdbil,orm-orm,15 +lre22_dev_kddhf,ara-arq,10 +lre22_dev_kdeij,ara-ayl,3 +lre22_dev_kdiak,zul-zul,12 +lre22_dev_kedwl,nbl-nbl,12 +lre22_dev_keouf,fra-ntf,9 +lre22_dev_keozw,ara-aeb,10 +lre22_dev_kervm,eng-ens,7 +lre22_dev_kflpm,xho-xho,1 +lre22_dev_kfqpd,ara-arq,8 +lre22_dev_kgaqj,ara-aeb,8 +lre22_dev_kghnx,fra-ntf,3 +lre22_dev_kgoze,zul-zul,4 +lre22_dev_kgrxe,fra-ntf,9 +lre22_dev_kgsdu,ara-arq,5 +lre22_dev_kheef,xho-xho,15 +lre22_dev_khgyl,xho-xho,8 +lre22_dev_khsgr,tso-tso,7 +lre22_dev_khxvm,nbl-nbl,9 +lre22_dev_kijjo,ara-aeb,3 +lre22_dev_kiush,xho-xho,2 +lre22_dev_kiyso,ara-arq,1 +lre22_dev_kjewo,ven-ven,6 +lre22_dev_kjgkg,ara-ayl,5 +lre22_dev_kjksh,ven-ven,3 +lre22_dev_kjomd,afr-afr,4 +lre22_dev_kjrcy,afr-afr,11 +lre22_dev_kkauw,fra-ntf,10 +lre22_dev_kkiew,orm-orm,15 +lre22_dev_kkyyu,zul-zul,8 +lre22_dev_klafc,ara-ayl,4 +lre22_dev_klalo,eng-ens,5 +lre22_dev_kliip,afr-afr,1 +lre22_dev_klkxg,tso-tso,8 +lre22_dev_klqwc,ara-arq,7 +lre22_dev_kmbgg,tir-tir,12 +lre22_dev_kmgoo,tir-tir,8 +lre22_dev_kmnko,zul-zul,3 +lre22_dev_kmtyc,ara-aeb,8 +lre22_dev_kmxqj,xho-xho,8 +lre22_dev_kmzdw,fra-ntf,3 +lre22_dev_knxsi,ara-arq,9 +lre22_dev_kofob,orm-orm,7 +lre22_dev_kokfk,fra-ntf,14 +lre22_dev_kokir,nbl-nbl,12 +lre22_dev_kooxu,ara-arq,9 +lre22_dev_korip,tso-tso,7 +lre22_dev_kpbnd,zul-zul,4 +lre22_dev_kpnyf,eng-iaf,3 +lre22_dev_kpwts,ara-ayl,8 +lre22_dev_kpxne,orm-orm,6 +lre22_dev_kpzbl,ven-ven,12 +lre22_dev_kqact,zul-zul,0 +lre22_dev_kqfbl,eng-iaf,12 +lre22_dev_kqfsm,zul-zul,5 +lre22_dev_kqfyp,ara-arq,1 +lre22_dev_kqkqj,ara-ayl,7 +lre22_dev_kqvwr,xho-xho,13 +lre22_dev_kragl,zul-zul,13 +lre22_dev_krbdn,xho-xho,14 +lre22_dev_ksake,ara-aeb,8 +lre22_dev_ksoly,nbl-nbl,11 +lre22_dev_kttyt,orm-orm,5 +lre22_dev_kttzq,tso-tso,9 +lre22_dev_ktwaf,zul-zul,3 +lre22_dev_ktwqf,ven-ven,6 +lre22_dev_ktxef,zul-zul,0 +lre22_dev_ktztb,orm-orm,12 +lre22_dev_kufkm,nbl-nbl,15 +lre22_dev_kuqsu,afr-afr,9 +lre22_dev_kuyka,tir-tir,4 +lre22_dev_kvcpn,ara-ayl,3 +lre22_dev_kvghz,eng-iaf,10 +lre22_dev_kvswv,ven-ven,11 +lre22_dev_kxkos,orm-orm,10 +lre22_dev_kxkzg,ara-ayl,9 +lre22_dev_kxqef,ven-ven,12 +lre22_dev_kyjpf,ven-ven,7 +lre22_dev_kynap,ara-ayl,9 +lre22_dev_kyptg,ven-ven,8 +lre22_dev_kytyr,nbl-nbl,11 +lre22_dev_kywmf,orm-orm,4 +lre22_dev_kzibn,zul-zul,3 
+lre22_dev_kzqxx,fra-ntf,1 +lre22_dev_lacgv,tso-tso,7 +lre22_dev_lagpe,tso-tso,6 +lre22_dev_lanuu,tso-tso,9 +lre22_dev_lapag,afr-afr,6 +lre22_dev_larnq,zul-zul,4 +lre22_dev_lbbvq,xho-xho,8 +lre22_dev_lbfca,ara-arq,8 +lre22_dev_lbhoj,orm-orm,11 +lre22_dev_lbiin,ara-ayl,4 +lre22_dev_lcdyj,ara-arq,9 +lre22_dev_ldasz,fra-ntf,9 +lre22_dev_ldbur,tso-tso,1 +lre22_dev_lddhs,orm-orm,12 +lre22_dev_ldedw,ara-aeb,5 +lre22_dev_ldmbr,ara-ayl,5 +lre22_dev_ldmqc,tir-tir,7 +lre22_dev_leadw,eng-iaf,3 +lre22_dev_leaqq,tso-tso,10 +lre22_dev_ledsh,afr-afr,11 +lre22_dev_leovk,afr-afr,6 +lre22_dev_lexlh,ara-aeb,2 +lre22_dev_lfilk,eng-ens,10 +lre22_dev_lfyll,zul-zul,10 +lre22_dev_lgada,zul-zul,6 +lre22_dev_lgcjy,afr-afr,9 +lre22_dev_lgfri,ara-aeb,5 +lre22_dev_lgkbt,xho-xho,4 +lre22_dev_lhbjq,ara-arq,0 +lre22_dev_lhemi,xho-xho,9 +lre22_dev_lhfne,ara-arq,6 +lre22_dev_lhmtg,ara-arq,9 +lre22_dev_lieso,ara-aeb,8 +lre22_dev_likcy,afr-afr,13 +lre22_dev_lipyu,zul-zul,12 +lre22_dev_lisum,ven-ven,4 +lre22_dev_ljevp,ara-ayl,3 +lre22_dev_ljijh,orm-orm,3 +lre22_dev_ljylg,nbl-nbl,13 +lre22_dev_lkfig,ara-ayl,2 +lre22_dev_lklnc,ara-arq,3 +lre22_dev_lkopy,tir-tir,9 +lre22_dev_lllwi,eng-iaf,5 +lre22_dev_llstb,nbl-nbl,10 +lre22_dev_lmeax,eng-iaf,10 +lre22_dev_lmkui,ara-arq,7 +lre22_dev_lmrbp,tir-tir,9 +lre22_dev_lnejh,eng-ens,10 +lre22_dev_lnttv,ven-ven,10 +lre22_dev_loxqz,eng-iaf,8 +lre22_dev_loybq,ara-aeb,10 +lre22_dev_lpadb,fra-ntf,4 +lre22_dev_lpahk,nbl-nbl,11 +lre22_dev_lphgs,tir-tir,7 +lre22_dev_lphoa,eng-ens,2 +lre22_dev_lpkie,eng-iaf,5 +lre22_dev_lpkpc,zul-zul,6 +lre22_dev_lptpx,eng-iaf,4 +lre22_dev_lqwcv,xho-xho,13 +lre22_dev_lrgwx,orm-orm,10 +lre22_dev_lruoj,orm-orm,2 +lre22_dev_lrwee,fra-ntf,10 +lre22_dev_lsess,ven-ven,1 +lre22_dev_lsycj,tir-tir,9 +lre22_dev_ltaoe,eng-ens,8 +lre22_dev_ltish,ara-aeb,5 +lre22_dev_ltqeb,eng-ens,8 +lre22_dev_ltzfg,ven-ven,10 +lre22_dev_luuhd,ara-arq,2 +lre22_dev_lvejl,zul-zul,11 +lre22_dev_lvgsm,tir-tir,10 +lre22_dev_lvwle,xho-xho,7 +lre22_dev_lvxea,tir-tir,8 +lre22_dev_lwsmk,eng-ens,10 +lre22_dev_lwzhq,ara-ayl,3 +lre22_dev_lxbdd,ara-ayl,8 +lre22_dev_lxdgx,nbl-nbl,1 +lre22_dev_lxjij,ara-ayl,7 +lre22_dev_lxldm,tso-tso,8 +lre22_dev_lxmsa,zul-zul,11 +lre22_dev_lxugv,zul-zul,13 +lre22_dev_lxwig,tso-tso,4 +lre22_dev_lyigi,xho-xho,4 +lre22_dev_lymzv,ara-arq,6 +lre22_dev_lyuls,ara-arq,4 +lre22_dev_lyyzw,ara-ayl,5 +lre22_dev_lzhrm,ara-arq,8 +lre22_dev_lzjgb,xho-xho,12 +lre22_dev_lzrpe,xho-xho,8 +lre22_dev_lzvmq,fra-ntf,13 +lre22_dev_maagy,ven-ven,6 +lre22_dev_mabmx,ara-arq,4 +lre22_dev_macre,zul-zul,7 +lre22_dev_maggb,nbl-nbl,7 +lre22_dev_margf,ara-ayl,6 +lre22_dev_maydg,eng-iaf,4 +lre22_dev_mbsgm,zul-zul,7 +lre22_dev_mbttd,fra-ntf,14 +lre22_dev_mcebh,tso-tso,8 +lre22_dev_mcfve,ara-ayl,3 +lre22_dev_mclrc,zul-zul,12 +lre22_dev_mcvgl,ara-ayl,5 +lre22_dev_mdgok,ara-aeb,5 +lre22_dev_mdilb,ven-ven,3 +lre22_dev_mdzqr,nbl-nbl,11 +lre22_dev_mehfu,ara-arq,3 +lre22_dev_meiyg,eng-ens,11 +lre22_dev_merbq,orm-orm,9 +lre22_dev_mfoys,afr-afr,8 +lre22_dev_mgpfx,xho-xho,8 +lre22_dev_mgtzj,zul-zul,12 +lre22_dev_mgxxc,ven-ven,11 +lre22_dev_mhldj,nbl-nbl,14 +lre22_dev_mhvio,eng-iaf,6 +lre22_dev_mhxgi,tir-tir,9 +lre22_dev_miegc,fra-ntf,6 +lre22_dev_miwyu,ara-aeb,8 +lre22_dev_mjocm,ara-aeb,2 +lre22_dev_mjqij,orm-orm,12 +lre22_dev_mjxgy,afr-afr,8 +lre22_dev_mkeyt,tir-tir,12 +lre22_dev_mklub,ven-ven,4 +lre22_dev_mknzf,ara-aeb,10 +lre22_dev_mlhes,ara-arq,9 +lre22_dev_mlhse,tso-tso,3 +lre22_dev_mlhtc,orm-orm,8 +lre22_dev_mlpuq,ven-ven,10 +lre22_dev_mluow,orm-orm,2 +lre22_dev_mmwtu,ara-arq,4 +lre22_dev_mmwzf,tso-tso,7 
+lre22_dev_mnjdq,tir-tir,10 +lre22_dev_mnkfe,nbl-nbl,4 +lre22_dev_mnmcm,ara-arq,3 +lre22_dev_mocss,xho-xho,9 +lre22_dev_mohxo,zul-zul,12 +lre22_dev_mojui,fra-ntf,1 +lre22_dev_mojvy,xho-xho,7 +lre22_dev_molqa,fra-ntf,14 +lre22_dev_mopiq,nbl-nbl,14 +lre22_dev_moqto,tir-tir,12 +lre22_dev_morri,ara-aeb,8 +lre22_dev_mpxyg,eng-ens,4 +lre22_dev_mqiap,xho-xho,14 +lre22_dev_mqxep,ara-ayl,2 +lre22_dev_mrcoe,ara-ayl,7 +lre22_dev_mriiq,tso-tso,4 +lre22_dev_mryoy,eng-ens,11 +lre22_dev_mryzh,ara-arq,4 +lre22_dev_msadm,ven-ven,2 +lre22_dev_msghz,nbl-nbl,11 +lre22_dev_mtpfp,ara-aeb,9 +lre22_dev_mtqft,orm-orm,14 +lre22_dev_mtzvt,ara-aeb,10 +lre22_dev_munim,xho-xho,15 +lre22_dev_murhb,nbl-nbl,1 +lre22_dev_mvbra,xho-xho,4 +lre22_dev_mvhza,afr-afr,13 +lre22_dev_mviud,xho-xho,12 +lre22_dev_mvxjk,afr-afr,9 +lre22_dev_mwnkm,orm-orm,8 +lre22_dev_mwoml,xho-xho,9 +lre22_dev_mxhup,eng-ens,8 +lre22_dev_mykuh,ara-ayl,5 +lre22_dev_myqfn,eng-iaf,4 +lre22_dev_mywmj,ven-ven,9 +lre22_dev_mzbrr,ara-arq,10 +lre22_dev_mzsiq,afr-afr,9 +lre22_dev_mztms,eng-ens,3 +lre22_dev_mzuxc,ara-arq,9 +lre22_dev_nbdbe,ara-ayl,7 +lre22_dev_nbjqz,ara-aeb,9 +lre22_dev_nbyhp,afr-afr,3 +lre22_dev_ncnyb,ven-ven,8 +lre22_dev_ncocl,nbl-nbl,6 +lre22_dev_ndecq,ara-ayl,8 +lre22_dev_ndjsl,nbl-nbl,6 +lre22_dev_nelsk,orm-orm,0 +lre22_dev_nenly,eng-iaf,11 +lre22_dev_neqkb,ven-ven,2 +lre22_dev_nfjid,orm-orm,12 +lre22_dev_nfkqr,orm-orm,8 +lre22_dev_nfoas,orm-orm,15 +lre22_dev_ngjbm,eng-ens,10 +lre22_dev_ngmbz,eng-iaf,9 +lre22_dev_ngnua,fra-ntf,10 +lre22_dev_nguuu,fra-ntf,13 +lre22_dev_ngyse,ven-ven,7 +lre22_dev_nhfso,fra-ntf,14 +lre22_dev_nhuue,zul-zul,1 +lre22_dev_niack,ara-ayl,8 +lre22_dev_niari,ven-ven,7 +lre22_dev_nibme,ara-arq,9 +lre22_dev_nikby,tso-tso,10 +lre22_dev_nimex,ara-ayl,8 +lre22_dev_nivmv,xho-xho,11 +lre22_dev_nkebu,eng-ens,5 +lre22_dev_nkgml,eng-ens,10 +lre22_dev_nkofi,fra-ntf,11 +lre22_dev_nkrez,xho-xho,5 +lre22_dev_nkscn,tso-tso,5 +lre22_dev_nkwrs,ara-aeb,2 +lre22_dev_nkxcy,afr-afr,4 +lre22_dev_nlast,xho-xho,12 +lre22_dev_nlcun,eng-ens,0 +lre22_dev_nljyr,afr-afr,5 +lre22_dev_nlkdv,eng-iaf,12 +lre22_dev_nlpcs,ara-ayl,7 +lre22_dev_nlrcn,ara-ayl,4 +lre22_dev_nlxla,xho-xho,0 +lre22_dev_nmmij,ara-ayl,4 +lre22_dev_nmrkv,fra-ntf,12 +lre22_dev_nmufp,tso-tso,10 +lre22_dev_nnbmo,tso-tso,10 +lre22_dev_nnnpi,afr-afr,4 +lre22_dev_nnzok,tir-tir,5 +lre22_dev_noqch,fra-ntf,12 +lre22_dev_nownd,xho-xho,2 +lre22_dev_npabl,nbl-nbl,5 +lre22_dev_npjhu,afr-afr,6 +lre22_dev_nqbks,afr-afr,11 +lre22_dev_nqijo,orm-orm,7 +lre22_dev_nqljj,ara-arq,6 +lre22_dev_nqvfr,tir-tir,7 +lre22_dev_nrtej,tir-tir,11 +lre22_dev_nshvj,nbl-nbl,7 +lre22_dev_nsmyy,tir-tir,12 +lre22_dev_nsqcm,fra-ntf,13 +lre22_dev_nstrj,nbl-nbl,9 +lre22_dev_nsvla,nbl-nbl,10 +lre22_dev_nthbx,eng-ens,0 +lre22_dev_nvwkf,ven-ven,0 +lre22_dev_nvwzy,tso-tso,11 +lre22_dev_nvyyg,orm-orm,7 +lre22_dev_nxdml,eng-ens,1 +lre22_dev_nxmxb,zul-zul,12 +lre22_dev_nxqpl,nbl-nbl,13 +lre22_dev_nxslf,fra-ntf,9 +lre22_dev_nyaof,nbl-nbl,5 +lre22_dev_nzeot,zul-zul,12 +lre22_dev_nzhhf,ara-ayl,7 +lre22_dev_nzpbh,fra-ntf,14 +lre22_dev_nzyjp,orm-orm,4 +lre22_dev_nzzyd,xho-xho,11 +lre22_dev_oaiij,ven-ven,7 +lre22_dev_oaimr,orm-orm,14 +lre22_dev_oatzl,fra-ntf,13 +lre22_dev_oaycx,ara-ayl,8 +lre22_dev_objwd,eng-ens,1 +lre22_dev_oboem,tir-tir,9 +lre22_dev_obzyj,xho-xho,5 +lre22_dev_occhn,fra-ntf,9 +lre22_dev_ocfcr,ven-ven,7 +lre22_dev_ochni,ven-ven,13 +lre22_dev_ociva,tir-tir,5 +lre22_dev_odofq,xho-xho,5 +lre22_dev_odtjr,eng-ens,11 +lre22_dev_oejjy,fra-ntf,4 +lre22_dev_offnw,afr-afr,8 +lre22_dev_ofgqs,ara-ayl,6 
+lre22_dev_ofkvj,xho-xho,15 +lre22_dev_ofzhh,orm-orm,11 +lre22_dev_ogilp,afr-afr,6 +lre22_dev_oglxd,ara-ayl,4 +lre22_dev_ogoyt,tso-tso,8 +lre22_dev_ogpou,ven-ven,3 +lre22_dev_ohatz,eng-ens,10 +lre22_dev_ohlzs,nbl-nbl,15 +lre22_dev_ohpzj,tir-tir,4 +lre22_dev_ohzdt,ara-aeb,5 +lre22_dev_oicrh,eng-ens,9 +lre22_dev_oigem,orm-orm,14 +lre22_dev_ojbnw,ara-arq,4 +lre22_dev_ojebm,ven-ven,7 +lre22_dev_ojila,ara-arq,4 +lre22_dev_ojiso,fra-ntf,5 +lre22_dev_ojpdy,tso-tso,9 +lre22_dev_ojtki,tir-tir,11 +lre22_dev_ojxso,nbl-nbl,4 +lre22_dev_okdqa,fra-ntf,14 +lre22_dev_oktvp,ara-ayl,7 +lre22_dev_okvsg,zul-zul,10 +lre22_dev_okyah,tso-tso,11 +lre22_dev_olabw,ara-arq,4 +lre22_dev_omhry,tir-tir,4 +lre22_dev_omnrf,eng-iaf,13 +lre22_dev_omptm,ven-ven,6 +lre22_dev_omqfq,fra-ntf,4 +lre22_dev_onqdn,fra-ntf,13 +lre22_dev_onsyx,tso-tso,9 +lre22_dev_onvgj,tir-tir,6 +lre22_dev_onzha,zul-zul,10 +lre22_dev_ooptw,nbl-nbl,5 +lre22_dev_oowvo,eng-ens,11 +lre22_dev_ooyea,tso-tso,2 +lre22_dev_oozri,ven-ven,0 +lre22_dev_opazz,ara-ayl,1 +lre22_dev_opqkl,nbl-nbl,11 +lre22_dev_oqsva,ara-ayl,2 +lre22_dev_oquxw,nbl-nbl,15 +lre22_dev_orktv,afr-afr,5 +lre22_dev_ornjf,ara-ayl,6 +lre22_dev_ortbp,ara-arq,0 +lre22_dev_osauy,fra-ntf,12 +lre22_dev_osnch,afr-afr,1 +lre22_dev_otelo,eng-iaf,7 +lre22_dev_otewx,tso-tso,10 +lre22_dev_otnwj,eng-ens,3 +lre22_dev_ouecw,ara-aeb,10 +lre22_dev_ouzui,ara-arq,3 +lre22_dev_ovdtj,ara-ayl,6 +lre22_dev_ovjny,tso-tso,1 +lre22_dev_ovqwp,ara-ayl,7 +lre22_dev_ovvkn,afr-afr,11 +lre22_dev_ovvmi,tso-tso,2 +lre22_dev_owyeq,ara-arq,6 +lre22_dev_oxlrt,ara-aeb,10 +lre22_dev_oybst,zul-zul,9 +lre22_dev_oybua,nbl-nbl,2 +lre22_dev_oykjs,tso-tso,4 +lre22_dev_oyswm,ara-arq,8 +lre22_dev_oyxbj,ven-ven,8 +lre22_dev_oyxtq,eng-ens,11 +lre22_dev_oyyxh,ara-arq,8 +lre22_dev_ozbct,tir-tir,12 +lre22_dev_ozcvt,ara-aeb,10 +lre22_dev_ozjel,ara-arq,10 +lre22_dev_ozmuj,zul-zul,3 +lre22_dev_ozuvk,tir-tir,10 +lre22_dev_paguh,fra-ntf,1 +lre22_dev_paspj,tir-tir,6 +lre22_dev_pbmai,fra-ntf,6 +lre22_dev_pbpug,zul-zul,10 +lre22_dev_pbsbs,tso-tso,10 +lre22_dev_pbszl,tso-tso,1 +lre22_dev_pbxxf,eng-iaf,2 +lre22_dev_pcgvn,eng-iaf,3 +lre22_dev_pcmbn,eng-ens,1 +lre22_dev_pcqce,ara-arq,8 +lre22_dev_pdlnr,tso-tso,2 +lre22_dev_pdrus,orm-orm,1 +lre22_dev_pedyx,eng-iaf,12 +lre22_dev_pegyr,nbl-nbl,11 +lre22_dev_pesej,ara-arq,4 +lre22_dev_pevhh,tir-tir,12 +lre22_dev_peykl,xho-xho,13 +lre22_dev_pezwc,tso-tso,4 +lre22_dev_pfemh,eng-iaf,4 +lre22_dev_pfrfc,ven-ven,8 +lre22_dev_pfsoa,nbl-nbl,15 +lre22_dev_pgeoo,tso-tso,9 +lre22_dev_pgwei,orm-orm,2 +lre22_dev_pgxyv,tso-tso,4 +lre22_dev_phofb,ara-ayl,8 +lre22_dev_phula,nbl-nbl,14 +lre22_dev_phwnf,tso-tso,9 +lre22_dev_pifyx,orm-orm,9 +lre22_dev_pilvp,tso-tso,11 +lre22_dev_pinzj,nbl-nbl,11 +lre22_dev_piocw,ara-aeb,8 +lre22_dev_pipas,zul-zul,13 +lre22_dev_pipgo,afr-afr,3 +lre22_dev_pitmn,ara-arq,10 +lre22_dev_pizdz,ara-aeb,2 +lre22_dev_pizlx,ara-ayl,6 +lre22_dev_pjatg,ven-ven,9 +lre22_dev_pjavt,orm-orm,11 +lre22_dev_pjcec,eng-iaf,12 +lre22_dev_pjdwy,afr-afr,1 +lre22_dev_pjlmw,ara-ayl,7 +lre22_dev_pjsqe,eng-ens,7 +lre22_dev_pkdij,ara-ayl,3 +lre22_dev_pkekq,ara-aeb,3 +lre22_dev_pkpst,eng-iaf,9 +lre22_dev_plhqb,nbl-nbl,13 +lre22_dev_plowv,nbl-nbl,5 +lre22_dev_plrjb,xho-xho,12 +lre22_dev_pmove,eng-iaf,4 +lre22_dev_pneax,eng-ens,11 +lre22_dev_pnexr,nbl-nbl,9 +lre22_dev_pngea,nbl-nbl,11 +lre22_dev_pnipe,eng-ens,9 +lre22_dev_pnmlr,ara-arq,5 +lre22_dev_pnsuk,xho-xho,2 +lre22_dev_pnuct,tir-tir,10 +lre22_dev_pocev,ara-arq,4 +lre22_dev_powkd,eng-ens,9 +lre22_dev_pprvm,ara-ayl,7 +lre22_dev_ppyle,ara-aeb,7 +lre22_dev_pqfda,fra-ntf,5 
+lre22_dev_pqryo,afr-afr,4 +lre22_dev_prrzc,afr-afr,9 +lre22_dev_psjuf,afr-afr,13 +lre22_dev_psngm,zul-zul,13 +lre22_dev_psroz,fra-ntf,13 +lre22_dev_pssqo,orm-orm,10 +lre22_dev_psvlh,fra-ntf,13 +lre22_dev_pswld,tir-tir,10 +lre22_dev_ptcns,nbl-nbl,11 +lre22_dev_ptobm,afr-afr,6 +lre22_dev_ptowg,tir-tir,8 +lre22_dev_ptreu,xho-xho,15 +lre22_dev_ptwru,fra-ntf,14 +lre22_dev_ptyff,ara-ayl,1 +lre22_dev_ptygm,tir-tir,3 +lre22_dev_pudne,ara-arq,4 +lre22_dev_puelp,zul-zul,9 +lre22_dev_purej,nbl-nbl,9 +lre22_dev_puyvb,ara-ayl,3 +lre22_dev_pvrdh,ara-aeb,9 +lre22_dev_pvryr,eng-ens,11 +lre22_dev_pwets,tir-tir,9 +lre22_dev_pwgnk,tir-tir,10 +lre22_dev_pwhyy,tir-tir,11 +lre22_dev_pwkgs,zul-zul,2 +lre22_dev_pwtdp,eng-iaf,0 +lre22_dev_pxccc,ara-ayl,5 +lre22_dev_pxpdo,xho-xho,14 +lre22_dev_pxsot,xho-xho,14 +lre22_dev_pxuhy,ara-aeb,6 +lre22_dev_pybxn,eng-iaf,11 +lre22_dev_pyoft,eng-iaf,12 +lre22_dev_pyvql,eng-iaf,7 +lre22_dev_pzcnz,nbl-nbl,2 +lre22_dev_pzhrk,ara-aeb,4 +lre22_dev_qadjy,ven-ven,7 +lre22_dev_qaeek,ven-ven,7 +lre22_dev_qafse,eng-iaf,11 +lre22_dev_qahft,ven-ven,13 +lre22_dev_qakoa,zul-zul,9 +lre22_dev_qalhd,ara-ayl,2 +lre22_dev_qazjh,ven-ven,11 +lre22_dev_qbfkw,eng-iaf,6 +lre22_dev_qbgcd,fra-ntf,14 +lre22_dev_qbisr,ara-ayl,3 +lre22_dev_qcnbm,ven-ven,3 +lre22_dev_qdcbb,tir-tir,5 +lre22_dev_qdfgi,zul-zul,12 +lre22_dev_qdmbj,eng-ens,4 +lre22_dev_qdwtg,fra-ntf,11 +lre22_dev_qefvt,ara-ayl,7 +lre22_dev_qffki,orm-orm,13 +lre22_dev_qfplk,tir-tir,8 +lre22_dev_qgxdl,xho-xho,14 +lre22_dev_qhadd,afr-afr,2 +lre22_dev_qhgaf,ara-ayl,7 +lre22_dev_qhinf,tir-tir,6 +lre22_dev_qhkjz,ara-aeb,6 +lre22_dev_qhlwj,ara-arq,8 +lre22_dev_qiarf,ara-arq,4 +lre22_dev_qidwl,ara-arq,5 +lre22_dev_qivzc,orm-orm,12 +lre22_dev_qizyt,ara-ayl,2 +lre22_dev_qjeue,ara-arq,9 +lre22_dev_qjgxh,ara-arq,1 +lre22_dev_qkdhb,afr-afr,1 +lre22_dev_qkiqi,orm-orm,4 +lre22_dev_qkoth,tir-tir,5 +lre22_dev_qkucq,fra-ntf,3 +lre22_dev_qltea,nbl-nbl,2 +lre22_dev_qlube,ara-aeb,5 +lre22_dev_qmcji,nbl-nbl,15 +lre22_dev_qmpzc,nbl-nbl,11 +lre22_dev_qmsog,tir-tir,3 +lre22_dev_qoech,eng-iaf,7 +lre22_dev_qovfg,ara-arq,10 +lre22_dev_qozzv,tir-tir,2 +lre22_dev_qpasx,tir-tir,3 +lre22_dev_qpauj,ara-aeb,4 +lre22_dev_qpfch,orm-orm,6 +lre22_dev_qpvea,orm-orm,9 +lre22_dev_qrgka,ara-arq,8 +lre22_dev_qrqmm,ara-ayl,7 +lre22_dev_qsaol,xho-xho,14 +lre22_dev_qsgpx,ara-arq,10 +lre22_dev_qspeg,eng-ens,7 +lre22_dev_qsvbe,fra-ntf,3 +lre22_dev_qsxoh,fra-ntf,5 +lre22_dev_qtbnc,xho-xho,7 +lre22_dev_qthzi,afr-afr,12 +lre22_dev_qtmaw,fra-ntf,13 +lre22_dev_qtnqh,eng-iaf,13 +lre22_dev_qtpsb,tso-tso,8 +lre22_dev_qtqpc,eng-iaf,12 +lre22_dev_qtwfv,eng-iaf,4 +lre22_dev_qvamq,fra-ntf,9 +lre22_dev_qveuq,tir-tir,9 +lre22_dev_qvffg,orm-orm,0 +lre22_dev_qvplf,xho-xho,6 +lre22_dev_qvqvi,ven-ven,7 +lre22_dev_qwhsh,afr-afr,7 +lre22_dev_qwiwm,eng-ens,9 +lre22_dev_qxbch,ara-aeb,9 +lre22_dev_qxlca,nbl-nbl,2 +lre22_dev_qxscb,afr-afr,2 +lre22_dev_qyoqn,fra-ntf,9 +lre22_dev_qyrgs,nbl-nbl,3 +lre22_dev_qytdl,fra-ntf,9 +lre22_dev_qyyeb,eng-iaf,12 +lre22_dev_qyzqb,tso-tso,8 +lre22_dev_qzayi,orm-orm,12 +lre22_dev_qzexr,eng-iaf,5 +lre22_dev_qzrfi,ara-arq,10 +lre22_dev_qztjh,orm-orm,3 +lre22_dev_qztze,eng-iaf,12 +lre22_dev_raent,eng-iaf,2 +lre22_dev_ragjh,orm-orm,14 +lre22_dev_ramzu,ara-ayl,6 +lre22_dev_ratmr,ven-ven,7 +lre22_dev_rawak,ara-arq,9 +lre22_dev_rbbne,ven-ven,7 +lre22_dev_rbcul,eng-iaf,10 +lre22_dev_rbsoy,eng-iaf,12 +lre22_dev_rbxqy,tso-tso,9 +lre22_dev_rcejf,xho-xho,7 +lre22_dev_rdbzt,zul-zul,7 +lre22_dev_rdhpu,ara-aeb,8 +lre22_dev_rdsew,ven-ven,2 +lre22_dev_rdtkf,ven-ven,11 +lre22_dev_reeba,ara-ayl,6 
+lre22_dev_relip,eng-iaf,11 +lre22_dev_rfdoh,ara-aeb,9 +lre22_dev_rfkja,xho-xho,11 +lre22_dev_rflev,ven-ven,3 +lre22_dev_rfqcx,nbl-nbl,14 +lre22_dev_rfwuv,eng-ens,1 +lre22_dev_rgsil,fra-ntf,6 +lre22_dev_rhcuj,ara-aeb,8 +lre22_dev_rhdgz,eng-iaf,12 +lre22_dev_rhpmn,ven-ven,7 +lre22_dev_rhtoe,eng-iaf,11 +lre22_dev_rhyqq,ara-aeb,2 +lre22_dev_riltn,ara-aeb,10 +lre22_dev_rinti,xho-xho,12 +lre22_dev_rioxh,xho-xho,12 +lre22_dev_ripix,tir-tir,10 +lre22_dev_rjbji,ven-ven,10 +lre22_dev_rjqbz,eng-iaf,0 +lre22_dev_rkemd,tir-tir,8 +lre22_dev_rktzl,nbl-nbl,13 +lre22_dev_rkuni,xho-xho,15 +lre22_dev_rlsgd,fra-ntf,5 +lre22_dev_rlypa,afr-afr,7 +lre22_dev_rmeav,ven-ven,8 +lre22_dev_rmejy,fra-ntf,12 +lre22_dev_rmeuz,zul-zul,6 +lre22_dev_rmjsj,nbl-nbl,5 +lre22_dev_rmtxj,eng-iaf,13 +lre22_dev_rnpyc,ara-ayl,2 +lre22_dev_rnunw,orm-orm,9 +lre22_dev_rnvvw,tso-tso,9 +lre22_dev_roavh,fra-ntf,6 +lre22_dev_rodbi,xho-xho,15 +lre22_dev_roeph,xho-xho,13 +lre22_dev_rolun,ara-ayl,3 +lre22_dev_roydh,xho-xho,7 +lre22_dev_rpajy,ara-aeb,8 +lre22_dev_rpdsm,ara-ayl,5 +lre22_dev_rpfae,afr-afr,9 +lre22_dev_rpvyc,eng-iaf,9 +lre22_dev_rqxot,tso-tso,9 +lre22_dev_rumiv,ara-aeb,9 +lre22_dev_runhh,afr-afr,6 +lre22_dev_ruvpd,eng-iaf,4 +lre22_dev_rvpkd,fra-ntf,1 +lre22_dev_rvqxq,orm-orm,12 +lre22_dev_rvstc,ara-arq,7 +lre22_dev_rwbea,tir-tir,9 +lre22_dev_rweyk,nbl-nbl,2 +lre22_dev_rwnfb,eng-ens,8 +lre22_dev_rwrhn,afr-afr,11 +lre22_dev_rxhkp,ara-arq,3 +lre22_dev_rxixz,nbl-nbl,15 +lre22_dev_rxmft,zul-zul,7 +lre22_dev_ryknh,ara-ayl,5 +lre22_dev_rytyf,zul-zul,12 +lre22_dev_rywss,tso-tso,1 +lre22_dev_rzjrd,nbl-nbl,7 +lre22_dev_rzpyx,tso-tso,2 +lre22_dev_satbk,ven-ven,7 +lre22_dev_sbfhc,fra-ntf,6 +lre22_dev_sboxi,xho-xho,15 +lre22_dev_scxxn,eng-iaf,5 +lre22_dev_scyvp,ara-aeb,6 +lre22_dev_sdbou,tir-tir,10 +lre22_dev_sddua,tir-tir,11 +lre22_dev_seasj,afr-afr,7 +lre22_dev_sevcw,tir-tir,12 +lre22_dev_sfevx,tso-tso,4 +lre22_dev_sfqgm,fra-ntf,1 +lre22_dev_sgaza,ara-aeb,8 +lre22_dev_sgkrh,afr-afr,9 +lre22_dev_sgmjh,nbl-nbl,14 +lre22_dev_shafn,ven-ven,8 +lre22_dev_shaob,orm-orm,10 +lre22_dev_shnns,afr-afr,6 +lre22_dev_siprc,ven-ven,7 +lre22_dev_sisge,afr-afr,13 +lre22_dev_siuwu,ara-arq,10 +lre22_dev_sivik,fra-ntf,2 +lre22_dev_sjyoo,afr-afr,1 +lre22_dev_skacz,fra-ntf,13 +lre22_dev_skcai,orm-orm,12 +lre22_dev_skctw,nbl-nbl,0 +lre22_dev_skygk,afr-afr,13 +lre22_dev_slraf,ara-aeb,6 +lre22_dev_slrzl,eng-ens,11 +lre22_dev_sltzh,xho-xho,6 +lre22_dev_sluki,ven-ven,1 +lre22_dev_slyez,tso-tso,8 +lre22_dev_slzuh,xho-xho,15 +lre22_dev_smdsm,nbl-nbl,7 +lre22_dev_smhae,ara-ayl,3 +lre22_dev_smxhe,ara-aeb,10 +lre22_dev_snayr,afr-afr,2 +lre22_dev_snbxs,eng-ens,8 +lre22_dev_sngol,tso-tso,9 +lre22_dev_snhun,fra-ntf,13 +lre22_dev_snkib,ven-ven,8 +lre22_dev_snqld,eng-iaf,2 +lre22_dev_sntvb,eng-ens,11 +lre22_dev_snzbl,tir-tir,12 +lre22_dev_sobid,afr-afr,3 +lre22_dev_soknx,orm-orm,15 +lre22_dev_spesw,ven-ven,13 +lre22_dev_sphuq,eng-iaf,12 +lre22_dev_spqcy,xho-xho,11 +lre22_dev_sqcyu,zul-zul,9 +lre22_dev_sqdkr,eng-iaf,13 +lre22_dev_sqfnt,ara-aeb,9 +lre22_dev_sqhrr,eng-ens,11 +lre22_dev_sqyiu,ara-ayl,4 +lre22_dev_srbwp,ara-aeb,10 +lre22_dev_srokn,afr-afr,6 +lre22_dev_srzck,ara-ayl,3 +lre22_dev_ssbei,tso-tso,10 +lre22_dev_ssfmz,eng-iaf,12 +lre22_dev_ssmgk,xho-xho,10 +lre22_dev_ssmsy,xho-xho,4 +lre22_dev_stgcb,afr-afr,10 +lre22_dev_stihb,afr-afr,0 +lre22_dev_stkav,ara-aeb,9 +lre22_dev_stkrw,xho-xho,3 +lre22_dev_sttnk,fra-ntf,8 +lre22_dev_stwkk,eng-iaf,12 +lre22_dev_stwrt,nbl-nbl,1 +lre22_dev_subio,afr-afr,1 +lre22_dev_sumjk,ara-arq,6 +lre22_dev_suocb,nbl-nbl,6 
+lre22_dev_svcbx,tso-tso,9 +lre22_dev_svllg,fra-ntf,14 +lre22_dev_svvqs,afr-afr,3 +lre22_dev_svxyz,ara-ayl,1 +lre22_dev_swhlf,ara-aeb,10 +lre22_dev_swhnk,fra-ntf,12 +lre22_dev_swnrg,ven-ven,12 +lre22_dev_swofz,zul-zul,4 +lre22_dev_swuls,tso-tso,8 +lre22_dev_sxfkn,ara-aeb,2 +lre22_dev_sycoz,tir-tir,10 +lre22_dev_syoek,fra-ntf,5 +lre22_dev_sypnb,ven-ven,13 +lre22_dev_syvrt,eng-iaf,8 +lre22_dev_szmoc,ven-ven,6 +lre22_dev_szmwp,eng-ens,8 +lre22_dev_talec,ven-ven,11 +lre22_dev_tasfs,ven-ven,7 +lre22_dev_tbbrr,xho-xho,5 +lre22_dev_tbcun,ara-aeb,3 +lre22_dev_tbhnw,nbl-nbl,15 +lre22_dev_tblhf,ven-ven,12 +lre22_dev_tbozq,xho-xho,1 +lre22_dev_tcckd,ara-ayl,3 +lre22_dev_tcele,tso-tso,11 +lre22_dev_tciob,tso-tso,10 +lre22_dev_tcpxj,tir-tir,9 +lre22_dev_tdejo,tir-tir,6 +lre22_dev_tdfqo,tso-tso,0 +lre22_dev_tdhhf,zul-zul,10 +lre22_dev_tdjje,ven-ven,10 +lre22_dev_tdkrp,orm-orm,6 +lre22_dev_tebop,tso-tso,10 +lre22_dev_teeqm,ven-ven,6 +lre22_dev_tejsn,tir-tir,12 +lre22_dev_teptc,ara-arq,10 +lre22_dev_tetmt,orm-orm,9 +lre22_dev_tfkij,ara-aeb,2 +lre22_dev_tfnin,tir-tir,3 +lre22_dev_tfyqz,tir-tir,3 +lre22_dev_tgbui,ara-aeb,5 +lre22_dev_tgixi,xho-xho,13 +lre22_dev_tgmud,eng-iaf,6 +lre22_dev_tgult,eng-ens,2 +lre22_dev_thcjv,tso-tso,5 +lre22_dev_thzir,eng-ens,11 +lre22_dev_tisfm,fra-ntf,9 +lre22_dev_tixou,xho-xho,2 +lre22_dev_tiyuw,afr-afr,5 +lre22_dev_tjdcc,afr-afr,13 +lre22_dev_tjikt,zul-zul,12 +lre22_dev_tjpdw,ara-arq,8 +lre22_dev_tkadi,ven-ven,12 +lre22_dev_tkcbm,afr-afr,6 +lre22_dev_tkgfw,eng-ens,11 +lre22_dev_tkiks,ara-aeb,6 +lre22_dev_tlgzi,xho-xho,1 +lre22_dev_tlhlw,tir-tir,6 +lre22_dev_tloqn,afr-afr,6 +lre22_dev_tmcje,eng-ens,4 +lre22_dev_tmjpw,eng-iaf,2 +lre22_dev_tmxtu,ven-ven,2 +lre22_dev_tngwh,tir-tir,8 +lre22_dev_tnqdv,ara-aeb,9 +lre22_dev_tnqro,xho-xho,15 +lre22_dev_tnqzy,orm-orm,7 +lre22_dev_tnskm,xho-xho,12 +lre22_dev_tnvhc,ven-ven,12 +lre22_dev_tofhy,zul-zul,6 +lre22_dev_tohkd,zul-zul,9 +lre22_dev_tonqb,ven-ven,6 +lre22_dev_tpbib,tso-tso,1 +lre22_dev_tpejq,ara-arq,3 +lre22_dev_tpfir,eng-ens,11 +lre22_dev_tphgn,zul-zul,12 +lre22_dev_tpidd,ara-arq,6 +lre22_dev_tpkce,eng-ens,11 +lre22_dev_tpszi,orm-orm,15 +lre22_dev_tpwcn,eng-iaf,6 +lre22_dev_trdfy,ara-ayl,3 +lre22_dev_tsbms,ara-ayl,4 +lre22_dev_tslui,tso-tso,6 +lre22_dev_tsvvy,zul-zul,10 +lre22_dev_tsyey,xho-xho,10 +lre22_dev_ttlco,eng-iaf,12 +lre22_dev_tubpr,orm-orm,13 +lre22_dev_tugpl,eng-ens,9 +lre22_dev_tuoiq,tir-tir,4 +lre22_dev_tuxfx,zul-zul,3 +lre22_dev_tvahj,tir-tir,9 +lre22_dev_tvewc,eng-iaf,3 +lre22_dev_tvfvc,ara-ayl,8 +lre22_dev_tvkod,xho-xho,5 +lre22_dev_tvkwe,zul-zul,9 +lre22_dev_tvopo,xho-xho,12 +lre22_dev_tvqui,eng-ens,7 +lre22_dev_tvsbw,ara-arq,6 +lre22_dev_tvxvk,ven-ven,8 +lre22_dev_twbkf,nbl-nbl,9 +lre22_dev_twfot,ara-arq,6 +lre22_dev_twkns,ara-ayl,4 +lre22_dev_twuvf,eng-ens,10 +lre22_dev_txahv,eng-ens,8 +lre22_dev_txcob,ara-aeb,6 +lre22_dev_txnvi,zul-zul,3 +lre22_dev_txurh,afr-afr,7 +lre22_dev_txzkl,ara-arq,5 +lre22_dev_tyfad,tso-tso,7 +lre22_dev_tyhwp,ara-aeb,8 +lre22_dev_tzism,tir-tir,12 +lre22_dev_tzsfj,tir-tir,12 +lre22_dev_tzwof,eng-iaf,9 +lre22_dev_uahzm,afr-afr,5 +lre22_dev_uajwt,tso-tso,7 +lre22_dev_uanlr,zul-zul,13 +lre22_dev_uaoju,zul-zul,8 +lre22_dev_uaryk,xho-xho,15 +lre22_dev_ubfaf,ven-ven,12 +lre22_dev_ucbje,ara-aeb,8 +lre22_dev_ucrpa,ara-arq,3 +lre22_dev_udtzx,eng-iaf,7 +lre22_dev_uduja,fra-ntf,6 +lre22_dev_udxpl,tso-tso,2 +lre22_dev_uesmx,eng-iaf,5 +lre22_dev_ufewk,eng-iaf,8 +lre22_dev_ugjxy,tir-tir,4 +lre22_dev_ugsxl,eng-ens,3 +lre22_dev_ugvov,tso-tso,8 +lre22_dev_uhmdw,tso-tso,10 +lre22_dev_uhqng,nbl-nbl,12 
+lre22_dev_uhymw,tir-tir,8 +lre22_dev_uhzmr,eng-ens,2 +lre22_dev_uimtg,ara-ayl,4 +lre22_dev_uirdr,nbl-nbl,13 +lre22_dev_uiszj,ara-aeb,8 +lre22_dev_ujada,ara-ayl,9 +lre22_dev_ujmqw,ven-ven,4 +lre22_dev_ujswr,afr-afr,11 +lre22_dev_ujvve,xho-xho,10 +lre22_dev_ukfha,ara-ayl,6 +lre22_dev_ukkpr,eng-ens,10 +lre22_dev_ukpdg,fra-ntf,13 +lre22_dev_ukpoy,nbl-nbl,15 +lre22_dev_uktod,ara-ayl,4 +lre22_dev_uktvh,zul-zul,13 +lre22_dev_ukuwo,ara-ayl,5 +lre22_dev_ukynv,zul-zul,12 +lre22_dev_ulepv,ara-ayl,5 +lre22_dev_ulgtj,zul-zul,7 +lre22_dev_ulofk,eng-iaf,11 +lre22_dev_uluog,ara-arq,3 +lre22_dev_umbpy,zul-zul,13 +lre22_dev_umjzo,tso-tso,5 +lre22_dev_uncdb,ara-arq,9 +lre22_dev_unffr,ara-ayl,8 +lre22_dev_unpif,eng-ens,9 +lre22_dev_uoikj,eng-iaf,13 +lre22_dev_uopfp,nbl-nbl,7 +lre22_dev_upenl,eng-iaf,13 +lre22_dev_uphuw,xho-xho,11 +lre22_dev_upkbw,ara-ayl,4 +lre22_dev_uplen,xho-xho,9 +lre22_dev_upqod,orm-orm,6 +lre22_dev_upspe,afr-afr,12 +lre22_dev_uqnkk,tir-tir,12 +lre22_dev_uqvxc,eng-ens,0 +lre22_dev_urgqx,ara-ayl,8 +lre22_dev_urkgk,tir-tir,12 +lre22_dev_uscky,xho-xho,3 +lre22_dev_usiwx,tir-tir,9 +lre22_dev_usnzj,zul-zul,5 +lre22_dev_usopt,xho-xho,8 +lre22_dev_uswgv,nbl-nbl,11 +lre22_dev_uszcb,ara-arq,4 +lre22_dev_utahf,ara-ayl,7 +lre22_dev_utaxq,tso-tso,9 +lre22_dev_utcwb,afr-afr,10 +lre22_dev_uuhry,tir-tir,9 +lre22_dev_uuprr,eng-ens,7 +lre22_dev_uuvqh,zul-zul,2 +lre22_dev_uwcmh,orm-orm,4 +lre22_dev_uwiev,zul-zul,13 +lre22_dev_uwjzb,ven-ven,10 +lre22_dev_uwony,orm-orm,1 +lre22_dev_uwqeq,orm-orm,2 +lre22_dev_uwvfl,nbl-nbl,5 +lre22_dev_uxdjn,xho-xho,12 +lre22_dev_uxqte,zul-zul,13 +lre22_dev_uxryh,ven-ven,11 +lre22_dev_uyhzp,orm-orm,15 +lre22_dev_uyrjl,tso-tso,10 +lre22_dev_uyzcl,eng-ens,11 +lre22_dev_uzbqz,fra-ntf,4 +lre22_dev_uzoxq,ara-aeb,9 +lre22_dev_vabxl,nbl-nbl,11 +lre22_dev_vafyo,nbl-nbl,15 +lre22_dev_vascl,nbl-nbl,0 +lre22_dev_vauqx,ara-arq,10 +lre22_dev_vbscm,xho-xho,3 +lre22_dev_vbulh,xho-xho,12 +lre22_dev_vbwwp,xho-xho,15 +lre22_dev_vbznk,ara-arq,6 +lre22_dev_vcibu,nbl-nbl,9 +lre22_dev_vcjun,zul-zul,12 +lre22_dev_vckxt,xho-xho,7 +lre22_dev_vdkjy,fra-ntf,14 +lre22_dev_vdmyt,ara-ayl,0 +lre22_dev_vdoif,ven-ven,13 +lre22_dev_vdvjv,orm-orm,12 +lre22_dev_vebet,ara-aeb,1 +lre22_dev_velkr,ara-aeb,1 +lre22_dev_vgbmm,tir-tir,9 +lre22_dev_vgucw,nbl-nbl,7 +lre22_dev_vhiyb,afr-afr,9 +lre22_dev_vhoej,tir-tir,5 +lre22_dev_vhryd,orm-orm,13 +lre22_dev_vhzdh,tso-tso,10 +lre22_dev_viapx,tso-tso,3 +lre22_dev_vifdj,ara-ayl,4 +lre22_dev_vijbo,zul-zul,12 +lre22_dev_virnr,eng-ens,6 +lre22_dev_vjhbd,orm-orm,6 +lre22_dev_vjoca,ara-aeb,10 +lre22_dev_vjtou,eng-ens,5 +lre22_dev_vjxpv,ara-aeb,10 +lre22_dev_vkmab,fra-ntf,2 +lre22_dev_vkrvz,tir-tir,8 +lre22_dev_vkwwf,tso-tso,9 +lre22_dev_vlbdk,zul-zul,6 +lre22_dev_vliie,orm-orm,9 +lre22_dev_vlrve,eng-iaf,2 +lre22_dev_vmaet,tir-tir,3 +lre22_dev_vmdhi,eng-ens,10 +lre22_dev_vmdjw,nbl-nbl,13 +lre22_dev_vmjut,fra-ntf,9 +lre22_dev_vmrrg,eng-ens,3 +lre22_dev_vnjxn,nbl-nbl,7 +lre22_dev_vnmxm,ven-ven,12 +lre22_dev_vnykj,zul-zul,10 +lre22_dev_vovab,zul-zul,11 +lre22_dev_vovvl,zul-zul,11 +lre22_dev_vpcey,tir-tir,6 +lre22_dev_vpodd,nbl-nbl,11 +lre22_dev_vptke,eng-ens,4 +lre22_dev_vpulr,xho-xho,15 +lre22_dev_vpuve,tir-tir,8 +lre22_dev_vqttr,eng-iaf,12 +lre22_dev_vqzae,eng-iaf,11 +lre22_dev_vrnsg,tso-tso,8 +lre22_dev_vshpc,ara-aeb,6 +lre22_dev_vslbh,ara-arq,9 +lre22_dev_vsmaz,tir-tir,5 +lre22_dev_vsnez,tso-tso,8 +lre22_dev_vsnjp,fra-ntf,14 +lre22_dev_vsocn,ven-ven,7 +lre22_dev_vsvom,afr-afr,8 +lre22_dev_vtnfc,tir-tir,4 +lre22_dev_vtnlb,eng-ens,4 +lre22_dev_vubwb,eng-ens,8 
+lre22_dev_vufsn,ara-aeb,3 +lre22_dev_vuiqu,tir-tir,8 +lre22_dev_vumeq,xho-xho,0 +lre22_dev_vupse,ven-ven,6 +lre22_dev_vvauz,xho-xho,14 +lre22_dev_vvfze,eng-ens,11 +lre22_dev_vviyr,zul-zul,12 +lre22_dev_vvwiq,fra-ntf,5 +lre22_dev_vwnkj,zul-zul,5 +lre22_dev_vwoww,orm-orm,7 +lre22_dev_vwtne,afr-afr,5 +lre22_dev_vwxgt,ara-arq,10 +lre22_dev_vxabl,eng-ens,8 +lre22_dev_vxnsl,afr-afr,7 +lre22_dev_vxslj,tir-tir,10 +lre22_dev_vxsvc,tir-tir,11 +lre22_dev_vxuiz,ara-aeb,10 +lre22_dev_vzarl,ara-ayl,7 +lre22_dev_vzeew,ven-ven,6 +lre22_dev_vzjtc,ara-arq,0 +lre22_dev_vzkdb,tso-tso,10 +lre22_dev_vzvpq,ara-arq,9 +lre22_dev_waqyh,xho-xho,15 +lre22_dev_wawwu,xho-xho,14 +lre22_dev_wbgqi,tso-tso,11 +lre22_dev_wcctp,eng-ens,10 +lre22_dev_wdcer,afr-afr,3 +lre22_dev_wdeor,fra-ntf,14 +lre22_dev_wdfdd,eng-iaf,2 +lre22_dev_wdkvb,eng-ens,11 +lre22_dev_wdogx,ara-aeb,7 +lre22_dev_wdqdq,ara-arq,10 +lre22_dev_wdxwu,tir-tir,5 +lre22_dev_weaek,ara-arq,4 +lre22_dev_wefui,tso-tso,10 +lre22_dev_wehjh,tir-tir,10 +lre22_dev_weypz,nbl-nbl,12 +lre22_dev_wffdy,zul-zul,12 +lre22_dev_wffgq,tso-tso,8 +lre22_dev_wfvlh,ven-ven,8 +lre22_dev_wgago,eng-ens,5 +lre22_dev_wglzd,afr-afr,11 +lre22_dev_wgsbu,afr-afr,5 +lre22_dev_whdhw,nbl-nbl,7 +lre22_dev_whogu,eng-iaf,13 +lre22_dev_whpee,tso-tso,9 +lre22_dev_whqpd,ara-aeb,9 +lre22_dev_wikrr,ven-ven,11 +lre22_dev_witju,fra-ntf,11 +lre22_dev_wjcme,orm-orm,10 +lre22_dev_wkare,ara-arq,2 +lre22_dev_wkbfe,afr-afr,9 +lre22_dev_wkecn,xho-xho,13 +lre22_dev_wkhxo,afr-afr,9 +lre22_dev_wlgae,ara-arq,6 +lre22_dev_wlnls,eng-iaf,7 +lre22_dev_wlsxb,eng-ens,1 +lre22_dev_wlwuc,nbl-nbl,8 +lre22_dev_wnaqr,nbl-nbl,9 +lre22_dev_wndpq,fra-ntf,13 +lre22_dev_wnkdc,ara-ayl,2 +lre22_dev_wnknc,nbl-nbl,9 +lre22_dev_wnppz,orm-orm,15 +lre22_dev_wpzgm,afr-afr,13 +lre22_dev_wqhqj,ara-ayl,9 +lre22_dev_wqreb,afr-afr,11 +lre22_dev_wqrez,eng-ens,4 +lre22_dev_wqtsf,ara-arq,8 +lre22_dev_wqwtc,orm-orm,3 +lre22_dev_wrfwf,ven-ven,7 +lre22_dev_wrqqt,orm-orm,15 +lre22_dev_wrutf,afr-afr,7 +lre22_dev_wrvzk,nbl-nbl,1 +lre22_dev_wrxly,fra-ntf,13 +lre22_dev_wsbiw,ara-aeb,8 +lre22_dev_wshay,zul-zul,8 +lre22_dev_wsous,tso-tso,5 +lre22_dev_wszpj,ven-ven,7 +lre22_dev_wtksi,afr-afr,8 +lre22_dev_wugbw,xho-xho,6 +lre22_dev_wujfv,afr-afr,11 +lre22_dev_wuwek,xho-xho,12 +lre22_dev_wvhhk,fra-ntf,2 +lre22_dev_wvosz,nbl-nbl,3 +lre22_dev_wwagu,xho-xho,14 +lre22_dev_wwbuj,eng-iaf,2 +lre22_dev_wwgnr,afr-afr,10 +lre22_dev_wwjev,afr-afr,12 +lre22_dev_wwmsu,ara-arq,4 +lre22_dev_wwrmy,ven-ven,7 +lre22_dev_wwvhd,ara-arq,9 +lre22_dev_wxdjv,ara-ayl,6 +lre22_dev_wygox,tir-tir,6 +lre22_dev_wyhuq,zul-zul,13 +lre22_dev_wzoir,xho-xho,15 +lre22_dev_wzvwa,orm-orm,6 +lre22_dev_xapvn,tso-tso,8 +lre22_dev_xarkl,eng-ens,5 +lre22_dev_xavhh,nbl-nbl,10 +lre22_dev_xazuy,orm-orm,3 +lre22_dev_xbnft,eng-iaf,0 +lre22_dev_xbqbc,fra-ntf,7 +lre22_dev_xbzfw,tir-tir,11 +lre22_dev_xccde,ara-arq,3 +lre22_dev_xcdty,zul-zul,8 +lre22_dev_xcjkb,ara-ayl,7 +lre22_dev_xcmty,ara-arq,10 +lre22_dev_xcsbc,tso-tso,1 +lre22_dev_xdkjb,nbl-nbl,11 +lre22_dev_xdknq,nbl-nbl,11 +lre22_dev_xdoik,eng-ens,10 +lre22_dev_xdtyd,nbl-nbl,4 +lre22_dev_xearl,eng-iaf,3 +lre22_dev_xedqa,nbl-nbl,11 +lre22_dev_xefnx,eng-ens,11 +lre22_dev_xeipr,tir-tir,11 +lre22_dev_xekhs,zul-zul,9 +lre22_dev_xelzr,ara-aeb,9 +lre22_dev_xenhb,ara-aeb,3 +lre22_dev_xfdsx,xho-xho,12 +lre22_dev_xfggl,xho-xho,9 +lre22_dev_xgspz,eng-iaf,13 +lre22_dev_xgwmu,tso-tso,8 +lre22_dev_xhbmk,orm-orm,15 +lre22_dev_xhdtl,orm-orm,3 +lre22_dev_xisjn,ara-arq,8 +lre22_dev_xitdz,nbl-nbl,10 +lre22_dev_xizbg,xho-xho,14 +lre22_dev_xjcph,xho-xho,10 
+lre22_dev_xjcvd,zul-zul,7 +lre22_dev_xjlgm,ara-aeb,3 +lre22_dev_xjxzy,eng-ens,2 +lre22_dev_xkfsd,ven-ven,12 +lre22_dev_xkktj,eng-iaf,12 +lre22_dev_xkmmy,ara-aeb,10 +lre22_dev_xltgz,ara-ayl,5 +lre22_dev_xmbby,orm-orm,3 +lre22_dev_xmcmv,xho-xho,14 +lre22_dev_xngam,fra-ntf,14 +lre22_dev_xnsev,ara-ayl,8 +lre22_dev_xnwsq,ara-arq,8 +lre22_dev_xnwwh,zul-zul,13 +lre22_dev_xobeh,tir-tir,11 +lre22_dev_xolau,ven-ven,13 +lre22_dev_xoqtn,eng-iaf,10 +lre22_dev_xovpd,eng-iaf,10 +lre22_dev_xpaff,eng-ens,9 +lre22_dev_xpahm,ara-arq,4 +lre22_dev_xpcrs,tso-tso,5 +lre22_dev_xpdsg,eng-iaf,5 +lre22_dev_xpjqj,nbl-nbl,6 +lre22_dev_xqwtk,ara-arq,10 +lre22_dev_xrfge,ara-arq,8 +lre22_dev_xrhka,orm-orm,9 +lre22_dev_xrpup,zul-zul,8 +lre22_dev_xsbff,ara-aeb,9 +lre22_dev_xsffv,tso-tso,1 +lre22_dev_xstnu,eng-ens,5 +lre22_dev_xthfd,ara-aeb,8 +lre22_dev_xthzz,ven-ven,4 +lre22_dev_xtmgg,eng-iaf,13 +lre22_dev_xtyic,nbl-nbl,14 +lre22_dev_xucyl,eng-ens,7 +lre22_dev_xudii,ara-ayl,3 +lre22_dev_xugux,afr-afr,0 +lre22_dev_xuqnj,ara-ayl,4 +lre22_dev_xvaoh,nbl-nbl,9 +lre22_dev_xvclh,afr-afr,9 +lre22_dev_xveae,xho-xho,4 +lre22_dev_xxpqz,ara-arq,9 +lre22_dev_xxqad,tso-tso,10 +lre22_dev_xybed,tir-tir,9 +lre22_dev_xyrex,eng-ens,11 +lre22_dev_xzlas,eng-iaf,9 +lre22_dev_xztyr,orm-orm,9 +lre22_dev_yaxkb,zul-zul,12 +lre22_dev_ybcvu,xho-xho,13 +lre22_dev_ybjon,orm-orm,2 +lre22_dev_ybubm,ven-ven,5 +lre22_dev_ycarc,eng-ens,6 +lre22_dev_ychjj,orm-orm,2 +lre22_dev_ycnyc,tir-tir,7 +lre22_dev_ycsvt,afr-afr,12 +lre22_dev_ydaxa,nbl-nbl,8 +lre22_dev_ydrxu,nbl-nbl,1 +lre22_dev_yeekw,fra-ntf,13 +lre22_dev_yevan,tir-tir,11 +lre22_dev_yfaan,tir-tir,10 +lre22_dev_yfayx,afr-afr,6 +lre22_dev_yfpsd,fra-ntf,1 +lre22_dev_yfxkm,ven-ven,7 +lre22_dev_yguqk,ven-ven,3 +lre22_dev_yhrgj,afr-afr,8 +lre22_dev_yhzyq,ara-ayl,5 +lre22_dev_yiqui,eng-iaf,12 +lre22_dev_yjens,ara-ayl,7 +lre22_dev_yjkxx,eng-ens,8 +lre22_dev_yjypk,ara-ayl,9 +lre22_dev_ykchd,ven-ven,8 +lre22_dev_ykktl,xho-xho,0 +lre22_dev_ylhwh,orm-orm,9 +lre22_dev_ylnms,tso-tso,2 +lre22_dev_ylsdz,ven-ven,7 +lre22_dev_ymcmp,eng-iaf,8 +lre22_dev_ymfzx,tso-tso,7 +lre22_dev_ymizm,fra-ntf,0 +lre22_dev_ympvj,tir-tir,9 +lre22_dev_ymslh,tir-tir,12 +lre22_dev_ynavg,zul-zul,9 +lre22_dev_ynhlk,tir-tir,9 +lre22_dev_ynnkb,eng-ens,10 +lre22_dev_yogkc,fra-ntf,7 +lre22_dev_yokld,eng-ens,4 +lre22_dev_yokve,tir-tir,6 +lre22_dev_yomdz,ara-ayl,6 +lre22_dev_yomuu,xho-xho,12 +lre22_dev_yoobm,ara-ayl,8 +lre22_dev_yoocz,eng-ens,10 +lre22_dev_yopyf,eng-iaf,5 +lre22_dev_yoxoc,tir-tir,8 +lre22_dev_ypaem,afr-afr,5 +lre22_dev_ypamp,afr-afr,7 +lre22_dev_ypjpq,tir-tir,8 +lre22_dev_yplba,ara-arq,9 +lre22_dev_ypnrh,fra-ntf,1 +lre22_dev_ypqfg,eng-ens,7 +lre22_dev_yrdsl,eng-ens,2 +lre22_dev_yrtkv,afr-afr,7 +lre22_dev_yrwrb,nbl-nbl,9 +lre22_dev_ysmlk,eng-ens,11 +lre22_dev_yspja,orm-orm,5 +lre22_dev_ytfnn,fra-ntf,14 +lre22_dev_yturp,ara-aeb,6 +lre22_dev_ytvbd,afr-afr,4 +lre22_dev_yuhvo,tso-tso,8 +lre22_dev_yundi,ara-arq,3 +lre22_dev_yvmnx,ara-arq,10 +lre22_dev_yvqud,xho-xho,15 +lre22_dev_yvxdd,ara-ayl,4 +lre22_dev_ywjtq,xho-xho,5 +lre22_dev_ywnza,fra-ntf,12 +lre22_dev_yxnno,tso-tso,10 +lre22_dev_yxoww,tir-tir,7 +lre22_dev_yxpgi,ara-arq,5 +lre22_dev_yxsta,eng-ens,7 +lre22_dev_yyltz,xho-xho,8 +lre22_dev_yyqqx,fra-ntf,12 +lre22_dev_yzloh,ara-ayl,7 +lre22_dev_zacdy,ara-ayl,3 +lre22_dev_zadkk,tir-tir,9 +lre22_dev_zalpc,afr-afr,6 +lre22_dev_zarod,orm-orm,8 +lre22_dev_zasvb,afr-afr,11 +lre22_dev_zazom,ara-arq,9 +lre22_dev_zbfqk,afr-afr,13 +lre22_dev_zbqew,tso-tso,2 +lre22_dev_zbrkn,eng-ens,7 +lre22_dev_zbubp,zul-zul,9 +lre22_dev_zbytc,ara-arq,8 
+lre22_dev_zcfns,tir-tir,6 +lre22_dev_zcfzk,afr-afr,7 +lre22_dev_zcrgv,ara-arq,10 +lre22_dev_zdxdn,ara-ayl,7 +lre22_dev_zdydi,eng-ens,1 +lre22_dev_zebzq,ven-ven,4 +lre22_dev_zedlk,xho-xho,14 +lre22_dev_zeqpp,tir-tir,12 +lre22_dev_zfjbm,ara-arq,10 +lre22_dev_zfkne,nbl-nbl,13 +lre22_dev_zflnr,ven-ven,13 +lre22_dev_zfoyd,xho-xho,4 +lre22_dev_zgdyu,eng-iaf,8 +lre22_dev_zgmja,zul-zul,9 +lre22_dev_zgvfs,ara-arq,6 +lre22_dev_zhmud,orm-orm,14 +lre22_dev_zhoml,tso-tso,9 +lre22_dev_zijcb,xho-xho,10 +lre22_dev_ziktm,ara-aeb,10 +lre22_dev_zipxy,ara-arq,9 +lre22_dev_ziqxc,eng-iaf,1 +lre22_dev_zjhir,ven-ven,7 +lre22_dev_zjmqp,orm-orm,13 +lre22_dev_zjrrk,tso-tso,11 +lre22_dev_zjtwd,ara-aeb,3 +lre22_dev_zkfcf,xho-xho,6 +lre22_dev_zkftc,nbl-nbl,4 +lre22_dev_zkqei,ara-ayl,7 +lre22_dev_zkwqo,zul-zul,11 +lre22_dev_zlamn,nbl-nbl,6 +lre22_dev_zlbor,xho-xho,14 +lre22_dev_zloet,ven-ven,8 +lre22_dev_zlvhk,zul-zul,5 +lre22_dev_zlzqv,fra-ntf,12 +lre22_dev_zmobq,ara-ayl,7 +lre22_dev_zmuiv,zul-zul,9 +lre22_dev_znvqw,zul-zul,4 +lre22_dev_znzuu,tir-tir,0 +lre22_dev_zoava,eng-iaf,6 +lre22_dev_zodvu,tso-tso,0 +lre22_dev_zosdw,nbl-nbl,15 +lre22_dev_zpnvq,xho-xho,6 +lre22_dev_zqeby,eng-iaf,12 +lre22_dev_zqgdd,nbl-nbl,9 +lre22_dev_zqhaw,nbl-nbl,5 +lre22_dev_zqkau,orm-orm,8 +lre22_dev_zqkel,ara-ayl,9 +lre22_dev_zqlnd,ara-aeb,8 +lre22_dev_zrnpw,orm-orm,8 +lre22_dev_zrqvc,afr-afr,9 +lre22_dev_zrrgq,ven-ven,8 +lre22_dev_zryit,zul-zul,8 +lre22_dev_zsckt,zul-zul,4 +lre22_dev_zucqq,orm-orm,4 +lre22_dev_zusln,orm-orm,11 +lre22_dev_zuxzw,tir-tir,0 +lre22_dev_zvabs,tir-tir,11 +lre22_dev_zvlid,tso-tso,11 +lre22_dev_zvned,eng-iaf,5 +lre22_dev_zvtwr,xho-xho,11 +lre22_dev_zwmim,orm-orm,11 +lre22_dev_zwnsu,ara-arq,8 +lre22_dev_zwtxn,ara-arq,10 +lre22_dev_zxfcm,orm-orm,3 +lre22_dev_zxsgm,tir-tir,5 +lre22_dev_zybya,eng-iaf,10 +lre22_dev_zygak,zul-zul,1 +lre22_dev_zylqc,eng-ens,3 +lre22_dev_zyppc,fra-ntf,8 +lre22_dev_zywem,eng-ens,8 +lre22_dev_zzapx,ara-ayl,5 +lre22_dev_zzumc,ara-arq,2 +lre22_dev_zzvdl,fra-ntf,5 +lre22_dev_zzvjv,nbl-nbl,14
diff --git a/egs/lre22/fixed.v1.8k/resources/lre17_ara-ary/segs_ara-ary.csv b/egs/lre22/fixed.v1.8k/resources/lre17_ara-ary/segs_ara-ary.csv
new file mode 100644
index 00000000..4f5caa4d
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/resources/lre17_ara-ary/segs_ara-ary.csv
@@ -0,0 +1,1306 @@
+id,class_id,logp +20110112_085632_25-a.sph,ara-arq,0.9999215882183581 +20110112_085632_25-b.sph,ara-arq,0.9933264028811798 +20110112_093821_26-a.sph,ara-arq,0.9982419072530201 +20110112_093821_26-b.sph,ara-arq,0.9877989962861538 +20110112_100739_27-a.sph,ara-arq,0.9998601825318931 +20110112_100739_27-b.sph,ara-arq,0.9998461026324816 +20110112_102931_28-a.sph,ara-arq,0.9988240996203235 +20110112_102931_28-b.sph,ara-arq,0.9992945069664346 +20110112_110035_29-a.sph,ara-arq,0.9956050254373241 +20110112_110035_29-b.sph,ara-arq,0.9998179655506749 +20110112_120034_30-a.sph,ara-arq,0.9999956254632509 +20110112_120034_30-b.sph,ara-arq,0.9999961650306969 +20110112_121837_31-a.sph,ara-arq,0.9992484722468692 +20110112_121837_31-b.sph,ara-arq,0.9988521768999281 +20110112_125124_32-a.sph,ara-arq,0.9992562768302394 +20110112_125124_32-b.sph,ara-arq,0.9965659470692162 +20110112_131159_33-a.sph,ara-arq,0.9999484673386585 +20110112_131159_33-b.sph,ara-arq,0.999988595598022 +20110112_135057_34-a.sph,ara-arq,0.9964035382836839 +20110112_140409_35-b.sph,ara-arq,0.9993163776118849 +20110112_143151_37-a.sph,ara-arq,0.9978091561892594 +20110112_144321_38-a.sph,ara-arq,0.9882030389155663 +20110112_151915_40-a.sph,ara-arq,0.9999863814255752
+20110112_151915_40-b.sph,ara-arq,0.9999867872908822 +20110112_164438_41-a.sph,ara-arq,0.9911464286505152 +20110112_164438_41-b.sph,ara-arq,0.9982784286981317 +20110112_170310_42-b.sph,ara-arq,0.9957287722811907 +20110112_174334_44-a.sph,ara-arq,0.9994637389176244 +20110112_174334_44-b.sph,ara-arq,0.9999979913404728 +20110112_175917_46-a.sph,ara-arq,0.999896350765443 +20110112_175917_46-b.sph,ara-arq,0.9992906517833624 +20110112_181316_47-a.sph,ara-arq,0.9865135533386489 +20110112_181316_47-b.sph,ara-arq,0.9799283164010801 +20110112_184303_48-a.sph,ara-arq,0.9999941253436267 +20110112_184303_48-b.sph,ara-arq,0.9894727409462367 +20110112_185018_49-a.sph,ara-arq,0.9993235515178335 +20110112_185018_49-b.sph,ara-arq,0.9964110986149859 +20110112_190919_51-a.sph,ara-arq,0.9975136717392243 +20110112_195355_54-a.sph,ara-arq,0.9994973714549525 +20110112_210716_56-b.sph,ara-arq,0.9999953505624561 +20110113_154325_58-a.sph,ara-arq,0.9999971862717217 +20110113_155707_60-a.sph,ara-aeb,0.999254762801491 +20110113_155707_60-b.sph,ara-arq,0.9999588882350571 +20110113_160907_61-a.sph,ara-arq,0.9999960880781135 +20110113_160907_61-b.sph,ara-arq,0.9999863740819315 +20110113_210803_66-a.sph,ara-arq,0.9973323794741106 +20110113_210803_66-b.sph,ara-arq,0.9989444092443852 +20110114_010743_69-a.sph,ara-arq,0.9999099284602719 +20110114_010743_69-b.sph,ara-arq,0.9763890642555946 +20110114_132253_70-a.sph,ara-arq,0.9999970659217092 +20110114_170901_71-a.sph,ara-arq,0.9999919868727419 +20110114_170901_71-b.sph,ara-arq,0.9854190368540645 +20110114_174847_72-a.sph,ara-arq,0.9999794141191631 +20110114_174847_72-b.sph,ara-arq,0.9999942469709167 +20110115_083054_73-a.sph,ara-arq,0.999835593793634 +20110115_083054_73-b.sph,ara-arq,0.9874468440342952 +20110115_090248_75-a.sph,ara-arq,0.9999406720119036 +20110115_090248_75-b.sph,ara-arq,0.992340537440409 +20110115_093602_76-a.sph,ara-arq,0.999989554911121 +20110115_093602_76-b.sph,ara-arq,0.9999977173414476 +20110115_094928_77-a.sph,ara-arq,0.9978338195939027 +20110115_094928_77-b.sph,ara-arq,0.9999543347173833 +20110115_101940_80-a.sph,ara-arq,0.9999919712722404 +20110115_101940_80-b.sph,ara-arq,0.9999936335315381 +20110115_114622_88-a.sph,ara-arq,0.9994230181724559 +20110115_114622_88-b.sph,ara-arq,0.9987494039544832 +20110115_115414_89-a.sph,ara-arq,0.9997873068050884 +20110115_115414_89-b.sph,ara-arq,0.9996879869582883 +20110115_120333_90-a.sph,ara-arq,0.9999911829644114 +20110115_120333_90-b.sph,ara-aeb,0.9907357013205295 +20110115_154229_93-a.sph,ara-arq,0.9999908976134848 +20110115_160534_94-a.sph,ara-arq,0.9999941336401025 +20110115_160534_94-b.sph,ara-arq,0.9970317402712736 +20110115_170405_96-a.sph,ara-arq,0.9991143532054212 +20110115_170405_96-b.sph,ara-arq,0.9996419863942765 +20110115_172633_97-a.sph,ara-arq,0.983493662520541 +20110115_172633_97-b.sph,ara-arq,0.9999960891934105 +20110115_173918_98-b.sph,ara-arq,0.9899208181081557 +20110116_212111_99-a.sph,ara-arq,0.9998892280312053 +20110118_122005_102-a.sph,ara-arq,0.9999988862765399 +20110118_122005_102-b.sph,ara-arq,0.9909318482173783 +20110118_154651_104-a.sph,ara-arq,0.999987636189934 +20110119_101115_108-a.sph,ara-arq,0.9999649201104261 +20110119_101115_108-b.sph,ara-arq,0.9970793271378511 +20110119_103907_109-a.sph,ara-arq,0.999982064110703 +20110119_103907_109-b.sph,ara-arq,0.9930181910779016 +20110119_123138_110-b.sph,ara-arq,0.9865006019221569 +20110119_130923_111-a.sph,ara-arq,0.997152167078472 +20110119_130923_111-b.sph,ara-arq,0.9981643655714929 +20110119_131501_113-b.sph,ara-arq,0.9999035286024007 
+20110119_162158_114-a.sph,ara-arq,0.9999999535234435 +20110119_162158_114-b.sph,ara-arq,0.9972175314415649 +20110119_164045_115-a.sph,ara-arq,0.999995752838363 +20110119_164045_115-b.sph,ara-arq,0.9997628034439311 +20110119_185412_118-a.sph,ara-arq,0.9998202289146529 +20110119_185412_118-b.sph,ara-arq,0.9984931074535468 +20110119_191933_119-a.sph,ara-arq,0.9998427532239414 +20110119_191933_119-b.sph,ara-arq,0.9999158034849497 +20110120_063303_126-a.sph,ara-aeb,0.9983408495530686 +20110120_065333_127-a.sph,ara-arq,0.9992096176188296 +20110120_065333_127-b.sph,ara-arq,0.9999999917747343 +20110120_103241_131-a.sph,ara-arq,0.9993182345500153 +20110121_133744_171-a.sph,ara-arq,0.9999999215696308 +20110121_135108_172-a.sph,ara-arq,0.9997247255224011 +20110121_135108_172-b.sph,ara-arq,0.9948115916234687 +20110121_150759_174-a.sph,ara-arq,0.9999584736982609 +20110121_205639_189-a.sph,ara-arq,0.9999897466147079 +20110121_205639_189-b.sph,ara-arq,0.9999990343864227 +20110122_102217_196-a.sph,ara-arq,0.980904081808585 +20110122_182307_237-b.sph,ara-arq,0.9999628382800599 +20110122_213252_254-a.sph,ara-aeb,0.9977468847806278 +20110123_065916_259-a.sph,ara-arq,0.9998361236710239 +20110123_065916_259-b.sph,ara-arq,0.9999198626609019 +20110123_082139_260-a.sph,ara-arq,0.9998122368739342 +20110123_091452_261-a.sph,ara-arq,0.9937530884216169 +20110124_150410_307-b.sph,ara-arq,0.9998523760407785 +20110124_160331_310-a.sph,ara-arq,0.9999119580552518 +20110124_160331_310-b.sph,ara-arq,0.9996355765737956 +20110126_231521_427-a.sph,ara-arq,0.9999990527242926 +20110126_231521_427-b.sph,ara-arq,0.999082522282352 +20110126_233137_428-a.sph,ara-arq,0.9999980559906911 +20110126_233137_428-b.sph,ara-arq,0.999549593523283 +20110127_113123_434-a.sph,ara-arq,0.9998900000422434 +20110127_133351_443-a.sph,ara-arq,0.9999924768392251 +20110127_133351_443-b.sph,ara-arq,0.9984673062603949 +20110127_200135_452-a.sph,ara-arq,0.9995916079616751 +20110127_200135_452-b.sph,ara-arq,0.9928940070693326 +20110127_201455_453-a.sph,ara-arq,0.9999998409583539 +20110127_201455_453-b.sph,ara-arq,0.9917325527348603 +20110127_211633_454-a.sph,ara-arq,0.9999252600002078 +20110128_182748_472-a.sph,ara-arq,0.9999937657077407 +20110128_182748_472-b.sph,ara-arq,0.996288694787586 +20110128_185835_473-b.sph,ara-arq,0.9993941153143588 +20110128_193520_475-a.sph,ara-arq,0.9999530106214491 +20110128_193520_475-b.sph,ara-arq,0.9999527486872603 +20110128_200815_476-a.sph,ara-arq,0.9999873059955551 +20110128_200815_476-b.sph,ara-arq,0.9964487496070723 +20110128_203824_477-b.sph,ara-arq,0.9997165845051637 +20110128_222333_480-a.sph,ara-arq,0.9999970783462304 +20110130_080611_510-a.sph,ara-arq,0.9999749215499425 +20110130_080611_510-b.sph,ara-aeb,0.9999966593413755 +20110130_085820_512-a.sph,ara-arq,0.9999636011539246 +20110130_085820_512-b.sph,ara-arq,0.9813498028090423 +20110130_092246_513-b.sph,ara-arq,0.9999059749555838 +20110130_100253_514-a.sph,ara-arq,0.9921278081859116 +20110130_100253_514-b.sph,ara-arq,0.9999516899828312 +20110130_155522_528-a.sph,ara-arq,0.9999069680862643 +20110130_155522_528-b.sph,ara-arq,0.9997900343245884 +20110130_161649_529-a.sph,ara-arq,0.9998613523919572 +20110130_161649_529-b.sph,ara-arq,0.9977029871347945 +20110130_164452_531-a.sph,ara-arq,0.9999976354441193 +20110130_164452_531-b.sph,ara-arq,0.9965312960271767 +20110130_184540_532-a.sph,ara-ayl,0.999999999998713 +20110130_184540_532-b.sph,ara-arq,0.9849336598416535 +20110201_140835_576-a.sph,ara-arq,0.9975300712529358 
+20110201_140835_576-b.sph,ara-arq,0.9999993561949782 +20110201_163316_581-a.sph,ara-arq,0.9999983446810359 +20110201_163316_581-b.sph,ara-aeb,0.9924557962400737 +20110203_191239_616-b.sph,ara-arq,0.9998885155856538 +20110204_153604_625-a.sph,ara-arq,0.9999972105890849 +20110204_153604_625-b.sph,ara-arq,0.9999410840414572 +20110204_163201_626-a.sph,ara-arq,0.9976110910766881 +20110204_163201_626-b.sph,ara-arq,0.9999876934539865 +20110204_164625_627-b.sph,ara-arq,0.9999477802299377 +20110204_171649_628-a.sph,ara-arq,0.999905415195764 +20110204_171649_628-b.sph,ara-arq,0.9999920273859438 +20110204_174823_629-a.sph,ara-arq,0.9999950593350131 +20110204_183311_631-a.sph,ara-arq,0.9999953899932322 +20110204_183311_631-b.sph,ara-arq,0.9999635149153278 +20110204_190013_632-a.sph,ara-arq,0.9999387012014681 +20110204_190013_632-b.sph,ara-arq,0.9999962749405605 +20110204_190208_633-a.sph,ara-arq,0.9999999768635758 +20110204_190208_633-b.sph,ara-arq,0.9977926594648339 +20110204_200618_634-a.sph,ara-arq,0.9936747368214611 +20110204_200618_634-b.sph,ara-arq,0.9999479114428771 +20110204_203655_635-b.sph,ara-arq,0.9996727006128494 +20110204_205300_638-a.sph,ara-arq,0.9992486308165472 +20110205_153631_666-b.sph,ara-arq,0.9996547635561664 +20110205_172120_671-a.sph,ara-arq,0.9999915914855924 +20110206_095118_685-a.sph,ara-arq,0.9999999541857382 +20110206_105102_688-a.sph,ara-arq,0.9999997707236105 +20110206_105102_688-b.sph,ara-arq,0.9993810260864877 +20110206_105820_689-a.sph,ara-arq,0.9987592088484722 +20110206_113326_691-a.sph,ara-arq,0.9998279296292647 +20110206_120354_693-b.sph,ara-arq,0.9980948428595703 +20110206_122113_696-a.sph,ara-arq,0.9995909791238744 +20110206_122113_696-b.sph,ara-arq,0.9999192768200029 +20110206_132644_702-a.sph,ara-arq,0.9999998034776016 +20110206_152016_714-a.sph,ara-arq,0.9999867036857935 +20110206_155159_717-a.sph,ara-arq,0.9998418634479207 +20110206_155159_717-b.sph,ara-arq,0.9888916327994056 +20110206_165119_720-a.sph,ara-arq,0.9999902498906699 +20110206_165119_720-b.sph,ara-arq,0.9920166455204702 +20110206_172320_721-a.sph,ara-arq,0.9997423969682552 +20110206_172320_721-b.sph,ara-arq,0.9804934273026574 +20110206_192709_726-a.sph,ara-arq,0.9840175835722847 +20110206_192709_726-b.sph,ara-arq,0.9977287985909209 +20110206_194621_727-a.sph,ara-arq,0.9831344460618126 +20110208_175519_758-b.sph,ara-arq,0.9999988769528441 +20110209_181948_779-a.sph,ara-arq,0.9936522404471088 +20110209_181948_779-b.sph,ara-arq,0.9997791918710421 +20110209_183724_782-a.sph,ara-arq,0.9998784380878949 +20110209_183724_782-b.sph,ara-arq,0.9999246025038221 +20110210_183402_800-a.sph,ara-arq,0.9984594573811986 +20110210_183402_800-b.sph,ara-arq,0.9980188028033471 +20110210_185230_803-a.sph,ara-arq,0.9999982013809502 +20110210_185230_803-b.sph,ara-arq,0.9995995024241807 +20110211_120852_808-a.sph,ara-arq,0.9932259220823395 +20110211_152026_820-a.sph,ara-arq,0.9999980021493747 +20110211_153702_822-a.sph,ara-arq,0.9969999901106751 +20110211_153702_822-b.sph,ara-arq,0.9997316547372318 +20110211_155607_823-a.sph,ara-arq,0.9999999999999418 +20110211_155607_823-b.sph,ara-arq,0.9998718954393995 +20110212_181444_870-a.sph,ara-arq,0.9989480808880217 +20110212_181444_870-b.sph,ara-arq,0.9999872566336785 +20110212_183328_871-a.sph,ara-arq,0.9903448729423271 +20110212_183328_871-b.sph,ara-arq,0.9810930960909464 +20110212_185203_872-a.sph,ara-arq,0.981273333555111 +20110212_191246_873-b.sph,ara-arq,0.9995332526132436 +20110213_114400_879-a.sph,ara-arq,0.9960907157237656 
+20110213_114400_879-b.sph,ara-arq,0.9857366670372355 +20110213_120949_881-a.sph,ara-arq,0.9844207993983157 +20110213_120949_881-b.sph,ara-arq,0.9903953303779026 +20110213_122816_882-a.sph,ara-arq,0.9891871489623377 +20110213_122816_882-b.sph,ara-arq,0.9999935070606318 +20110213_131054_885-b.sph,ara-arq,0.9999028302287125 +20110213_133818_888-b.sph,ara-arq,0.9970198413986752 +20110213_142146_896-a.sph,ara-arq,0.9997721189705135 +20110213_142146_896-b.sph,ara-arq,0.998800376513235 +20110213_144952_900-a.sph,ara-arq,0.9968885288172212 +20110213_164838_913-b.sph,ara-arq,0.9999808228052904 +20110213_181716_914-a.sph,ara-arq,0.9996267031488214 +20110213_181716_914-b.sph,ara-arq,0.9978692419798673 +20110214_210504_930-a.sph,ara-arq,0.9999999993069508 +20110214_212408_932-b.sph,ara-arq,0.9994783176628016 +20110217_133012_975-b.sph,ara-arq,0.9999978652333108 +20110217_134937_976-b.sph,ara-arq,0.9981620062883075 +20110217_135627_977-a.sph,ara-arq,0.9923823762680786 +20110217_135627_977-b.sph,ara-ayl,0.99994304153697 +20110217_140828_980-b.sph,ara-arq,0.9998865552711499 +20110217_142557_982-a.sph,ara-arq,0.9927005912288147 +20110217_145020_983-a.sph,ara-arq,0.9992976883509107 +20110217_145020_983-b.sph,ara-arq,0.9994242961468159 +20110217_171932_987-a.sph,ara-arq,0.9998300688490686 +20110217_171932_987-b.sph,ara-arq,0.9999938868544527 +20110217_173619_988-a.sph,ara-arq,0.9998859524996734 +20110217_173619_988-b.sph,ara-arq,0.9993538778019107 +20110218_143916_1008-a.sph,ara-arq,0.9995276489555424 +20110218_152219_1012-a.sph,ara-arq,0.9989564931737804 +20110218_152219_1012-b.sph,ara-arq,0.983424400422087 +20110218_154208_1013-a.sph,ara-arq,0.9996610622089865 +20110218_171114_1015-a.sph,ara-arq,0.9983391451448304 +20110218_171114_1015-b.sph,ara-aeb,0.9839049923252243 +20110219_130356_1026-a.sph,ara-arq,0.9800875696973508 +20110219_130356_1026-b.sph,ara-arq,0.9981097426100024 +20110220_153604_1050-b.sph,ara-arq,0.9992922307310227 +20110226_102551_1168-a.sph,ara-arq,0.9956164145717841 +20110226_104245_1169-a.sph,ara-arq,0.999615990129898 +20110226_104245_1169-b.sph,ara-arq,0.9999432762407108 +20110226_105951_1171-b.sph,ara-arq,0.9963320797245594 +20110227_115638_1179-a.sph,ara-arq,0.9979731787181892 +20110227_115638_1179-b.sph,ara-arq,0.9999984412629119 +20110227_123734_1181-a.sph,ara-arq,0.9996205551160018 +20110227_125439_1182-a.sph,ara-arq,0.9996946469526378 +20110227_125439_1182-b.sph,ara-arq,0.9995090769571758 +20110227_131635_1183-a.sph,ara-arq,0.9981612635005762 +20110227_131635_1183-b.sph,ara-arq,0.9999225601135852 +20110227_134655_1184-a.sph,ara-arq,0.9993111982898277 +20110227_140420_1185-b.sph,ara-arq,0.9988598965943818 +20110227_142125_1186-a.sph,ara-arq,0.9948006601325144 +20110227_142125_1186-b.sph,ara-arq,0.9988307215422513 +20110227_154132_1189-a.sph,ara-arq,0.9938418813114719 +20110227_154132_1189-b.sph,ara-arq,0.9999920687652308 +20110227_155909_1191-b.sph,ara-arq,0.998912461185742 +20110227_162241_1192-a.sph,ara-arq,0.9984703552540448 +20110227_162241_1192-b.sph,ara-arq,0.9999895197829509 +20110227_163935_1195-b.sph,ara-arq,0.9949971842748578 +20110228_174826_1217-a.sph,ara-arq,0.9999886137143743 +20110228_174826_1217-b.sph,ara-arq,0.9993742634620741 +20110301_154921_1230-a.sph,ara-arq,0.9997093954036395 +20110301_154921_1230-b.sph,ara-arq,0.9986268296407617 +20110305_101932_1323-a.sph,ara-arq,0.9961912502914853 +20110305_101932_1323-b.sph,ara-arq,0.9992166940246625 +20110305_103655_1326-a.sph,ara-arq,0.9964829278234681 +20110305_175842_1332-a.sph,ara-arq,0.9971054262198007 
+20110305_175842_1332-b.sph,ara-arq,0.9999802993572223 +20110305_181929_1333-b.sph,ara-arq,0.9999661470430923 +20110306_111437_1342-b.sph,ara-arq,0.9997018164726993 +20110306_113229_1344-b.sph,ara-arq,0.9999988318789892 +20110306_115706_1347-b.sph,ara-arq,0.9978191157389156 +20110306_121619_1348-a.sph,ara-arq,0.9827291278544082 +20110306_123404_1350-a.sph,ara-arq,0.9988325513754784 +20110306_155835_1353-b.sph,ara-arq,0.9841830936168241 +20110308_141939_1391-a.sph,ara-arq,0.9927596828605247 +20110309_090633_1407-a.sph,ara-arq,0.9974682774936627 +20110309_090633_1407-b.sph,ara-arq,0.9999472047179431 +20110309_092426_1408-a.sph,ara-arq,0.9891944796228922 +20110309_190600_1415-b.sph,ara-arq,0.9992401337610775 +20110312_100116_1442-b.sph,ara-aeb,0.9999964907283014 +lre11ablk.sph,ara-arq,0.9999907113220694 +lre11aedq.sph,ara-arq,0.9998876721921264 +lre11afar.sph,ara-arq,0.9998482741863788 +lre11aglc.sph,ara-arq,0.9999898456118099 +lre11ahqo.sph,ara-arq,0.9918603244627976 +lre11alas.sph,ara-arq,0.9908901959831016 +lre11alwj.sph,ara-arq,0.9894410455076774 +lre11amzo.sph,ara-arq,0.9786892682080185 +lre11anjz.sph,ara-arq,0.9999981759198692 +lre11aojl.sph,ara-arq,0.9997424488929775 +lre11apsf.sph,ara-arq,0.9849324254750552 +lre11avmm.sph,ara-arq,0.9999504288541092 +lre11axmy.sph,ara-arq,0.9999523003483873 +lre11azex.sph,ara-arq,0.9995398052261274 +lre11bbvj.sph,ara-arq,0.9999335460724018 +lre11bcek.sph,ara-arq,0.9999851926128998 +lre11bcpv.sph,ara-arq,0.984682382200886 +lre11biws.sph,ara-arq,0.9998120788157547 +lre11bnkp.sph,ara-arq,0.9995987125056803 +lre11bnsh.sph,ara-arq,0.9996962089592643 +lre11bnsx.sph,ara-arq,0.9977111633190938 +lre11bpyg.sph,ara-arq,0.9959055192068114 +lre11bpzi.sph,ara-arq,0.9905831011969168 +lre11bqon.sph,ara-arq,0.9969555758000546 +lre11brct.sph,ara-arq,0.9975246452439199 +lre11bsry.sph,ara-arq,0.9894503783769195 +lre11byco.sph,ara-arq,0.9999976971030877 +lre11bzjc.sph,ara-arq,0.999543540290902 +lre11bzlo.sph,ara-arq,0.9999970319421916 +lre11cejv.sph,ara-arq,0.9982811975618141 +lre11cesz.sph,ara-arq,0.998990976426668 +lre11cfgz.sph,ara-arq,0.9935160910053205 +lre11cfwm.sph,ara-arq,0.9999964881312391 +lre11cgay.sph,ara-arq,0.9996390794313366 +lre11cgxl.sph,ara-arq,0.9888137475556131 +lre11cian.sph,ara-arq,0.9999456258260779 +lre11cjxu.sph,ara-arq,0.9999091559138382 +lre11cmat.sph,ara-arq,0.999993807597705 +lre11cmnm.sph,ara-arq,0.9968753878911075 +lre11cpyg.sph,ara-arq,0.99778707361397 +lre11cysx.sph,ara-arq,0.9999531509783556 +lre11czoc.sph,ara-arq,0.9999723971738445 +lre11czzz.sph,ara-arq,0.9967642546011689 +lre11dcxm.sph,ara-arq,0.9988254822426347 +lre11dkdu.sph,ara-arq,0.9995208011254988 +lre11dmgu.sph,ara-arq,0.9999964088803398 +lre11dnsn.sph,ara-arq,0.9999932007597087 +lre11dtba.sph,ara-arq,0.9999993605172935 +lre11dtee.sph,ara-arq,0.9998725513672501 +lre11dtma.sph,ara-arq,0.977093779334459 +lre11dwvy.sph,ara-arq,0.9912064733461909 +lre11dzmv.sph,ara-arq,0.9893557384712441 +lre11edst.sph,ara-arq,0.9996483218194355 +lre11efjk.sph,ara-arq,0.9997175005082578 +lre11eiyw.sph,ara-arq,0.9876725818861913 +lre11ekip.sph,ara-arq,0.9996941527919115 +lre11eohx.sph,ara-arq,0.9999635932561415 +lre11erez.sph,ara-arq,0.9965516335686703 +lre11erxq.sph,ara-arq,0.9996087806099485 +lre11erxr.sph,ara-arq,0.9775705269985646 +lre11eufb.sph,ara-arq,0.9843849296538485 +lre11fagv.sph,ara-arq,0.9998946883634996 +lre11fbda.sph,ara-arq,0.9978295206364557 +lre11fcjj.sph,ara-ayl,0.993383627457564 +lre11fkvi.sph,ara-arq,0.9995855863185078 +lre11fodl.sph,ara-aeb,0.9926903655566781 
+lre11fqsk.sph,ara-arq,0.9999977131862029 +lre11ftfz.sph,ara-arq,0.9992187521546018 +lre11fvvi.sph,ara-arq,0.9998858777480336 +lre11fwev.sph,ara-arq,0.9895982019894587 +lre11fwgy.sph,ara-arq,0.9999585016943806 +lre11fxxe.sph,ara-arq,0.9999324150603885 +lre11fyfu.sph,ara-arq,0.9998347600563473 +lre11fyul.sph,ara-arq,0.9936234560510464 +lre11fzut.sph,ara-arq,0.9842511617654447 +lre11gdzy.sph,ara-arq,0.999889190713416 +lre11gezd.sph,ara-arq,0.9998699060210541 +lre11gfzz.sph,ara-arq,0.9999891185814981 +lre11ggoj.sph,ara-arq,0.9939957237418172 +lre11ggpo.sph,ara-arq,0.9856255396828421 +lre11goba.sph,ara-arq,0.990344531513245 +lre11gobo.sph,ara-arq,0.9999875942826566 +lre11grhs.sph,ara-arq,0.9997556743150159 +lre11grvu.sph,ara-arq,0.9999968541025068 +lre11gugo.sph,ara-arq,0.9965937007797033 +lre11hfea.sph,ara-arq,0.9998608609896967 +lre11hhql.sph,ara-arq,0.9998338106542838 +lre11hnhd.sph,ara-arq,0.9772497743518446 +lre11honr.sph,ara-arq,0.9998932196048694 +lre11hqam.sph,ara-arq,0.9981273699564014 +lre11hqxf.sph,ara-arq,0.9994436545087109 +lre11hxhj.sph,ara-arq,0.9999969785031192 +lre11iape.sph,ara-arq,0.9805984363079878 +lre11ibqb.sph,ara-arq,0.9818806715807676 +lre11ijgj.sph,ara-arq,0.9990861345342709 +lre11ilih.sph,ara-arq,0.9999502485361578 +lre11imki.sph,ara-ayl,0.9999999999342657 +lre11iqwq.sph,ara-arq,0.9989577296816973 +lre11irup.sph,ara-arq,0.9987522767488314 +lre11itok.sph,ara-arq,0.9999898498294778 +lre11ivvj.sph,ara-arq,0.9989286012303883 +lre11ixke.sph,ara-arq,0.9999500945534188 +lre11jepu.sph,ara-arq,0.9972127532313085 +lre11jfrt.sph,ara-arq,0.999937410524841 +lre11jgdm.sph,ara-arq,0.9999313841413808 +lre11jgex.sph,ara-arq,0.9996981300593533 +lre11jjzk.sph,ara-arq,0.9969824384056464 +lre11jkcg.sph,ara-arq,0.9999901763442688 +lre11jlbb.sph,ara-ayl,0.9999797418290254 +lre11jmkp.sph,ara-aeb,0.9829506107982495 +lre11jnbo.sph,ara-arq,0.9955088149420692 +lre11joqm.sph,ara-arq,0.9979145377503337 +lre11jpnp.sph,ara-arq,0.9820469567638314 +lre11jqay.sph,ara-arq,0.9999059550640546 +lre11jsem.sph,ara-arq,0.9999659015380002 +lre11jtsu.sph,ara-arq,0.9999985874275487 +lre11jxjq.sph,ara-arq,0.9996252775252656 +lre11kcxw.sph,ara-arq,0.9999980198875125 +lre11kghl.sph,ara-arq,0.9999969246150942 +lre11khai.sph,ara-arq,0.9999850849749775 +lre11khpn.sph,ara-arq,0.9996319733879265 +lre11kizl.sph,ara-arq,0.9946944777387341 +lre11kjhr.sph,ara-arq,0.9999999478107263 +lre11kkvl.sph,ara-arq,0.9999331725841073 +lre11kmrd.sph,ara-arq,0.998407471871956 +lre11kvlp.sph,ara-arq,0.9881647482815468 +lre11kweb.sph,ara-arq,0.9959584099323461 +lre11laah.sph,ara-arq,0.9999994459184748 +lre11laym.sph,ara-arq,0.9996946633832319 +lre11lcve.sph,ara-ayl,0.9754071685038981 +lre11lgwf.sph,ara-arq,0.999998185212916 +lre11lkfn.sph,ara-arq,0.9871985523328064 +lre11lljy.sph,ara-arq,0.9999951992577 +lre11ltbl.sph,ara-arq,0.9925424060658127 +lre11lvrc.sph,ara-arq,0.999994692352155 +lre11lxeu.sph,ara-arq,0.9998494543892836 +lre11mciy.sph,ara-arq,0.9991649802638065 +lre11mcpb.sph,ara-arq,0.9998812879498469 +lre11mdaq.sph,ara-arq,0.9999993536606017 +lre11mdlw.sph,ara-arq,0.9999933631714529 +lre11megz.sph,ara-arq,0.999991578971874 +lre11mgcj.sph,ara-arq,0.9997465534552311 +lre11mgda.sph,ara-arq,0.9998675569243112 +lre11mhts.sph,ara-arq,0.9993162979080465 +lre11mimu.sph,ara-arq,0.9853693267657061 +lre11mmil.sph,ara-arq,0.9998013743994384 +lre11moic.sph,ara-arq,0.9999975487340111 +lre11mpyg.sph,ara-arq,0.9999918690235887 +lre11mrgx.sph,ara-arq,0.9916396537114318 +lre11muco.sph,ara-arq,0.9990051655062514 
+lre11myev.sph,ara-arq,0.9999999709774632 +lre11mzsf.sph,ara-arq,0.9814633660769434 +lre11nfrs.sph,ara-arq,0.9993363405655639 +lre11nhol.sph,ara-arq,0.9997583313854068 +lre11nhpm.sph,ara-arq,0.9999766347438241 +lre11nisq.sph,ara-arq,0.9999493082338785 +lre11njwd.sph,ara-arq,0.9999968670068229 +lre11nlof.sph,ara-arq,0.997025558783378 +lre11nsiw.sph,ara-arq,0.9759841774215573 +lre11ntcf.sph,ara-arq,0.9777836287483279 +lre11nted.sph,ara-arq,0.9998141941637515 +lre11nvno.sph,ara-arq,0.9999964888490706 +lre11oavt.sph,ara-arq,0.9965200220599933 +lre11ocsv.sph,ara-arq,0.9999827956617842 +lre11oege.sph,ara-arq,0.9950563664377374 +lre11ofei.sph,ara-arq,0.9998876820742316 +lre11ohag.sph,ara-arq,0.9985768786597264 +lre11oije.sph,ara-arq,0.992072008446929 +lre11ojgd.sph,ara-arq,0.9940659470468965 +lre11ojvf.sph,ara-arq,0.9993122001400614 +lre11okxt.sph,ara-arq,0.9994222299446635 +lre11omni.sph,ara-arq,0.9998835501527243 +lre11onrg.sph,ara-arq,0.9916365803164747 +lre11ontl.sph,ara-arq,0.9995751330000039 +lre11opue.sph,ara-arq,0.9997738097677723 +lre11oqro.sph,ara-arq,0.999967793829243 +lre11otxd.sph,ara-arq,0.9999418179160698 +lre11ouii.sph,ara-arq,0.9998576517513276 +lre11ovwf.sph,ara-arq,0.9889696901091004 +lre11oydk.sph,ara-arq,0.9819224710181479 +lre11ozdn.sph,ara-arq,0.9998106016178588 +lre11pagq.sph,ara-arq,0.9801218480437873 +lre11paur.sph,ara-arq,0.999987431916008 +lre11pfti.sph,ara-arq,0.9863893498230394 +lre11pfzy.sph,ara-arq,0.9999851535829934 +lre11pprb.sph,ara-arq,0.9907367260657962 +lre11pqno.sph,ara-arq,0.9999947908245772 +lre11pvoj.sph,ara-arq,0.9999986601480322 +lre11pysj.sph,ara-arq,0.9818927362425611 +lre11pzsc.sph,ara-arq,0.9777996051185309 +lre11qaml.sph,ara-arq,0.9901897881820463 +lre11qcse.sph,ara-arq,0.9786650402081483 +lre11qhrk.sph,ara-arq,0.99996447871608 +lre11qilb.sph,ara-arq,0.9999944023407891 +lre11qjbu.sph,ara-arq,0.999998909993637 +lre11qoxa.sph,ara-arq,0.9992628389476516 +lre11qpqk.sph,ara-arq,0.9988889605651897 +lre11qrlt.sph,ara-arq,0.9993564789200029 +lre11qtkd.sph,ara-arq,0.9964050771568211 +lre11qupc.sph,ara-arq,0.9758848861520171 +lre11qwil.sph,ara-arq,0.9999982738573114 +lre11qwqs.sph,ara-arq,0.9998351735862747 +lre11rafm.sph,ara-arq,0.9999610086553676 +lre11rdod.sph,ara-arq,0.9883226566986822 +lre11rdqv.sph,ara-ayl,0.985738149564001 +lre11relv.sph,ara-arq,0.999999468502387 +lre11rjui.sph,ara-arq,0.9999924251546126 +lre11rkhs.sph,ara-arq,0.9956440478348277 +lre11rldx.sph,ara-arq,0.9989857178552667 +lre11rwqr.sph,ara-arq,0.9999995661106464 +lre11sgia.sph,ara-arq,0.9792334621995509 +lre11skba.sph,ara-arq,0.9971597662211976 +lre11smpy.sph,ara-arq,0.9999299231162642 +lre11snqm.sph,ara-arq,0.9998770993281684 +lre11snzs.sph,ara-arq,0.9998957560021872 +lre11svhq.sph,ara-arq,0.9926690106361062 +lre11sxdk.sph,ara-arq,0.9999894072598812 +lre11szjx.sph,ara-arq,0.9997496078258093 +lre11tisp.sph,ara-aeb,0.9935967845696344 +lre11tkeq.sph,ara-arq,0.999992668852024 +lre11tkgv.sph,ara-arq,0.9999634239721431 +lre11tlbn.sph,ara-arq,0.9916960032980205 +lre11tlgc.sph,ara-arq,0.9921670149117343 +lre11tnbu.sph,ara-arq,0.9999917662026707 +lre11tqjp.sph,ara-arq,0.9999997186207273 +lre11trmj.sph,ara-aeb,0.9985641023002019 +lre11txsn.sph,ara-arq,0.9895624098081941 +lre11ubjy.sph,ara-arq,0.9991221016840601 +lre11ubmu.sph,ara-arq,0.9999965435512681 +lre11uhux.sph,ara-arq,0.997435675415044 +lre11ujqi.sph,ara-arq,0.9939996473996353 +lre11ullo.sph,ara-arq,0.9999999998276814 +lre11umdt.sph,ara-arq,0.9962758965298869 +lre11unmt.sph,ara-arq,0.9985618690998576 
+lre11uqzm.sph,ara-arq,0.9985040805093104 +lre11urlw.sph,ara-arq,0.9998924522602656 +lre11usmv.sph,ara-arq,0.9997207805439943 +lre11uvte.sph,ara-arq,0.9983265140452946 +lre11uwxi.sph,ara-arq,0.9993165982905879 +lre11vcwy.sph,ara-arq,0.9969565461227344 +lre11veuu.sph,ara-arq,0.9999796896377858 +lre11vezt.sph,ara-arq,0.9767680653788202 +lre11vfna.sph,ara-aeb,0.9964120446674009 +lre11vhhz.sph,ara-arq,0.998103902690531 +lre11vhvh.sph,ara-arq,0.9999927635072146 +lre11vjcl.sph,ara-arq,0.9999819610209169 +lre11vkma.sph,ara-arq,0.9945794427407135 +lre11vncd.sph,ara-arq,0.9907523248594148 +lre11vrrg.sph,ara-arq,0.99460105641934 +lre11vsry.sph,ara-arq,0.9951631752607728 +lre11vssm.sph,ara-arq,0.9804358152668605 +lre11vukq.sph,ara-arq,0.998095638681888 +lre11vwzy.sph,ara-arq,0.9999994453927953 +lre11vxev.sph,ara-arq,0.9986174583419248 +lre11vyma.sph,ara-arq,0.9935618499200927 +lre11vzdv.sph,ara-arq,0.9940242404482954 +lre11wjmo.sph,ara-arq,0.9946379138594132 +lre11wlmf.sph,ara-arq,0.9985332278711876 +lre11wogz.sph,ara-arq,0.9999996831958213 +lre11wpeu.sph,ara-arq,0.987053003738009 +lre11xesf.sph,ara-arq,0.9926552158707163 +lre11xlhq.sph,ara-arq,0.9861699702078971 +lre11xmop.sph,ara-arq,0.9998106693232437 +lre11xncb.sph,ara-arq,0.9997518363731595 +lre11xsib.sph,ara-arq,0.9999999575778398 +lre11yfkq.sph,ara-arq,0.9978416409757853 +lre11yfuh.sph,ara-arq,0.9852732209813364 +lre11yjtr.sph,ara-arq,0.9999875286020034 +lre11ykqy.sph,ara-arq,0.999994599228917 +lre11ynky.sph,ara-arq,0.9870374231633215 +lre11ynub.sph,ara-arq,0.9975464312675795 +lre11ynut.sph,ara-arq,0.994865816956002 +lre11yohv.sph,ara-arq,0.9999860886998846 +lre11ypuu.sph,ara-arq,0.999939937136318 +lre11yqmg.sph,ara-arq,0.9988753925216532 +lre11yskr.sph,ara-arq,0.9990482482843873 +lre11yysp.sph,ara-arq,0.9909036055745113 +lre11zaix.sph,ara-arq,0.9993370825297897 +lre11zcny.sph,ara-arq,0.9863060880274914 +lre11zgmi.sph,ara-arq,0.9852780607073358 +lre11znqr.sph,ara-arq,0.999970678330779 +lre11zosk.sph,ara-aeb,0.9992041227652806 +lre11zsfl.sph,ara-arq,0.9999316377930149 +lre11zvte.sph,ara-arq,0.9994693690287269 +lre11zwzv.sph,ara-arq,0.998921262378038 +lre11zxvd.sph,ara-arq,0.9970232759937795 +lre11zzww.sph,ara-arq,0.9968858579018414 +lre17_abtnjqwo.sph,ara-arq,0.999946799291851 +lre17_acckmchx.sph,ara-aeb,0.9986711227355097 +lre17_acoxtkfz.sph,ara-aeb,0.9842666785196265 +lre17_adharrss.flac,ara-arq,0.9947182189896948 +lre17_adharrss.flac-gsm,ara-arq,0.9956384390235652 +lre17_aduhvtel.sph,ara-arq,0.9999889349109844 +lre17_aekqbfnc.sph,ara-arq,0.9984883022389972 +lre17_afpeboji.sph,ara-arq,0.9999593134555593 +lre17_agovhiqf.sph,ara-arq,0.9980746218401729 +lre17_aiesnqgx.sph,ara-arq,0.9999860691612521 +lre17_aipdwmxb.sph,ara-aeb,0.9989779255268202 +lre17_anlfomhj.flac,ara-ayl,0.9994229196677115 +lre17_anocqhav.sph,ara-arq,0.9997793277424729 +lre17_ansqkmxg.sph,ara-arq,0.9990411219484128 +lre17_aokadzsc.sph,ara-arq,0.9999717598295709 +lre17_aqfwvqpg.sph,ara-arq,0.9999999815159134 +lre17_aquebikd.sph,ara-arq,0.9999983875736405 +lre17_arzwedtw.sph,ara-arq,0.9945282050060794 +lre17_asbkyxts.sph,ara-arq,0.9902930286287294 +lre17_astrrnby.flac,ara-arq,0.9755268847782662 +lre17_astrrnby.flac-g711a,ara-arq,0.9854914061367727 +lre17_avfgalrs.flac-gsm,ara-arq,0.9999972882283532 +lre17_avrkkwph.sph,ara-arq,0.9988768871380727 +lre17_axiutoza.sph,ara-arq,0.9997779954741397 +lre17_ayizvbkc.sph,ara-arq,0.9999840812557923 +lre17_azjbrozk.sph,ara-arq,0.9953169139803245 +lre17_bdpgpxku.flac-g711a,ara-ayl,0.9999977914337187 
+lre17_beeeutoh.flac,ara-arq,0.9942343106320238 +lre17_beeeutoh.flac-g723_1,ara-arq,0.9974204443646679 +lre17_bfaxqjqb.sph,ara-arq,0.999936007945235 +lre17_bfdjopui.sph,ara-aeb,0.9781197289083158 +lre17_bjfsfjit.flac,ara-arq,0.9879397124065223 +lre17_bjkkkuno.sph,ara-arq,0.999487026079353 +lre17_bjoiupem.sph,ara-arq,0.9880927563645315 +lre17_bjzozier.sph,ara-aeb,0.9997062951168889 +lre17_bkcpyhve.sph,ara-arq,0.9993564259542776 +lre17_bkjcaggk.flac,ara-arq,0.9994997776308957 +lre17_bkjcaggk.flac-g711mu,ara-arq,0.9993395177228688 +lre17_bktxvmar.sph,ara-arq,0.9999917820037525 +lre17_blazxkfa.sph,ara-arq,0.9999639560062823 +lre17_blbxhpiv.sph,ara-arq,0.9949809784805863 +lre17_blljbkpf.sph,ara-arq,0.9934970373407573 +lre17_bmujmfhj.sph,ara-arq,0.9775708820251491 +lre17_boryyjhf.sph,ara-arq,0.9977484576765338 +lre17_bowjoyjr.sph,ara-arq,0.9816765571144581 +lre17_bqxjnfxx.sph,ara-aeb,0.9929235091866946 +lre17_bqyznxui.flac-g711a,ara-arq,0.9999829540600199 +lre17_brpyutxm.sph,ara-aeb,0.9832823165542701 +lre17_bswfxzyr.sph,ara-arq,0.9999980843040208 +lre17_btafdkdg.flac,ara-arq,0.9992005312956213 +lre17_btafdkdg.flac-gsm,ara-arq,0.9992352599146054 +lre17_buwtqeqb.flac,ara-arq,0.987792729025184 +lre17_buwtqeqb.flac-g726,ara-arq,0.999145758855682 +lre17_bvqgsidl.sph,ara-aeb,0.9862447101786331 +lre17_bwxfqusr.sph,ara-aeb,0.9936207466860856 +lre17_bxwvpnfw.flac-opus,ara-arq,0.9999995104818462 +lre17_bymvcgmj.sph,ara-aeb,0.9997234164043465 +lre17_byzcayjn.flac,ara-arq,0.998985645903786 +lre17_byzcayjn.flac-opus,ara-arq,0.9999450148660525 +lre17_bzmjxehu.flac-opus,ara-arq,0.9999664531787248 +lre17_cairjuvk.sph,ara-arq,0.9987867029788313 +lre17_campowcv.sph,ara-arq,0.999787205693806 +lre17_ccazlpob.flac-g723_1,ara-arq,0.9974334229828895 +lre17_cccspkdm.flac,ara-arq,0.9880424513373823 +lre17_cccspkdm.flac-g726,ara-arq,0.9954627517627038 +lre17_ccjrvsph.sph,ara-arq,0.9999054835372087 +lre17_ccoewvvh.sph,ara-arq,0.9893765175449871 +lre17_ccypdbbu.sph,ara-arq,0.9999252890248448 +lre17_cdavzdsz.flac-g722,ara-arq,0.9997403404294325 +lre17_cflgybxg.sph,ara-arq,0.9940658569786607 +lre17_cfnizhql.sph,ara-arq,0.9961060724590401 +lre17_cfwfsjev.sph,ara-arq,0.9999998965888017 +lre17_cfznpgjd.sph,ara-arq,0.9971102748785 +lre17_cgmytvfk.sph,ara-aeb,0.9997875043473322 +lre17_cipisqbs.flac-g723_1,ara-arq,0.999066551053694 +lre17_cluexwgz.sph,ara-arq,0.9998680970232108 +lre17_cnfohesd.flac,ara-arq,0.9999887142160848 +lre17_cnfohesd.flac-g723_1,ara-arq,0.9995963322893509 +lre17_cphdyjdq.sph,ara-arq,0.9906215457248232 +lre17_csenuaki.sph,ara-arq,0.9999827796312072 +lre17_ctilbvnd.sph,ara-arq,0.9941490810957968 +lre17_ctrojttf.sph,ara-arq,0.9997415475987689 +lre17_ctudkyri.sph,ara-arq,0.999816925839242 +lre17_cupizrsx.sph,ara-arq,0.9977854992043466 +lre17_cureptst.flac,ara-arq,0.9966603829855006 +lre17_cureptst.flac-g726,ara-arq,0.9981724402252548 +lre17_cuvtxdbp.sph,ara-arq,0.9992149913711046 +lre17_cvdmebty.sph,ara-arq,0.9986034345626393 +lre17_cwbzqjzi.sph,ara-arq,0.9947360860803344 +lre17_cwdccgrs.sph,ara-arq,0.9954471876114118 +lre17_cwlcovrq.flac,ara-arq,0.9836373295795586 +lre17_cwlcovrq.flac-g726,ara-arq,0.9772066393285307 +lre17_cxfmtvjk.flac,ara-arq,0.998626631969006 +lre17_cxfmtvjk.flac-gsm,ara-arq,0.9995994571705398 +lre17_czdgssvb.sph,ara-arq,0.9839800680795405 +lre17_daifemlo.flac,ara-arq,0.9999923124695197 +lre17_daifemlo.flac-opus,ara-arq,0.9984999152656753 +lre17_dbwacwxo.sph,ara-aeb,0.9959928678878073 +lre17_dctjgdcf.sph,ara-arq,0.9999693971426666 +lre17_degmucpq.flac,ara-arq,0.9998238238013132 
+lre17_degmucpq.flac-gsm,ara-arq,0.9997379324783554 +lre17_dfotbhmi.sph,ara-arq,0.9999967791115594 +lre17_dhsngizg.flac,ara-arq,0.9982745689022119 +lre17_dhsngizg.flac-g722,ara-arq,0.9899076755215684 +lre17_dhttmloy.flac,ara-aeb,0.9965398617526191 +lre17_dhttmloy.flac-g726,ara-arq,0.9999896912229259 +lre17_dimkfdga.sph,ara-aeb,0.9968909392210388 +lre17_dkorjmpr.sph,ara-arq,0.9993720459297395 +lre17_dlkdkiml.sph,ara-arq,0.9998570457972521 +lre17_dmptasts.sph,ara-arq,0.9999900227916614 +lre17_dmxpkcsa.sph,ara-aeb,0.9886627341769952 +lre17_dqynyyeg.flac-gsm,ara-arq,0.9885091570772832 +lre17_dreturny.sph,ara-arq,0.9999666233198918 +lre17_drvwwpat.sph,ara-arq,0.9986475256791328 +lre17_dsyovtja.sph,ara-arq,0.9993324032717326 +lre17_dtfklpze.flac,ara-arq,0.980875299084855 +lre17_duwzoctt.sph,ara-arq,0.9944810760803787 +lre17_dxcmnnvm.sph,ara-arq,0.998013143951749 +lre17_dyhzanuz.flac,ara-arq,0.9990022950435253 +lre17_dzdfoalc.flac-g711mu,ara-arq,0.9992312655669962 +lre17_dzpjlevc.sph,ara-arq,0.9999957026648084 +lre17_ecoxuoxn.sph,ara-arq,0.9991002623934314 +lre17_ecphppxx.sph,ara-arq,0.9999618844845741 +lre17_ecwsvpey.sph,ara-arq,0.9976190788174759 +lre17_edrerhyd.flac,ara-ayl,0.9999948782776695 +lre17_edrerhyd.flac-g726,ara-ayl,0.999989105843972 +lre17_efuktxso.sph,ara-arq,0.9941980497799132 +lre17_ehubbeoo.sph,ara-arq,0.9996933627528989 +lre17_ejqromcl.sph,ara-arq,0.9925673764218849 +lre17_ekecvked.sph,ara-arq,0.9994069294232564 +lre17_eknvksdj.sph,ara-arq,0.9999954297332013 +lre17_elihcnoy.sph,ara-arq,0.9999453376193964 +lre17_elkbffyz.sph,ara-aeb,0.9994985958843845 +lre17_elyyerit.sph,ara-arq,0.9969900208425778 +lre17_emvsmkok.sph,ara-arq,0.9999992133320064 +lre17_enrykydq.flac,ara-arq,0.9997480550009473 +lre17_enrykydq.flac-g726,ara-ayl,0.9999971853826948 +lre17_eqodnzbt.flac,ara-arq,0.9975299669768147 +lre17_eqzepcqb.sph,ara-arq,0.9999815043421167 +lre17_erwwlbkn.sph,ara-arq,0.997330169701168 +lre17_esimguhv.flac-g723_1,ara-arq,0.9990357824171696 +lre17_ewmwbivr.flac,ara-aeb,0.9941407951069882 +lre17_ewsegeoy.sph,ara-aeb,0.9999996013414438 +lre17_extnxkey.sph,ara-arq,0.9899611206184787 +lre17_fgvuyqrc.sph,ara-arq,0.9999740198189395 +lre17_fheeozab.sph,ara-arq,0.9999998531340115 +lre17_fhjonuvo.flac,ara-arq,0.9912514441770434 +lre17_fhjonuvo.flac-gsm,ara-arq,0.9986942437203434 +lre17_fhobhhji.flac-g711mu,ara-arq,0.9999982807618869 +lre17_fhqkrhdc.sph,ara-aeb,0.9974882737208554 +lre17_fjeaknag.sph,ara-arq,0.9999528433654035 +lre17_flghevgj.sph,ara-arq,0.9999275347920555 +lre17_flllshvw.sph,ara-arq,0.9995890566887757 +lre17_fmaaifty.sph,ara-arq,0.9967330411235821 +lre17_fmyxmvuh.sph,ara-arq,0.9947613058963332 +lre17_fobsmsvj.sph,ara-arq,0.9985126437061927 +lre17_fosfumyj.flac,ara-arq,0.9981175606728612 +lre17_fosfumyj.flac-opus,ara-arq,0.9973412918879427 +lre17_fovgucqc.flac-gsm,ara-arq,0.9902063770771822 +lre17_fpzybapz.flac,ara-arq,0.9999831117898691 +lre17_fpzybapz.flac-g722,ara-arq,0.9999745011543022 +lre17_frfvxgkm.flac,ara-arq,0.9982105807022026 +lre17_frfvxgkm.flac-g723_1,ara-arq,0.9999990437299212 +lre17_frldxzov.flac,ara-arq,0.9999911187378006 +lre17_frnemphs.sph,ara-aeb,0.9999309467267882 +lre17_frrujsta.sph,ara-aeb,0.9827694350674886 +lre17_fsibsssn.flac,ara-arq,0.9967716611519729 +lre17_fsibsssn.flac-opus,ara-arq,0.9943914629735336 +lre17_fstjhoom.sph,ara-arq,0.9999560494459958 +lre17_fuelrqpq.sph,ara-arq,0.998314155825479 +lre17_fwyhddxz.sph,ara-arq,0.9999653658276243 +lre17_fxhpiabv.flac-g722,ara-arq,0.9785533261819718 +lre17_fyoimwzn.sph,ara-aeb,0.9933070038389972 
+lre17_fyousbwl.sph,ara-arq,0.9997738038053198 +lre17_fzetpzrs.sph,ara-arq,0.9845858022736108 +lre17_gbdwksrl.flac-opus,ara-ayl,0.9999810055915502 +lre17_gbkeixqy.sph,ara-arq,0.9995010489207078 +lre17_gbmrfptf.sph,ara-arq,0.9995997838188411 +lre17_gcwvbecw.flac,ara-arq,0.9999989525506976 +lre17_gcwvbecw.flac-g726,ara-arq,0.9999995241973817 +lre17_gekpnsqw.flac,ara-arq,0.9995617602232915 +lre17_gekpnsqw.flac-g711a,ara-arq,0.9990205101656683 +lre17_gfmhcimo.flac,ara-arq,0.9843261830443644 +lre17_gfmhcimo.flac-g711a,ara-arq,0.9920939572460264 +lre17_giljetfl.sph,ara-arq,0.9998866157683133 +lre17_givvturo.flac,ara-arq,0.9999960772188857 +lre17_givvturo.flac-g722,ara-arq,0.9998983053609016 +lre17_gkfwivzq.sph,ara-arq,0.9980134657798864 +lre17_gokkodsj.flac-g722,ara-ayl,0.9962500403266442 +lre17_gpvtlzov.flac-g711a,ara-arq,0.9999996204042616 +lre17_gqcxwuze.sph,ara-arq,0.9997783053110009 +lre17_gqpcfrwm.flac-g711mu,ara-arq,0.9817550583044142 +lre17_grjpzakf.sph,ara-arq,0.9839396690676935 +lre17_grjzqftr.sph,ara-arq,0.9877772556918923 +lre17_gszgcsjf.sph,ara-arq,0.998185259970527 +lre17_gvcqvsap.sph,ara-ayl,0.9997241868465031 +lre17_gxvmjddr.sph,ara-arq,0.9986899594684224 +lre17_hbopaybj.flac,ara-arq,0.987858946064221 +lre17_hbopaybj.flac-g726,ara-arq,0.999994436902088 +lre17_hchvsbqr.sph,ara-arq,0.999906917330984 +lre17_hdofrwsf.sph,ara-arq,0.9933958450004624 +lre17_heemkdqp.flac,ara-arq,0.9999909671052553 +lre17_heemkdqp.flac-g711mu,ara-arq,0.9985133817101537 +lre17_hezbzaqo.flac,ara-arq,0.9850199928962854 +lre17_hezbzaqo.flac-opus,ara-arq,0.9999841295369671 +lre17_hfcpmeoa.flac,ara-arq,0.9947181969213107 +lre17_hfcpmeoa.flac-g711a,ara-arq,0.9999138159106336 +lre17_hfjennzi.sph,ara-aeb,0.9983301362771589 +lre17_hhbqfxfc.sph,ara-aeb,0.9906856058776015 +lre17_hhdplflf.sph,ara-arq,0.9999949491011441 +lre17_hjimhzob.sph,ara-arq,0.9983113233299764 +lre17_hkeqbypc.flac,ara-arq,0.999999661618148 +lre17_hkeqbypc.flac-gsm,ara-arq,0.9966513627962669 +lre17_hlegmknx.sph,ara-arq,0.9999782289720263 +lre17_hmmdberw.sph,ara-aeb,0.9998194744091253 +lre17_hmptzweu.sph,ara-arq,0.9999971779992906 +lre17_hmqodybe.sph,ara-ayl,0.999996399948908 +lre17_hqrhzhyj.sph,ara-arq,0.9999231926652757 +lre17_hqzkhrhn.sph,ara-arq,0.9998770075415304 +lre17_hromittp.flac-g711a,ara-arq,0.9873451303247496 +lre17_hsdzydln.flac,ara-arq,0.9821628698106489 +lre17_hsdzydln.flac-g722,ara-arq,0.9988122191294789 +lre17_hsyuvhtp.sph,ara-arq,0.9990478816052286 +lre17_hvweyrfw.sph,ara-aeb,0.9988668377871749 +lre17_hwnjyblc.sph,ara-arq,0.9999249104513325 +lre17_hxpvwduf.flac-g711a,ara-aeb,0.9938488854312174 +lre17_hyhwjuli.sph,ara-arq,0.998858421685253 +lre17_hyreqvpy.flac,ara-arq,0.9878561156668769 +lre17_hyreqvpy.flac-g711mu,ara-arq,0.9999928431157828 +lre17_ibclsyjb.sph,ara-aeb,0.9846988495735338 +lre17_ifdrxwfj.sph,ara-arq,0.9988623308711881 +lre17_igayvnul.sph,ara-arq,0.9858583197264382 +lre17_igvjetcy.sph,ara-arq,0.9997565397210374 +lre17_igvlwujq.sph,ara-aeb,0.9942243168589683 +lre17_iibcchiq.flac-gsm,ara-arq,0.9989051845669153 +lre17_ilmlmyvv.sph,ara-aeb,0.9879290883225061 +lre17_inhzmrxh.sph,ara-arq,0.9999602544207984 +lre17_inufxzrc.sph,ara-arq,0.9997778215419035 +lre17_iqtqtuvc.flac,ara-arq,0.999987834966952 +lre17_iqtqtuvc.flac-opus,ara-arq,0.9817938892370449 +lre17_itjgcxig.sph,ara-arq,0.999561265994042 +lre17_itsqwgkz.sph,ara-arq,0.9999999957865953 +lre17_ittvvvfb.sph,ara-arq,0.9999964209775712 +lre17_ivcdeiky.flac,ara-arq,0.9873438502201111 +lre17_iwtlmazd.sph,ara-arq,0.9873719419778358 
+lre17_ixbvjxte.sph,ara-arq,0.9997976143150719 +lre17_iycttrsq.sph,ara-arq,0.987846742780538 +lre17_iyqnjpod.sph,ara-arq,0.9936664779953471 +lre17_izhxudfa.sph,ara-arq,0.9999249686091597 +lre17_javisjpg.sph,ara-arq,0.9933263960275387 +lre17_jclfqqom.sph,ara-arq,0.9996552571484193 +lre17_jcperagi.sph,ara-arq,0.9999535310829344 +lre17_jcueuvkk.sph,ara-arq,0.9998819304923648 +lre17_jgqtrgqt.sph,ara-arq,0.9999823610331084 +lre17_jgzyarns.sph,ara-arq,0.9999898713367306 +lre17_jhjgasxv.sph,ara-aeb,0.9991139740455672 +lre17_jhoqfjpk.flac,ara-arq,0.9999995523948527 +lre17_jhoqfjpk.flac-g711a,ara-arq,0.9944430263756097 +lre17_jiakkjtr.sph,ara-arq,0.9999993323735444 +lre17_jilypibp.flac-gsm,ara-arq,0.9996434093761065 +lre17_jiowcahg.sph,ara-arq,0.992648625274396 +lre17_jlvgsuxh.sph,ara-aeb,0.9948123012485498 +lre17_jlvtorab.sph,ara-arq,0.983513534636461 +lre17_jmkuwbpc.sph,ara-arq,0.9994527050835158 +lre17_jnipskqx.flac-g711mu,ara-arq,0.9999742870473751 +lre17_jpeqxepv.sph,ara-aeb,0.9994104144919757 +lre17_jpeyombi.sph,ara-arq,0.9999998044387237 +lre17_jpjtuxvw.flac,ara-arq,0.9996847495267612 +lre17_jqmoqqfm.flac,ara-arq,0.9999869216223071 +lre17_jqmoqqfm.flac-g726,ara-arq,0.9999999894357187 +lre17_jtdfvpln.sph,ara-arq,0.999997871153658 +lre17_jtqoxxtm.sph,ara-aeb,0.9965086342211626 +lre17_jvpfjwdp.flac,ara-ayl,0.9999999997451912 +lre17_jvurmddm.flac,ara-arq,0.9987187046194855 +lre17_jvurmddm.flac-gsm,ara-arq,0.9983730947085013 +lre17_jwkybctt.sph,ara-arq,0.9999989593481196 +lre17_jxcmtrxm.sph,ara-ayl,0.9996284167713838 +lre17_jywavsuu.flac,ara-arq,0.9913930400541082 +lre17_kaaesmko.flac,ara-arq,0.9999294732363818 +lre17_kbmrgfwm.sph,ara-arq,0.9999528508197458 +lre17_kbodxjcn.flac,ara-arq,0.9982992372902407 +lre17_kcdcpzly.sph,ara-arq,0.9759220472115765 +lre17_keetepyz.flac,ara-arq,0.9927333842986636 +lre17_keetepyz.flac-g722,ara-arq,0.9999998412855006 +lre17_kfmsssrs.sph,ara-arq,0.9997193659178423 +lre17_kfsotues.sph,ara-arq,0.999998669422541 +lre17_khygxcdj.sph,ara-arq,0.9868874065356342 +lre17_kjtqnjgt.sph,ara-arq,0.9791232775307577 +lre17_kkcxpjzr.flac,ara-arq,0.9999999887350973 +lre17_kkcxpjzr.flac-g711a,ara-arq,0.9999944954328739 +lre17_kmzwffxp.sph,ara-arq,0.9782750887595135 +lre17_knkvczhw.flac-g722,ara-arq,0.9999106890724243 +lre17_kpbzatbg.sph,ara-arq,0.9935992901995973 +lre17_kpcquycc.sph,ara-arq,0.9988726069205118 +lre17_kugvrfiw.sph,ara-arq,0.9986475838029554 +lre17_kuzbruhc.sph,ara-arq,0.9998879396014413 +lre17_kwvzftsa.sph,ara-arq,0.9906568648956764 +lre17_kzutiwjm.sph,ara-arq,0.9883352489803169 +lre17_larfsawf.sph,ara-arq,0.9833813699639339 +lre17_lectmxiy.sph,ara-arq,0.9997601079206343 +lre17_lfdmjqzk.sph,ara-arq,0.9997174449458649 +lre17_lfqfgpty.flac,ara-arq,0.9877470265323836 +lre17_lfqfgpty.flac-opus,ara-arq,0.9998989537391589 +lre17_lgimdxjv.sph,ara-arq,0.9897020483952464 +lre17_lgmtfuaf.sph,ara-arq,0.9997034751344174 +lre17_lgzhdvir.flac,ara-arq,0.9999666603862899 +lre17_lgzhdvir.flac-g723_1,ara-arq,0.9999814711501472 +lre17_litfqatc.sph,ara-arq,0.9999697307957149 +lre17_ljqkqvuk.sph,ara-arq,0.9931204062930487 +lre17_lkeepofx.sph,ara-aeb,0.9815662246718163 +lre17_lkvpiaco.flac-g711mu,ara-aeb,0.9894714726927342 +lre17_llwfixbt.flac,ara-arq,0.9998571656021117 +lre17_llwfixbt.flac-opus,ara-arq,0.9999840733365404 +lre17_llxcpovx.sph,ara-arq,0.9999726611852431 +lre17_lmtexhdt.sph,ara-arq,0.9955378310817409 +lre17_lnlzbiqv.sph,ara-arq,0.9845486175862881 +lre17_lnwqjgum.sph,ara-arq,0.9994084640832857 +lre17_logsuwkc.sph,ara-ayl,0.9973010083242871 
+lre17_lpdrjcmf.sph,ara-arq,0.9999999987748978 +lre17_lpnxjclp.sph,ara-arq,0.999990551187932 +lre17_lpwlbnvd.sph,ara-aeb,0.9985688096228789 +lre17_lqcxhbgx.flac,ara-aeb,0.9789437284063228 +lre17_lqeynset.sph,ara-arq,0.9946714116424836 +lre17_lqqtwkna.sph,ara-arq,0.9959561849558086 +lre17_lrchzlnf.sph,ara-arq,0.9999964986490049 +lre17_lriptaxa.sph,ara-arq,0.9999861992203515 +lre17_lrmpuslv.sph,ara-arq,0.9999833158908321 +lre17_lsglcrqu.sph,ara-arq,0.9994974702145716 +lre17_ltobvlca.flac,ara-aeb,0.9998321920878662 +lre17_ltobvlca.flac-g726,ara-arq,0.9808235035540288 +lre17_lumlsydt.flac,ara-arq,0.9993847597571562 +lre17_lvwbcjui.sph,ara-arq,0.9999990130410765 +lre17_lwbqplua.flac,ara-arq,0.9985377563185653 +lre17_lyvsulsp.sph,ara-arq,0.9996779798745427 +lre17_lzzfbiwk.sph,ara-arq,0.9999780111773144 +lre17_mazmicwf.flac,ara-arq,0.9761802678092957 +lre17_mazmicwf.flac-g726,ara-arq,0.9999987851548972 +lre17_mcchuzqa.flac,ara-arq,0.9994453172253329 +lre17_mcchuzqa.flac-g726,ara-arq,0.9994722195518764 +lre17_mhelcckx.sph,ara-aeb,0.9921911924278494 +lre17_minmrdvv.flac,ara-arq,0.9999806822091847 +lre17_minmrdvv.flac-g711mu,ara-arq,0.9999798110768492 +lre17_miyeplrp.flac,ara-ayl,0.999967537246669 +lre17_miyeplrp.flac-g722,ara-ayl,0.9999974966675732 +lre17_mjkrjctc.sph,ara-arq,0.9996993211891599 +lre17_mjuhytod.flac-g722,ara-aeb,0.9942683057158186 +lre17_mjxevtqw.flac,ara-arq,0.9789540403486894 +lre17_mllyvrkw.sph,ara-arq,0.9984655364684033 +lre17_mneiaioi.sph,ara-aeb,0.985452121186191 +lre17_mnoswtar.flac,ara-arq,0.9999465324732042 +lre17_mnoswtar.flac-g722,ara-arq,0.9994314827928369 +lre17_moihuogw.sph,ara-arq,0.9999981054273598 +lre17_moohuqbu.flac-opus,ara-arq,0.9946789594259231 +lre17_mpewcntj.sph,ara-arq,0.9999987697239342 +lre17_mtyfveku.sph,ara-arq,0.9829721690668127 +lre17_mvbpdkqz.sph,ara-ayl,0.9948321204607391 +lre17_mxcghtfj.sph,ara-arq,0.9927362055311203 +lre17_mxhoedfe.sph,ara-arq,0.9999999146737504 +lre17_mxmdmamo.sph,ara-aeb,0.988517700201585 +lre17_mxmjurdd.sph,ara-arq,0.9980132953988482 +lre17_mzdpsrvs.sph,ara-aeb,0.992494003405007 +lre17_mzsfsjad.sph,ara-aeb,0.99997586512649 +lre17_naeguqak.sph,ara-arq,0.9999929946428248 +lre17_nblzukhx.flac,ara-arq,0.9999359009222737 +lre17_nblzukhx.flac-gsm,ara-arq,0.9999999966088737 +lre17_ndkkdwgy.sph,ara-arq,0.9975163999653704 +lre17_negphusk.sph,ara-arq,0.9998989889366274 +lre17_nhdlsoit.sph,ara-arq,0.9916584056978099 +lre17_njbwudbl.sph,ara-arq,0.999934592749547 +lre17_njontgtu.sph,ara-arq,0.9794969009896114 +lre17_nkgdldta.sph,ara-arq,0.9916521956821477 +lre17_nkqygxxz.sph,ara-arq,0.9997656446176615 +lre17_nocucjva.sph,ara-arq,0.9995204775364295 +lre17_nojsrnhx.sph,ara-arq,0.999919125620621 +lre17_nowvnwzc.sph,ara-arq,0.995585233402159 +lre17_nqfliycm.sph,ara-arq,0.9999912069728009 +lre17_nqkyimjt.sph,ara-arq,0.9881291130932576 +lre17_nqxowwop.flac,ara-arq,0.9997443038852292 +lre17_nqxowwop.flac-g723_1,ara-arq,0.9914457208775102 +lre17_nrunzxja.flac,ara-arq,0.9993221612062564 +lre17_nrunzxja.flac-g711mu,ara-arq,0.9999566225291738 +lre17_nsiynodu.sph,ara-arq,0.9908214588078317 +lre17_nssuzfbr.sph,ara-arq,0.9999396177844772 +lre17_ntbrwymu.sph,ara-arq,0.9993012725372231 +lre17_nuvzuxee.sph,ara-arq,0.9996041721916568 +lre17_nvgpubxb.sph,ara-arq,0.9759857598176621 +lre17_nxjuqezl.flac,ara-arq,0.9995754800524955 +lre17_nxjuqezl.flac-gsm,ara-arq,0.9793987540104333 +lre17_nxvquxsr.sph,ara-arq,0.9990399807148835 +lre17_nzeyrrcl.sph,ara-arq,0.9999461953593082 +lre17_nzmnjjpc.flac,ara-aeb,0.9847092271434903 
+lre17_obbtvsaj.flac-g711a,ara-arq,0.9999409869224803 +lre17_obkyiehe.sph,ara-arq,0.9998679965082828 +lre17_obrcwlmw.sph,ara-aeb,0.9998778475538475 +lre17_ogwcxkjw.sph,ara-arq,0.9895315802827847 +lre17_oireqedt.sph,ara-arq,0.9917281473076983 +lre17_oirnebxz.flac,ara-arq,0.9967110495563957 +lre17_oirnebxz.flac-opus,ara-aeb,0.9933118074655622 +lre17_oiveluew.sph,ara-arq,0.9968284888503907 +lre17_oizxklej.sph,ara-arq,0.9986908296100067 +lre17_olqpjrwd.sph,ara-ayl,0.985872505893845 +lre17_olwownje.sph,ara-arq,0.9999818242744661 +lre17_onckhujt.sph,ara-aeb,0.9979436467237117 +lre17_onknnaim.sph,ara-aeb,0.9987555260169619 +lre17_opsncnkb.sph,ara-aeb,0.9997592175168953 +lre17_opxoeses.flac-g711mu,ara-arq,0.9895267895164883 +lre17_oqnuceey.flac,ara-arq,0.9818564260274837 +lre17_oqnuceey.flac-opus,ara-arq,0.9960254767681471 +lre17_orthumig.sph,ara-arq,0.9979787737264081 +lre17_ouhrqmvj.sph,ara-arq,0.9999999735219096 +lre17_oukunjzc.flac,ara-arq,0.9999998874015028 +lre17_oukunjzc.flac-g722,ara-arq,0.9999998022867953 +lre17_ouvsypqp.sph,ara-arq,0.9999862194709894 +lre17_ownmyzum.sph,ara-arq,0.9983899224785795 +lre17_owxndsay.sph,ara-arq,0.9991660737491793 +lre17_oxoeettt.sph,ara-aeb,0.994764323060291 +lre17_oxvlijdf.sph,ara-arq,0.9980756971870425 +lre17_oylngzoh.sph,ara-arq,0.9999772205491734 +lre17_pbmuxcky.flac,ara-arq,0.9980374961356401 +lre17_pbmuxcky.flac-opus,ara-arq,0.9999987347640981 +lre17_pdcigndc.sph,ara-arq,0.9956870254382242 +lre17_pfcsmyfp.flac,ara-arq,0.999994210275427 +lre17_pfcsmyfp.flac-opus,ara-arq,0.9995142367581035 +lre17_pfecwivw.flac,ara-arq,0.997995447936321 +lre17_pfecwivw.flac-gsm,ara-arq,0.979556413285578 +lre17_pfenqxed.sph,ara-arq,0.9935592984355501 +lre17_pgqzdpfq.sph,ara-arq,0.9987770018281733 +lre17_phvwlddn.sph,ara-arq,0.9996126414779914 +lre17_piiiaqsg.sph,ara-arq,0.9883006332201746 +lre17_piixpsbr.flac,ara-arq,0.9999739707108446 +lre17_piixpsbr.flac-g722,ara-arq,0.9996474019470863 +lre17_pixqbtbm.flac,ara-arq,0.9949782923210799 +lre17_pixqbtbm.flac-g726,ara-ayl,0.9966265706203424 +lre17_pjfvtjab.sph,ara-arq,0.999885331527543 +lre17_pklmiexr.sph,ara-arq,0.9992720760130763 +lre17_pnlxhqnm.sph,ara-ayl,0.9999929993566911 +lre17_pnrhsfou.flac-g722,ara-arq,0.9999640996706576 +lre17_pnwenjwm.sph,ara-arq,0.9960418034539658 +lre17_poheolla.sph,ara-aeb,0.9985638045139876 +lre17_poysotsv.sph,ara-aeb,0.9856503882631178 +lre17_ppvtutvt.sph,ara-arq,0.9998354077570467 +lre17_pqawpvfb.flac,ara-arq,0.9950780853489194 +lre17_pqawpvfb.flac-g711a,ara-arq,0.9994370686040279 +lre17_pqwwzwxo.sph,ara-arq,0.999990243213515 +lre17_psacvdup.flac,ara-arq,0.9999630353117823 +lre17_psacvdup.flac-opus,ara-arq,0.9994055674097663 +lre17_pslkpzhl.sph,ara-arq,0.9997159742438066 +lre17_pufnzdvd.flac,ara-arq,0.9997950371376702 +lre17_pufnzdvd.flac-gsm,ara-arq,0.9999959634431062 +lre17_pujabbev.sph,ara-arq,0.9994079718102534 +lre17_pvfvlhsq.flac,ara-aeb,0.9999713112925558 +lre17_pvfvlhsq.flac-gsm,ara-arq,0.9999254611931253 +lre17_pwhqsovd.sph,ara-arq,0.9917133657396171 +lre17_pxekbodb.sph,ara-arq,0.9984745978775882 +lre17_qdwsexfm.sph,ara-arq,0.9999974580577462 +lre17_qhiyavse.sph,ara-arq,0.9912788455576231 +lre17_qivtcmgk.sph,ara-arq,0.9836174820047392 +lre17_qjitoyxc.sph,ara-arq,0.9999874042742806 +lre17_qkxouubm.sph,ara-arq,0.9998872491727429 +lre17_qljscllj.sph,ara-aeb,0.9979913152483216 +lre17_qlzldcpe.sph,ara-aeb,0.997053186781475 +lre17_qmcrgdzz.sph,ara-arq,0.9996004571017476 +lre17_qmjbylrs.flac,ara-arq,0.9999120076761361 +lre17_qmjbylrs.flac-g723_1,ara-arq,0.9942971237057362 
+lre17_qmjpvlvg.sph,ara-arq,0.9998030655795183 +lre17_qogybjhz.sph,ara-arq,0.9994621165166646 +lre17_qpntxzjb.sph,ara-aeb,0.9957626204201693 +lre17_qpredbkv.sph,ara-arq,0.9990911968221025 +lre17_qrbdlmjx.sph,ara-arq,0.9999999999999865 +lre17_qrcvlqts.sph,ara-arq,0.9767514676069964 +lre17_qscgrzxe.flac,ara-arq,0.9998709648180928 +lre17_qsewfkyh.sph,ara-arq,0.9999999864987743 +lre17_qstdyztt.flac-g711mu,ara-arq,0.9999759932517555 +lre17_qszrgiyz.sph,ara-arq,0.9999496361715189 +lre17_qtaulytr.sph,ara-arq,0.9770776315818761 +lre17_qudalolg.sph,ara-arq,0.998897750323492 +lre17_qufteqvo.sph,ara-aeb,0.992027129263138 +lre17_qwiyjayz.sph,ara-arq,0.9995498080059056 +lre17_qwvrxfzu.sph,ara-arq,0.9988604816072997 +lre17_qyiarywg.flac,ara-arq,0.9999605279920688 +lre17_qyiarywg.flac-g723_1,ara-arq,0.9999999619112184 +lre17_qyzhxzvj.sph,ara-arq,0.9759510962602079 +lre17_rajrtwbo.sph,ara-aeb,0.9999999999944702 +lre17_rcryqfgn.sph,ara-arq,0.9972776568740012 +lre17_rcueudci.flac,ara-arq,0.9988837735514282 +lre17_rcueudci.flac-gsm,ara-arq,0.9999697760125505 +lre17_reicsaat.sph,ara-arq,0.9997754476127328 +lre17_reyualuk.flac,ara-arq,0.9992844879623304 +lre17_rfwyqutk.sph,ara-arq,0.9997722244477082 +lre17_rggtfbrd.sph,ara-aeb,0.9931773563621665 +lre17_rhepwrug.sph,ara-arq,0.9998787750778266 +lre17_rkocbhzs.sph,ara-arq,0.9960601282813184 +lre17_rlcyzlcy.sph,ara-arq,0.9994472570443922 +lre17_rlpbjbed.sph,ara-arq,0.9942325885969098 +lre17_rlqkwaeh.sph,ara-arq,0.9953431894962037 +lre17_rnveyooi.sph,ara-ayl,0.9996778752622651 +lre17_rnvyrkwg.flac-g723_1,ara-arq,0.9853171747622366 +lre17_rqacreai.sph,ara-arq,0.999207518939918 +lre17_rqlzthlg.sph,ara-aeb,0.989048786309874 +lre17_ruzqcwpn.sph,ara-arq,0.9999863065050799 +lre17_rwvdctfg.flac,ara-aeb,0.9815766373873294 +lre17_rynppewk.flac,ara-arq,0.9999708695439152 +lre17_rypzhghv.flac,ara-arq,0.9947011510267938 +lre17_rypzhghv.flac-g711a,ara-arq,0.9847132293141271 +lre17_sagynpjo.sph,ara-arq,0.999744534125517 +lre17_sbxerjvo.sph,ara-arq,0.999987473908599 +lre17_scfolxob.flac,ara-arq,0.9999713591244429 +lre17_serpsscu.flac,ara-arq,0.9793042704401821 +lre17_serpsscu.flac-g723_1,ara-arq,0.9999130486126522 +lre17_sffusbzg.sph,ara-arq,0.9999268097555194 +lre17_sfjwayps.flac,ara-ayl,0.9946715937173086 +lre17_sfjwayps.flac-g726,ara-arq,0.9982714238405073 +lre17_sgkgyjvk.flac,ara-arq,0.9995433812540649 +lre17_sgkgyjvk.flac-gsm,ara-arq,0.9996437880979923 +lre17_sjnfbigi.sph,ara-arq,0.99999366855751 +lre17_skdclppi.sph,ara-arq,0.9889180838738156 +lre17_smjdgznr.flac,ara-aeb,0.9964248254318828 +lre17_snfzxijz.sph,ara-ayl,0.9827938327458273 +lre17_sofspqyi.sph,ara-arq,0.9999994382673698 +lre17_sqoxhftl.sph,ara-arq,0.9996790538981134 +lre17_stbhhhou.sph,ara-arq,0.9945710415211226 +lre17_stpksvvi.sph,ara-aeb,0.996004757361174 +lre17_stxkelkq.sph,ara-ayl,0.9752463023195366 +lre17_suqttdyg.sph,ara-arq,0.997742051798683 +lre17_susdosey.sph,ara-arq,0.9977231084345539 +lre17_suvxbjhl.sph,ara-arq,0.9986381780682658 +lre17_svetuuie.sph,ara-arq,0.9998479453288084 +lre17_svzozbfk.sph,ara-arq,0.9998881824847226 +lre17_swgrlydv.sph,ara-arq,0.9990560832648376 +lre17_sxgfwork.sph,ara-arq,0.9999763564539524 +lre17_syatmwze.sph,ara-arq,0.9848649335693501 +lre17_syxmxolu.sph,ara-aeb,0.9867936744030255 +lre17_tbbuisna.sph,ara-arq,0.9999805669714006 +lre17_tbplljcp.flac,ara-arq,0.9998129818454303 +lre17_tbplljcp.flac-gsm,ara-arq,0.9999999839340195 +lre17_tcmjqsvf.sph,ara-arq,0.9936464811055075 +lre17_tcvunuvp.sph,ara-aeb,0.9946331881971427 +lre17_tduxpzqq.sph,ara-aeb,0.9996190225252365 
+lre17_teyvymzd.flac-g711mu,ara-ayl,0.9787413632582724 +lre17_tfngvqdf.flac,ara-arq,0.999990203549186 +lre17_tfngvqdf.flac-g726,ara-arq,0.9983466771871533 +lre17_tforvtmc.sph,ara-arq,0.9935448102639823 +lre17_tfxmolis.sph,ara-arq,0.9998286292942293 +lre17_thjcyqwr.flac,ara-arq,0.9994467118163807 +lre17_thjcyqwr.flac-opus,ara-arq,0.9976422823383214 +lre17_thxeccdu.sph,ara-arq,0.9998750920305819 +lre17_ticjhhbi.sph,ara-arq,0.9794445128724558 +lre17_tjcshvrl.sph,ara-arq,0.9999926102290503 +lre17_tjremugr.sph,ara-arq,0.9999991435250514 +lre17_tlutsejs.sph,ara-arq,0.9988207070133517 +lre17_tnrvafxe.sph,ara-arq,0.9944573271724075 +lre17_tnxtgdnc.sph,ara-aeb,0.9942622096810594 +lre17_tolpbvsc.flac,ara-arq,0.9999996570853448 +lre17_tolpbvsc.flac-g711mu,ara-arq,0.9999930938174156 +lre17_totoyxhm.sph,ara-arq,0.999996710305506 +lre17_tqkpkxgu.sph,ara-arq,0.9999960590328173 +lre17_tqmnzgyb.sph,ara-arq,0.9999682738390965 +lre17_tsppppzj.sph,ara-arq,0.9999889892476231 +lre17_tssmuwge.sph,ara-arq,0.9993471439476459 +lre17_ttkmfmkk.sph,ara-arq,0.9961646913300042 +lre17_ttvvzlvt.sph,ara-arq,0.9945500617775027 +lre17_twkrspxj.flac-g711mu,ara-arq,0.9970066534454132 +lre17_tyqxhlrh.sph,ara-arq,0.9947460028171129 +lre17_tzwuzntv.flac,ara-arq,0.9999848694087901 +lre17_tzwuzntv.flac-g711a,ara-arq,0.9999999505751382 +lre17_uawwqpsa.sph,ara-arq,0.9857085643990153 +lre17_ubnnanex.sph,ara-arq,0.9999993559208963 +lre17_ucfvsgyr.flac,ara-arq,0.9961772950424368 +lre17_ucfvsgyr.flac-g711mu,ara-arq,0.9999597640499912 +lre17_ufifckts.flac-gsm,ara-ayl,0.9928071586629514 +lre17_uiyescxr.sph,ara-arq,0.9997891506043249 +lre17_ukkxkxxt.sph,ara-aeb,0.9935449493739165 +lre17_umissmzv.sph,ara-arq,0.9990005119204275 +lre17_unxhwqmy.flac,ara-aeb,0.9963270393303603 +lre17_upseluva.sph,ara-arq,0.9909948021770557 +lre17_upvapoke.sph,ara-arq,0.9930629481999376 +lre17_uqtiiong.sph,ara-arq,0.9999999685612003 +lre17_usdeaflg.sph,ara-arq,0.9857851998633298 +lre17_uszjbbko.sph,ara-arq,0.9999941374675029 +lre17_utjkjjcn.sph,ara-arq,0.9979757853366961 +lre17_utooogzo.sph,ara-ayl,0.9959130999661093 +lre17_uwescwtn.sph,ara-arq,0.9999992993153919 +lre17_uwldzayo.sph,ara-arq,0.9999916059792026 +lre17_uwuytsxe.sph,ara-arq,0.9947486052008054 +lre17_uzxmtvue.sph,ara-arq,0.9968064808522498 +lre17_vaugwmvv.sph,ara-arq,0.9994338833370221 +lre17_vbgmqfuo.flac,ara-arq,0.980847451266026 +lre17_vbjsoyeh.sph,ara-arq,0.9999116642269064 +lre17_vcksyiuy.flac-g711a,ara-aeb,0.9835977136688748 +lre17_vgxwjuno.sph,ara-arq,0.999983609611863 +lre17_vingckxa.flac,ara-arq,0.9986237456025335 +lre17_vjffccpz.sph,ara-arq,0.9999954081046549 +lre17_vjfjqitw.flac-gsm,ara-arq,0.9994035904368442 +lre17_vjtprfjw.flac,ara-ayl,0.9964430683674823 +lre17_vjtprfjw.flac-g726,ara-arq,0.9805106671407414 +lre17_vjvrlhfs.sph,ara-arq,0.9833183398241712 +lre17_vkqxvmtc.sph,ara-arq,0.9994368014427134 +lre17_vmssxzzd.sph,ara-arq,0.9996477418713372 +lre17_vndndpzq.sph,ara-arq,0.9791108970484209 +lre17_vnlvmhpc.flac-g711mu,ara-arq,0.9999365684834217 +lre17_vnxwpwge.sph,ara-aeb,0.9986622760430225 +lre17_vovpsxcd.sph,ara-arq,0.9998378398538086 +lre17_vpossvdt.flac-g711a,ara-arq,0.9999460791991176 +lre17_vrejajcm.sph,ara-arq,0.9776138943860346 +lre17_vswsposp.sph,ara-arq,0.9994020716061514 +lre17_vtigorkv.sph,ara-arq,0.9999623163215805 +lre17_vtkffspm.flac,ara-arq,0.9997898857084506 +lre17_vuznysrk.flac-g711mu,ara-arq,0.9979511877631668 +lre17_vvyqmniq.sph,ara-arq,0.9999515987872877 +lre17_vwijmoke.sph,ara-arq,0.9989975351908933 +lre17_vynkvprp.sph,ara-arq,0.9999845129327151 
+lre17_wagzvxqz.sph,ara-arq,0.9973278031174633 +lre17_wahbanqs.sph,ara-arq,0.9998842310344779 +lre17_wairvblk.sph,ara-arq,0.9987016619787147 +lre17_wesfzmws.sph,ara-arq,0.9908093051797177 +lre17_wfvvkjuv.sph,ara-arq,0.999944678936039 +lre17_wggunlcp.sph,ara-arq,0.9996749635832616 +lre17_wgnbrmfd.sph,ara-aeb,0.980616930826586 +lre17_whjqstnl.sph,ara-ayl,0.999979615699586 +lre17_whqbhubs.sph,ara-arq,0.9898499177391032 +lre17_widuepdg.sph,ara-aeb,0.9817918573044054 +lre17_wkhkxpmr.sph,ara-ayl,0.9998286566004881 +lre17_wmtiighi.sph,ara-arq,0.987786199224793 +lre17_wnevoywa.sph,ara-arq,0.9999869741085323 +lre17_woccwvjw.sph,ara-arq,0.9999711680680078 +lre17_wrwmvkyy.sph,ara-arq,0.9999793167856921 +lre17_wryaaaay.sph,ara-arq,0.9791656416113507 +lre17_wtkatcwm.sph,ara-arq,0.9850427630920813 +lre17_wvgqdrqk.sph,ara-arq,0.9791490717212691 +lre17_wvyabqbx.sph,ara-arq,0.9990947686607856 +lre17_wwypkyea.sph,ara-aeb,0.9892611131137926 +lre17_wxaxnvpq.sph,ara-arq,0.996190338175843 +lre17_wxirsbfe.sph,ara-arq,0.9998855761433262 +lre17_wxwauidm.sph,ara-arq,0.9899615630910984 +lre17_wyjetcgf.sph,ara-arq,0.9999367576307792 +lre17_xaowthgy.sph,ara-arq,0.9983944376668455 +lre17_xdcmpfbl.sph,ara-arq,0.9914795091793974 +lre17_xdhrhgmk.flac,ara-arq,0.9999939811339105 +lre17_xdhrhgmk.flac-opus,ara-arq,0.9999380489585059 +lre17_xepisjpn.sph,ara-arq,0.998123428276411 +lre17_xhpkbvei.sph,ara-aeb,0.9896471029490118 +lre17_xhqfsfkf.sph,ara-arq,0.9999985545272336 +lre17_xkayfgzq.sph,ara-arq,0.9985227970239359 +lre17_xllwincb.sph,ara-arq,0.9999999189534862 +lre17_xlqqxoym.sph,ara-arq,0.9999541071953805 +lre17_xmvmloxn.flac,ara-arq,0.9827789901954631 +lre17_xmvmloxn.flac-g723_1,ara-arq,0.9998467402490113 +lre17_xnyjhsyy.sph,ara-arq,0.9999956069402056 +lre17_xovcjkso.sph,ara-aeb,0.9920305514596128 +lre17_xroveufz.sph,ara-aeb,0.9993682416393447 +lre17_xskjgkzq.sph,ara-arq,0.9970587807073615 +lre17_xsuhxjmz.sph,ara-arq,0.9999908698954791 +lre17_xtwbrgfu.sph,ara-aeb,0.9964922765834566 +lre17_xujatdxg.sph,ara-arq,0.9999544795771792 +lre17_xviuupwl.sph,ara-arq,0.9999998045117351 +lre17_xvxlncwz.sph,ara-arq,0.9999098345731946 +lre17_xyyhmsku.flac,ara-arq,0.998537851661698 +lre17_ybtygbuu.sph,ara-arq,0.9999292474625724 +lre17_yctdihii.sph,ara-arq,0.999999574179243 +lre17_ydmmannh.flac,ara-arq,0.9983920334462054 +lre17_ydmmannh.flac-g723_1,ara-arq,0.9995348938940224 +lre17_yekpxxwc.flac,ara-arq,0.9988877246272224 +lre17_yekpxxwc.flac-g711mu,ara-aeb,0.9923589185563311 +lre17_ygndvzfp.sph,ara-arq,0.9999911328372215 +lre17_yhjzokrv.sph,ara-arq,0.9958319219062072 +lre17_yilroulj.flac-g711a,ara-aeb,0.9795574945355306 +lre17_yivtnzkg.sph,ara-arq,0.9999715099952994 +lre17_yjoblztq.flac,ara-arq,0.9999973207194949 +lre17_yjoblztq.flac-opus,ara-arq,0.9998238058972634 +lre17_ykxiohej.sph,ara-arq,0.9990315895452987 +lre17_ylsidleu.flac-g711a,ara-arq,0.9984473055243461 +lre17_yltydxpy.sph,ara-arq,0.9956478528044228 +lre17_yownwnlt.flac,ara-arq,0.9998812691288554 +lre17_yownwnlt.flac-g722,ara-arq,0.9916894249368101 +lre17_ypetotbw.flac,ara-arq,0.9999949516854025 +lre17_ypetotbw.flac-gsm,ara-aeb,0.9972546803878858 +lre17_yqgdczse.flac,ara-arq,0.9930023849936759 +lre17_yqhtqtnl.flac,ara-arq,0.9993399870284819 +lre17_yqhtqtnl.flac-opus,ara-arq,0.9998349097677763 +lre17_yrzjdbif.sph,ara-aeb,0.9999958037591928 +lre17_ysadxqiw.sph,ara-arq,0.9999106154927021 +lre17_ysdzkrmo.flac-g711a,ara-arq,0.9974528853995988 +lre17_ytgfvwpa.flac,ara-arq,0.9998217502864875 +lre17_ytgfvwpa.flac-opus,ara-arq,0.9998524581773589 
+lre17_yuduwhrd.flac,ara-arq,0.9955028997292512 +lre17_yuoequzk.sph,ara-arq,0.9999099394309094 +lre17_yuxtqtbd.sph,ara-arq,0.9999994359070692 +lre17_yvybpria.sph,ara-arq,0.9973286645664943 +lre17_ywssuzqt.sph,ara-arq,0.9999990761916652 +lre17_yygbpsdg.sph,ara-arq,0.9999490833528133 +lre17_yynyldnq.sph,ara-arq,0.9999228504305794 +lre17_yzbbhyzt.sph,ara-arq,0.9993694348705324 +lre17_yzjlvluy.sph,ara-arq,0.9996272929446929 +lre17_zaopfwhd.flac,ara-arq,0.9904595900538987 +lre17_zarwpotk.sph,ara-arq,0.9999289722738884 +lre17_zbolnsoz.sph,ara-arq,0.9999999973417777 +lre17_zcjklnfe.sph,ara-arq,0.9996813639473103 +lre17_zcxzxqos.sph,ara-arq,0.9999489746159186 +lre17_zdhatipt.flac,ara-ayl,0.9990280980242568 +lre17_zdhatipt.flac-g726,ara-aeb,0.9996834792362927 +lre17_zfajxywc.sph,ara-arq,0.9999934420544287 +lre17_zgdxpveq.flac-g711a,ara-arq,0.9999999408428903 +lre17_zgiksrvx.flac-g711mu,ara-arq,0.9993189757834657 +lre17_zhdbyfcw.sph,ara-arq,0.9997585048527508 +lre17_zhdfyrxw.sph,ara-arq,0.999148999070484 +lre17_zilbjisa.flac,ara-ayl,0.9999999857011468 +lre17_zilbjisa.flac-gsm,ara-ayl,0.9998704515922084 +lre17_zkdjfgbp.sph,ara-aeb,0.9951005308493373 +lre17_zmebjusq.sph,ara-arq,0.9993030221446528 +lre17_zmodeuem.sph,ara-arq,0.9999743212165114 +lre17_zmyziuxc.flac,ara-aeb,0.9999827977872252 +lre17_zmyziuxc.flac-g711a,ara-aeb,0.9950355679983685 +lre17_zpjrydvx.sph,ara-aeb,0.9994743369849513 +lre17_zrnsvuzf.sph,ara-arq,0.994174882388934 +lre17_zruejjuh.flac,ara-arq,0.9992671462220715 +lre17_zsrybjvn.sph,ara-arq,0.9993004483160852 +lre17_zvvdwwpv.flac,ara-aeb,0.9997026564129534 +lre17_zvvdwwpv.flac-g726,ara-aeb,0.9999912235110034 +lre17_zzkdjfea.sph,ara-arq,0.9842091321709953 diff --git a/egs/lre22/fixed.v1.8k/run_001_prepare_data.sh b/egs/lre22/fixed.v1.8k/run_001_prepare_data.sh new file mode 100755 index 00000000..60eb6891 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/run_001_prepare_data.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +config_file=default_config.sh +stage=1 + +. parse_options.sh || exit 1; +. 
datapath.sh
+
+
+if [ $stage -le 1 ];then
+  # Prepare VoxLingua107 for training
+  hyp_utils/conda_env.sh \
+    local/prepare_voxlingua107.py \
+    --corpus-dir $voxlingua_root \
+    --output-dir data/voxlingua107 \
+    --remove-langs en-en es-es ar-ar pt-pt \
+    --map-langs-to-lre-codes \
+    --target-fs 8000
+
+fi
+
+if [ $stage -le 2 ];then
+  # Prepare LRE17 training data
+  hyp_utils/conda_env.sh \
+    local/prepare_lre17.py \
+    --corpus-dir $lre17_train_root \
+    --output-dir data/lre17_train \
+    --subset train \
+    --target-fs 8000
+
+  hyp_utils/conda_env.sh \
+    local/prepare_lre17.py \
+    --corpus-dir $lre17_train_root \
+    --output-dir data/lre17_dev_cts \
+    --subset dev \
+    --source mls14 \
+    --target-fs 8000
+
+  hyp_utils/conda_env.sh \
+    local/prepare_lre17.py \
+    --corpus-dir $lre17_train_root \
+    --output-dir data/lre17_dev_afv \
+    --subset dev \
+    --source vast \
+    --target-fs 8000
+
+  hyp_utils/conda_env.sh \
+    local/prepare_lre17.py \
+    --corpus-dir $lre17_eval_root \
+    --output-dir data/lre17_eval_cts \
+    --subset eval \
+    --source mls14 \
+    --target-fs 8000
+
+  hyp_utils/conda_env.sh \
+    local/prepare_lre17.py \
+    --corpus-dir $lre17_eval_root \
+    --output-dir data/lre17_eval_afv \
+    --subset eval \
+    --source vast \
+    --target-fs 8000
+
+fi
+
+if [ $stage -le 3 ];then
+  hyp_utils/conda_env.sh \
+    local/prepare_lre22_dev.py \
+    --corpus-dir $lre22_dev_root \
+    --output-dir data/lre22_dev \
+    --target-fs 8000
+
+fi
+
+if [ $stage -le 4 ];then
+  hyp_utils/conda_env.sh \
+    local/prepare_lre22_eval.py \
+    --corpus-dir $lre22_eval_root \
+    --output-dir data/lre22_eval \
+    --target-fs 8000
+
+fi
+
+if [ $stage -le 5 ];then
+  local/download_lre22_scorer.sh
+  local/download_focal.sh
+fi
diff --git a/egs/lre22/fixed.v1.8k/run_002_compute_evad.sh b/egs/lre22/fixed.v1.8k/run_002_compute_evad.sh
new file mode 100755
index 00000000..676ed335
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/run_002_compute_evad.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# Copyright
+# 2018 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+nodes=b1
+storage_name=$(date +'%m_%d_%H_%M')
+vaddir=`pwd`/exp/vad_e
+
+stage=1
+config_file=default_config.sh
+. parse_options.sh || exit 1;
+. $config_file
+
+
+if [ $stage -le 1 ]; then
+  # Prepare to distribute data over multiple machines
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then
+    dir_name=$USER/hyp-data/lre22-fixed-v1.8k-$storage_name/vad/storage
+    if [ "$nodes" == "b0" ];then
+      utils/create_split_dir.pl \
+        /export/b{04,05,06,07}/$dir_name $vaddir/storage
+    elif [ "$nodes" == "b1" ];then
+      utils/create_split_dir.pl \
+        /export/b1{0,1,2,3,4,5,6,7,8,9}/$dir_name $vaddir/storage
+    elif [ "$nodes" == "c0" ];then
+      utils/create_split_dir.pl \
+        /export/c{06,07,08,09}/$dir_name $vaddir/storage
+    elif [ "$nodes" == "fs01" ];then
+      utils/create_split_dir.pl \
+        /export/fs01/$dir_name $vaddir/storage
+    else
+      echo "we do not distribute data across multiple machines"
+    fi
+  fi
+fi
+
+# Compute energy VAD for train/test datasets
+if [ $stage -le 2 ];then
+  for name in voxlingua107 \
+                lre17_train \
+                lre17_dev_cts lre17_dev_afv \
+                lre17_eval_cts lre17_eval_afv \
+                lre22_dev lre22_eval
+  do
+    # Use as many jobs as speakers, up to 40
+    num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}')
+    nj=$(($num_spk < 40 ? 
$num_spk:40))
+    hyp_utils/feats/make_evad.sh --write-utt2num-frames true \
+      --vad-config $vad_config --nj $nj --cmd "$train_cmd" \
+      data/${name} exp/make_vad/$name $vaddir
+    utils/fix_data_dir.sh data/${name}
+  done
+fi
+
diff --git a/egs/lre22/fixed.v1.8k/run_003_prepare_noises_rirs.sh b/egs/lre22/fixed.v1.8k/run_003_prepare_noises_rirs.sh
new file mode 100755
index 00000000..638143f0
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/run_003_prepare_noises_rirs.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Copyright
+# 2020 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+config_file=default_config.sh
+. parse_options.sh || exit 1;
+. $config_file
+. datapath.sh
+
+# We prepare the noise files and RIRs for online speech augmentation
+if [ $stage -le 1 ]; then
+
+  # Prepare the MUSAN corpus, which consists of music, speech, and noise
+  # suitable for augmentation.
+  local/make_musan.sh $musan_root 8 data
+
+  for name in musan_noise musan_music
+  do
+    steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \
+      --storage_name lre22-fixed-v1.8k-$(date +'%m_%d_%H_%M') \
+      data/${name} data/${name}_proc_audio exp/${name}_proc_audio
+    utils/fix_data_dir.sh data/${name}_proc_audio
+  done
+
+fi
+
+if [ $stage -le 2 ]; then
+
+  # Create babble noise from MUSAN speech files
+  for name in musan_speech
+  do
+    steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \
+      --storage_name lre22-fixed-v1.8k-$(date +'%m_%d_%H_%M') \
+      data/${name} data/${name}_babble exp/${name}_babble
+    # utils/fix_data_dir.sh data/${name}_babble
+  done
+fi
+
+if [ $stage -le 3 ]; then
+  if [ ! -d "RIRS_NOISES" ]; then
+    if [ -d ../v1.16k/RIRS_NOISES ];then
+      ln -s ../v1.16k/RIRS_NOISES
+    else
+      # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
+      wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
+      unzip rirs_noises.zip
+    fi
+  fi
+  local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 8 data/rirs_smallroom
+  local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 8 data/rirs_mediumroom
+  local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 8 data/rirs_real
+  for rirs in rirs_smallroom rirs_mediumroom rirs_real
+  do
+    # Pack all RIRs in h5 files
+    steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs
+  done
+
+fi
+
+
diff --git a/egs/lre22/fixed.v1.8k/run_004_apply_codecs.sh b/egs/lre22/fixed.v1.8k/run_004_apply_codecs.sh
new file mode 100755
index 00000000..afd6a8ed
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/run_004_apply_codecs.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright
+# 2018 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+config_file=default_config.sh
+. parse_options.sh || exit 1;
+. $config_file
+
+if [ $stage -le 1 ];then
+
+  # Apply telephone codecs to the VoxLingua107 and LRE17 AFV audios
+  for data in voxlingua107 lre17_dev_afv lre17_eval_afv
+  do
+    hyp_utils/conda_env.sh \
+      local/apply_tel_codecs_to_kaldi_datadir.py \
+      --input-dir data/$data \
+      --output-dir data/${data}_codecs
+  done
+
+fi
diff --git a/egs/lre22/fixed.v1.8k/run_010_prepare_xvec_train_data.sh b/egs/lre22/fixed.v1.8k/run_010_prepare_xvec_train_data.sh
new file mode 100755
index 00000000..fbff4a02
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/run_010_prepare_xvec_train_data.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+# Copyright
+# 2020 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. 
./path.sh
+set -e
+
+stage=1
+config_file=default_config.sh
+
+. parse_options.sh || exit 1;
+. $config_file
+
+if [ $stage -le 1 ]; then
+  # Preprocess the audio for x-vector training
+  for name in voxlingua107_codecs \
+                lre17_train \
+                lre17_{dev,eval}_{cts,afv,afv_codecs}
+  do
+    steps_xvec/preprocess_audios_for_nnet_train.sh \
+      --nj 40 --cmd "$train_cmd" \
+      --storage_name lre22-fixed-v1.8k-$(date +'%m_%d_%H_%M') --use-bin-vad true \
+      data/${name} data/${name}_proc_audio_no_sil exp/${name}_proc_audio_no_sil
+    utils/fix_data_dir.sh data/${name}_proc_audio_no_sil
+  done
+fi
+
+if [ $stage -le 2 ];then
+  utils/combine_data.sh \
+    data/lre17_proc_audio_no_sil \
+    data/lre17_train_proc_audio_no_sil \
+    data/lre17_{dev,eval}_{cts,afv,afv_codecs}_proc_audio_no_sil
+fi
+
+if [ $stage -le 3 ]; then
+  # Remove audio files shorter than 3 seconds
+  hyp_utils/remove_short_audios.sh --min-len 3 data/voxlingua107_codecs_proc_audio_no_sil
+  hyp_utils/remove_short_audios.sh --min-len 3 data/lre17_proc_audio_no_sil
+fi
+
+if [ $stage -le 4 ];then
+  # Merge VoxLingua107 and LRE17
+  utils/combine_data.sh \
+    data/voxlingua107_lre17_proc_audio_no_sil \
+    data/voxlingua107_codecs_proc_audio_no_sil \
+    data/lre17_proc_audio_no_sil
+fi
+
+if [ $stage -le 5 ]; then
+  for name in lre17_proc_audio_no_sil voxlingua107_lre17_proc_audio_no_sil
+  do
+    hyp_utils/conda_env.sh \
+      local/split_segments_train_val.py \
+      --segments-file data/$name/utt2lang \
+      --recordings-file data/$name/wav.scp \
+      --durations-file data/$name/utt2dur \
+      --val-percent 2. \
+      --output-dir data/$name/train_val_split
+  done
+fi
+
+if [ $stage -le 6 ]; then
+  for name in voxlingua107_lre17_proc_audio_no_sil
+  do
+    hyp_utils/conda_env.sh \
+      local/split_segments_train_val.py \
+      --segments-file data/$name/utt2lang \
+      --recordings-file data/$name/wav.scp \
+      --durations-file data/$name/utt2dur \
+      --remove-langs en-en es-es ar-ar pt-pt \
+      --val-percent 2. \
+      --ara-ary-seg-file resources/lre17_ara-ary/segs_ara-ary.csv \
+      --output-dir data/$name/train_val_split_noary
+  done
+  mkdir data/voxlingua107_lre17_noary_proc_audio_no_sil
+  cd data/voxlingua107_lre17_noary_proc_audio_no_sil
+  ln -s ../voxlingua107_lre17_proc_audio_no_sil/wav.scp
+  ln -s ../voxlingua107_lre17_proc_audio_no_sil/train_val_split_noary train_val_split
+  cd -
+
+fi
+
+if [ $stage -le 7 ]; then
+  # Set the sampling weight (3rd column of the class file) to 1 for the
+  # adaptation languages and to 0.01 for the rest
+  awk 'BEGIN{
+adapt_langs_list="ara-acm ara-aeb ara-apc ara-arq ara-arz ara-ayl eng-gbr eng-usg por-brz zho-cmn zho-nan am-am sn-sn fra-mix haw-haw ia-ia ceb-ceb tl-tl sa-sa su-su te-te yo-yo sw-sw war-war km-km tr-tr gn-gn ha-ha ln-ln mg-mg";
+nf=split(adapt_langs_list, f, " ");
+for(i=1;i<=nf;i++){ adapt_langs[f[i]]=1;};
+FS=","; OFS=",";
+getline; print $0;
+}
+{if ($1 in adapt_langs) { $3="1."} else{ $3="0.01"}; print $0}' \
+    data/voxlingua107_lre17_noary_proc_audio_no_sil/train_val_split/class_file.csv > \
+    data/voxlingua107_lre17_noary_proc_audio_no_sil/train_val_split/class_file_adapt_1.csv
+fi
diff --git a/egs/lre22/fixed.v1.8k/run_011_train_xvector.sh b/egs/lre22/fixed.v1.8k/run_011_train_xvector.sh
new file mode 100755
index 00000000..c67c8741
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/run_011_train_xvector.sh
@@ -0,0 +1,164 @@
+#!/bin/bash
+# Copyright
+# 2019 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+ngpu=4
+config_file=default_config.sh
+interactive=false
+num_workers=""
+use_tb=false
+use_wandb=false
+
+. parse_options.sh || exit 1;
+. $config_file
+. 
datapath.sh
+
+list_dir=data/${nnet_data}_proc_audio_no_sil
+
+# Add extra args from the command-line options
+if [ -n "$num_workers" ];then
+  extra_args="--data.train.data_loader.num-workers $num_workers"
+fi
+if [ "$use_tb" == "true" ];then
+  extra_args="$extra_args --trainer.use-tensorboard"
+fi
+if [ "$use_wandb" == "true" ];then
+  extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project lre22-fixed-v1.8k --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)"
+fi
+
+if [ "$interactive" == "true" ];then
+  export cuda_cmd=run.pl
+fi
+
+# Network Training
+if [ $stage -le 1 ]; then
+  mkdir -p $nnet_s1_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s1_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    train_xvector_from_wav.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \
+    --data.train.dataset.recordings-file $list_dir/wav.scp \
+    --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \
+    --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \
+    --data.val.dataset.recordings-file $list_dir/wav.scp \
+    --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \
+    --trainer.exp-path $nnet_s1_dir \
+    --num-gpus $ngpu
+
+fi
+
+# Class-balanced fine-tuning
+if [ $stage -le 2 ]; then
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)"
+  fi
+  mkdir -p $nnet_s2_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s2_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \
+    --data.train.dataset.recordings-file $list_dir/wav.scp \
+    --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \
+    --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \
+    --data.val.dataset.recordings-file $list_dir/wav.scp \
+    --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \
+    --in-model-file $nnet_s1 \
+    --trainer.exp-path $nnet_s2_dir \
+    --num-gpus $ngpu
+
+fi
+# The script stops here; remove this exit to run the remaining fine-tuning stages
+exit
+
+# Class-balanced + hard prototype mining fine-tuning
+if [ $stage -le 3 ]; then
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)"
+  fi
+  mkdir -p $nnet_s3_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s3_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \
+    --data.train.dataset.recordings-file $list_dir/wav.scp \
+    --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \
+    --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \
+    --data.val.dataset.recordings-file $list_dir/wav.scp \
+    --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \
+    --in-model-file $nnet_s2 \
+    --trainer.exp-path $nnet_s3_dir \
+    --num-gpus $ngpu
+
+fi
+
+exit
+
+# Fine-tuning
+if [ $stage -le 4 ]; then
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s4_name.$(date -Iminutes)"
+  fi
+  mkdir -p $nnet_s4_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s4_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s4_base_cfg $nnet_s4_args $extra_args \
+    --data.train.dataset.recordings-file $list_dir/wav.scp \
+    --data.train.dataset.segments-file 
$list_dir/train_val_split/train_segments.csv \
+    --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \
+    --data.val.dataset.recordings-file $list_dir/wav.scp \
+    --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \
+    --in-model-file $nnet_s3 \
+    --trainer.exp-path $nnet_s4_dir \
+    --num-gpus $ngpu
+
+fi
+
+
+# Fine-tuning
+if [ $stage -le 5 ]; then
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s5_name.$(date -Iminutes)"
+  fi
+  mkdir -p $nnet_s5_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s5_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s5_base_cfg $nnet_s5_args $extra_args \
+    --data.train.dataset.recordings-file $list_dir/wav.scp \
+    --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \
+    --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \
+    --data.val.dataset.recordings-file $list_dir/wav.scp \
+    --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \
+    --in-model-file $nnet_s4 \
+    --trainer.exp-path $nnet_s5_dir \
+    --num-gpus $ngpu
+
+fi
+
+# Fine-tuning with adaptation-language weights
+if [ $stage -le 6 ]; then
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s6_name.$(date -Iminutes)"
+  fi
+  mkdir -p $nnet_s6_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s6_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s6_base_cfg $nnet_s6_args $extra_args \
+    --data.train.dataset.recordings-file $list_dir/wav.scp \
+    --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \
+    --data.train.dataset.class-files $list_dir/train_val_split/class_file_adapt_1.csv \
+    --data.val.dataset.recordings-file $list_dir/wav.scp \
+    --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \
+    --in-model-file $nnet_s5 \
+    --trainer.exp-path $nnet_s6_dir \
+    --num-gpus $ngpu
+
+fi
+
diff --git a/egs/lre22/fixed.v1.8k/run_030_extract_xvectors.sh b/egs/lre22/fixed.v1.8k/run_030_extract_xvectors.sh
new file mode 100755
index 00000000..dc760d5b
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/run_030_extract_xvectors.sh
@@ -0,0 +1,215 @@
+#!/bin/bash
+# Copyright
+# 2020 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=2
+nnet_stage=2
+config_file=default_config.sh
+use_gpu=false
+do_tsne=true
+split_dev=false
+xvec_chunk_length=12800
+. parse_options.sh || exit 1;
+. 
$config_file
+
+if [ "$use_gpu" == "true" ];then
+  xvec_args="--use-gpu true --chunk-length $xvec_chunk_length"
+  xvec_cmd="$cuda_eval_cmd --mem 4G"
+else
+  xvec_cmd="$train_cmd --mem 12G"
+fi
+
+if [ $nnet_stages -lt $nnet_stage ];then
+  nnet_stage=$nnet_stages
+fi
+
+if [ $nnet_stage -eq 1 ];then
+  nnet=$nnet_s1
+  nnet_name=$nnet_s1_name
+elif [ $nnet_stage -eq 2 ];then
+  nnet=$nnet_s2
+  nnet_name=$nnet_s2_name
+elif [ $nnet_stage -eq 3 ];then
+  nnet=$nnet_s3
+  nnet_name=$nnet_s3_name
+elif [ $nnet_stage -eq 4 ];then
+  nnet=$nnet_s4
+  nnet_name=$nnet_s4_name
+elif [ $nnet_stage -eq 5 ];then
+  nnet=$nnet_s5
+  nnet_name=$nnet_s5_name
+elif [ $nnet_stage -eq 6 ];then
+  nnet=$nnet_s6
+  nnet_name=$nnet_s6_name
+fi
+
+xvector_dir=exp/xvectors/$nnet_name
+
+# if [ $stage -le 1 ]; then
+#   # Extract xvectors for training
+#   for name in lre17_proc_audio_no_sil voxlingua107_codecs_proc_audio_no_sil
+#   do
+#     steps_xvec/extract_xvectors_from_wav.sh \
+#       --cmd "$xvec_cmd" --nj 100 ${xvec_args} \
+#       --use-bin-vad false \
+#       --random-utt-length true --min-utt-length 300 --max-utt-length 3000 \
+#       --feat-config $feat_config \
+#       $nnet data/${name} \
+#       $xvector_dir/${name}
+#   done
+# fi
+
+if [ $stage -le 2 ]; then
+  # Extract x-vectors for augmented versions of the LRE22 dev data
+  for name in lre22_dev
+  do
+    steps_xvec/extract_xvectors_from_wav.sh \
+      --cmd "$xvec_cmd" --nj 100 ${xvec_args} \
+      --use-bin-vad true --num-augs 10 --aug-config conf/reverb_noise_aug.yaml \
+      --random-utt-length true --min-utt-length 300 --max-utt-length 3000 \
+      --feat-config $feat_config \
+      $nnet data/${name} \
+      $xvector_dir/${name}_aug \
+      data/${name}_aug
+  done
+fi
+
+
+if [ $stage -le 3 ]; then
+  # Extract x-vectors for dev and eval
+  for name in lre22_dev lre22_eval
+  do
+    num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}')
+    nj=$(($num_spk < 100 ? $num_spk:100))
+    steps_xvec/extract_xvectors_from_wav.sh \
+      --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \
+      --feat-config $feat_config \
+      $nnet data/$name \
+      $xvector_dir/$name
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  for name in lre22_dev
+  do
+    if [ "$do_tsne" == "true" ] || [ "$split_dev" == "true" ];then
+      $train_cmd \
+        $xvector_dir/$name/tsne/tsne.log \
+        hyp_utils/conda_env.sh \
+        plot_embedding_tsne.py \
+        --train-list data/$name/utt2lang \
+        --train-v-file scp:$xvector_dir/$name/xvector.scp \
+        --output-dir $xvector_dir/$name/tsne \
+        --pca-var-r 0.975 \
+        --lnorm \
+        --prob-plot 1. \
+        --tsne.metric cosine \
+        --tsne.early-exaggeration 12 --tsne.perplexity 30
+
+      $train_cmd \
+        $xvector_dir/$name/tsne_per_class/tsne.log \
+        hyp_utils/conda_env.sh \
+        plot_embedding_tsne_per_class.py \
+        --train-list data/$name/utt2lang \
+        --train-v-file scp:$xvector_dir/$name/xvector.scp \
+        --output-dir $xvector_dir/$name/tsne_per_class \
+        --pca-var-r 0.975 \
+        --lnorm \
+        --prob-plot 1. \
+        --tsne.metric cosine \
+        --tsne.early-exaggeration 12 --tsne.perplexity 30 \
+        --do-ahc --cluster-tsne --ahc-thr -5
+
+      if [ "$split_dev" == "true" ];then
+        hyp_utils/conda_env.sh \
+          local/split_dev.py \
+          --segs-file $xvector_dir/$name/tsne_per_class/segments.csv \
+          --output-dir ./resources/dev_splits \
+          --num-folds 2
+
+        # delete the split data dirs so they are regenerated later
+        rm -rf data/lre22_dev_p{1,2}
+
+      fi
+    fi
+  done
+fi
+
+if [ $stage -le 5 ]; then
+  if [ ! 
-d data/lre22_dev_p1 ];then + awk -F "," '$1!="id" { print $1}' \ + ./resources/dev_splits/fold_0/train_segments.csv \ + > p1.lst + awk -F "," '$1!="id" { print $1}' \ + ./resources/dev_splits/fold_0/test_segments.csv \ + > p2.lst + + for p in p1 p2 + do + utils/subset_data_dir.sh \ + --utt-list $p.lst \ + data/lre22_dev data/lre22_dev_$p + done + fi +fi + +if [ $stage -le 6 ]; then + if [ -d data/lre22_dev_aug ] && [ ! -d data/lre22_dev_aug_p1 ];then + awk -v fsegs=./resources/dev_splits/fold_0/train_segments.csv ' +BEGIN{FS=","; +getline; +while(getline < fsegs) +{ + segs[$1] +} +FS=" "; +} +{ if($2 in segs){ print $1}}' data/lre22_dev_aug/augm2clean \ + > p1.lst + + awk -v fsegs=./resources/dev_splits/fold_0/test_segments.csv ' +BEGIN{FS=","; +getline; +while(getline < fsegs) +{ + segs[$1]=1; +} +FS=" "; +} +{ if($2 in segs){ print $1}}' data/lre22_dev_aug/augm2clean \ + > p2.lst + + for p in p1 p2 + do + utils/subset_data_dir.sh \ + --utt-list $p.lst \ + data/lre22_dev_aug data/lre22_dev_aug_$p + done + fi +fi + +if [ $stage -le 7 ];then + if [ -f $xvector_dir/lre22_dev_aug/xvector.scp ];then + mkdir -p $xvector_dir/lre22_dev_aug_clean + cat $xvector_dir/lre22_dev/xvector.scp \ + $xvector_dir/lre22_dev_aug/xvector.scp \ + > $xvector_dir/lre22_dev_aug_clean/xvector.scp + + for p in "" _p1 _p2 + do + if [ ! -d data/lre22_dev_aug_clean$p ]; then + utils/combine_data.sh \ + data/lre22_dev_aug_clean$p \ + data/lre22_dev$p \ + data/lre22_dev_aug$p + fi + done + fi +fi + +exit diff --git a/egs/lre22/fixed.v1.8k/run_040_be_final.sh b/egs/lre22/fixed.v1.8k/run_040_be_final.sh new file mode 100755 index 00000000..fe5b6f18 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/run_040_be_final.sh @@ -0,0 +1,434 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh +. parse_options.sh || exit 1; +. 
$config_file + +if [ $nnet_stages -lt $nnet_stage ];then + nnet_stage=$nnet_stages +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +fi + +xvector_dir=exp/xvectors/$nnet_name +be_base_dir=exp/be/$nnet_name +score_base_dir=exp/scores/$nnet_name + +if [ $stage -le 1 ];then + for r in 1 #0.9999 0.999 #0.99 0.975 0.95 + do + be_name=pca${r}_cw_lnorm_lgbe_lre22_aug + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd \ + $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --pca.pca-var-r $r \ + --do-lnorm --whiten \ + --output-dir $be_dir_p + + $train_cmd \ + ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + + + ) & + + done + + ( + $train_cmd \ + $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --pca.pca-var-r $r \ + --do-lnorm --whiten \ + --output-dir $be_dir + + $train_cmd \ + ${score_dir}_p12/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd \ + ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + local/score_lre22.sh dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + ) & + + + done + wait + +fi + +exit +# Back-ends 
below over-fitted + +if [ $stage -le 2 ];then + for r in 1 + do + for penalty in l2 #l1 + do + for c in 1 #0.1 1 + do + for ary_thr in 0.975 #0.85 0.7 #0.99 0.95 0.9 #15 ##1 5 10 20 + do + be_name=pca${r}_cw_lnorm_lsvm_${penalty}_c${c}_sqhinge_lre22_aug_lre17_aryt${ary_thr} + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd \ + $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v3.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.penalty $penalty --svm.c $c --svm.dual false \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir_p + + $train_cmd \ + ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + ) & + done + ( + $train_cmd \ + $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v3.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.penalty $penalty --svm.c $c --svm.dual false \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir + + $train_cmd \ + ${score_dir}/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd \ + ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + local/score_lre22.sh \ + dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh \ + dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + ) & + done + done + done + done + wait + +fi + +if [ 
$stage -le 3 ];then + for r in 1 # 0.9999 0.99 0.975 0.95 0.9 0.8 + do + for shrinking in true #false + do + for c in 1 10 #0.1 1 10 #0.01 0.1 1 10 # 0.0001 + do + for vl in false #true #false + do + if [ "$vl" == "true" ];then + do_vl="--do-vl" + else + do_vl="--no_do-vl" + fi + ary_thr=0.975 + be_name=pca${r}_cw_lnorm_gsvm_shrinking_${shrinking}_c${c}_lre17_aryt${ary_thr}_vl${vl}_aug_clean + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + #score_dir=$score_base_dir/${be_name}_logpost + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --voxlingua-v-file scp:$xvector_dir/voxlingua107_codecs_proc_audio_no_sil/xvector.scp \ + --voxlingua-list data/voxlingua107_codecs_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.shrinking $shrinking --svm.c $c --svm.break_ties false --svm.max-iter 500\ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir_p \ + --do-lre17 $do_vl + + $train_cmd ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --svm.eval-type cat-log-post \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + ) & + done + ( + $train_cmd $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --voxlingua-v-file scp:$xvector_dir/voxlingua107_codecs_proc_audio_no_sil/xvector.scp \ + --voxlingua-list data/voxlingua107_codecs_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.shrinking $shrinking --svm.c $c --svm.break_ties false --svm.max-iter 500 \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir \ + --do-lre17 $do_vl + + $train_cmd ${score_dir}/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --svm.eval-type cat-log-post \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --svm.eval-type cat-log-post \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + 
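+  # Score the full-dev back-end without calibration, then apply the
+  # calibration trained on the p1/p2 splits and re-score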
local/score_lre22.sh \ + dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh \ + dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + + ) & + done + done + done + done + wait + +fi diff --git a/egs/lre22/fixed.v1.8k/run_050_fusion_v1.sh b/egs/lre22/fixed.v1.8k/run_050_fusion_v1.sh new file mode 100755 index 00000000..ffe3d6c6 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/run_050_fusion_v1.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +score_dir_0=exp/scores +nnet_1=fbank64_stmn_ecapatdnn2048x4_v1.0.s2 +nnet_2=fbank64_stmn_fwseres2net50s8_v1.0.s2 +be_1=pca1_cw_lnorm_lgbe_lre22_aug +score_dirs="$score_dir_0/$nnet_1/$be_1 +$score_dir_0/$nnet_2/$be_1" + +train_score_dirs=$(echo $score_dirs | awk '{ for(i=1;i<=NF;i++){ $i=$i"_p12/cal_v1" }; print $0}') +test_score_dirs=$(echo $score_dirs | awk '{ for(i=1;i<=NF;i++){ $i=$i"/cal_v1" }; print $0}') + +output_dir=exp/fusion/fus_v1.0 + +local/train_fusion_lre22.sh "$train_score_dirs" $output_dir/train +local/score_lre22.sh \ + dev \ + ${output_dir}/train/lre22_dev_scores.tsv \ + ${output_dir}/train/lre22_dev_results + +local/eval_fusion_lre22.sh "$test_score_dirs" $output_dir/train/fus.mat $output_dir/test + +local/score_lre22.sh \ + dev \ + ${output_dir}/test/lre22_dev_scores.tsv \ + ${output_dir}/test/lre22_dev_results + +local/score_lre22.sh eval \ + ${output_dir}/test/lre22_eval_scores.tsv \ + ${output_dir}/test/lre22_eval_results + + + + + + diff --git a/egs/voxceleb/vae.v1/steps b/egs/lre22/fixed.v1.8k/steps similarity index 100% rename from egs/voxceleb/vae.v1/steps rename to egs/lre22/fixed.v1.8k/steps diff --git a/egs/lre22/fixed.v1.8k/steps_be/eval_be_v1.py b/egs/lre22/fixed.v1.8k/steps_be/eval_be_v1.py new file mode 100755 index 00000000..85fee18c --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/eval_be_v1.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.helpers import VectorClassReader as VCR +from hyperion.np.transforms import TransformList +from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("test acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + 
logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels) + + +def train_be( + v_file, + trial_list, + class_name, + has_labels, + gbe, + model_dir, + score_file, + verbose, +): + config_logger(verbose) + model_dir = Path(model_dir) + output_dir = Path(score_file).parent + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + segs = SegmentSet.load(trial_list) + reader = DRF.create(v_file) + x = reader.read(segs["id"], squeeze=True) + del reader + logging.info("loaded %d samples", x.shape[0]) + + trans_file = model_dir / "transforms.h5" + if trans_file.is_file(): + logging.info("loading transform file %s", trans_file) + trans = TransformList.load(trans_file) + logging.info("applies transform") + x = trans(x) + + gbe_file = model_dir / "model_gbe.h5" + logging.info("loading GBE file %s", gbe_file) + gbe_model = GBE.load(gbe_file) + logging.info("GBE args=%s", str(gbe)) + logging.info("evals GBE") + scores = gbe_model(x, **gbe) + + if has_labels: + class_ids = segs[class_name] + y_true = np.asarray([gbe_model.labels.index(l) for l in class_ids]) + # labels, y_true = np.unique(class_ids, return_inverse=True) + y_pred = np.argmax(scores, axis=-1) + compute_metrics(y_true, y_pred, gbe_model.labels) + + logging.info("Saving scores to %s", score_file) + score_table = {"segmentid": segs["id"]} + for i, key in enumerate(gbe_model.labels): + score_table[key] = scores[:, i] + + score_table = pd.DataFrame(score_table) + score_table.to_csv(score_file, sep="\t", index=False) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Evals linear GBE", + ) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--trial-list", required=True) + GBE.add_eval_args(parser, prefix="gbe") + parser.add_argument("--class-name", default="class_id") + parser.add_argument("--has-labels", default=False, action=ActionYesNo) + parser.add_argument("--model-dir", required=True) + parser.add_argument("--score-file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/eval_be_v2.py b/egs/lre22/fixed.v1.8k/steps_be/eval_be_v2.py new file mode 100755 index 00000000..78b50935 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/eval_be_v2.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.helpers import VectorClassReader as VCR +from hyperion.np.transforms import TransformList +from hyperion.np.classifiers import LinearSVMC as SVM +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("test acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, 
normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels, fmt=".2f") + + +def train_be( + v_file, + trial_list, + class_name, + has_labels, + svm, + model_dir, + score_file, + verbose, +): + config_logger(verbose) + model_dir = Path(model_dir) + output_dir = Path(score_file).parent + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + segs = SegmentSet.load(trial_list) + reader = DRF.create(v_file) + x = reader.read(segs["id"], squeeze=True) + del reader + logging.info("loaded %d samples", x.shape[0]) + + trans_file = model_dir / "transforms.h5" + if trans_file.is_file(): + logging.info("loading transform file %s", trans_file) + trans = TransformList.load(trans_file) + logging.info("applies transform") + x = trans(x) + + svm_file = model_dir / "model_svm.h5" + logging.info("loading SVM file %s", svm_file) + svm_model = SVM.load(svm_file) + logging.info("SVM args=%s", str(svm)) + logging.info("evals SVM") + scores = svm_model(x, **svm) + + if has_labels: + class_ids = segs[class_name] + y_true = np.asarray([svm_model.labels.index(l) for l in class_ids]) + # labels, y_true = np.unique(class_ids, return_inverse=True) + y_pred = np.argmax(scores, axis=-1) + compute_metrics(y_true, y_pred, svm_model.labels) + + logging.info("Saving scores to %s", score_file) + score_table = {"segmentid": segs["id"]} + for i, key in enumerate(svm_model.labels): + score_table[key] = scores[:, i] + + score_table = pd.DataFrame(score_table) + score_table.to_csv(score_file, sep="\t", index=False) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Evals linear SVM", + ) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--trial-list", required=True) + SVM.add_eval_args(parser, prefix="svm") + parser.add_argument("--class-name", default="class_id") + parser.add_argument("--has-labels", default=False, action=ActionYesNo) + parser.add_argument("--model-dir", required=True) + parser.add_argument("--score-file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/eval_be_v5.py b/egs/lre22/fixed.v1.8k/steps_be/eval_be_v5.py new file mode 100755 index 00000000..ad11a667 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/eval_be_v5.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.helpers import VectorClassReader as VCR +from hyperion.np.transforms import TransformList +from hyperion.np.classifiers import GaussianSVMC as SVM +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("test acc: %.2f %%", acc * 100) + logging.info("non-normalized 
confusion matrix:")
+ label_idxs = list(range(len(labels)))
+ C = compute_confusion_matrix(y_true, y_pred, label_idxs, normalize=False)
+ print_confusion_matrix(C, labels)
+ logging.info("normalized confusion matrix:")
+ C = compute_confusion_matrix(y_true, y_pred, label_idxs, normalize=True)
+ print_confusion_matrix(C * 100, labels, fmt=".2f")
+
+
+def train_be(
+ v_file,
+ trial_list,
+ class_name,
+ has_labels,
+ svm,
+ model_dir,
+ score_file,
+ verbose,
+):
+ config_logger(verbose)
+ model_dir = Path(model_dir)
+ output_dir = Path(score_file).parent
+ output_dir.mkdir(parents=True, exist_ok=True)
+ logging.info("loading data")
+ segs = SegmentSet.load(trial_list)
+ reader = DRF.create(v_file)
+ x = reader.read(segs["id"], squeeze=True)
+ del reader
+ logging.info("loaded %d samples", x.shape[0])
+
+ trans_file = model_dir / "transforms.h5"
+ if trans_file.is_file():
+ logging.info("loading transform file %s", trans_file)
+ trans = TransformList.load(trans_file)
+ logging.info("applies transform")
+ x = trans(x)
+
+ svm_file = model_dir / "model_svm.h5"
+ logging.info("loading SVM file %s", svm_file)
+ svm_model = SVM.load(svm_file)
+ if not isinstance(svm_model, SVM):
+ raise TypeError(f"model loaded from {svm_file} is not a GaussianSVMC")
+ # the auxiliary non-target class ("zzzzzz") is already removed from the
+ # model labels before saving in train_be_v5.py
+
+ logging.info("SVM args=%s", str(svm))
+ logging.info("evals SVM")
+ scores = svm_model(x, **svm)
+
+ if has_labels:
+ class_ids = segs[class_name]
+ # keep only trials whose language the model knows, so that y_true and
+ # y_pred stay aligned
+ known = np.asarray([l in svm_model.labels for l in class_ids])
+ y_true = np.asarray([svm_model.labels.index(l) for l in np.asarray(class_ids)[known]])
+ y_pred = np.argmax(scores[known], axis=-1)
+ compute_metrics(y_true, y_pred, svm_model.labels)
+
+ logging.info("Saving scores to %s", score_file)
+ score_table = {"segmentid": segs["id"]}
+ for i, key in enumerate(svm_model.labels):
+ score_table[key] = scores[:, i]
+
+ score_table = pd.DataFrame(score_table)
+ score_table.to_csv(score_file, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+
+ parser = ArgumentParser(
+ description="Evals gaussian SVM",
+ )
+
+ parser.add_argument("--v-file", required=True)
+ parser.add_argument("--trial-list", required=True)
+ SVM.add_eval_args(parser, prefix="svm")
+ parser.add_argument("--class-name", default="class_id")
+ parser.add_argument("--has-labels", default=False, action=ActionYesNo)
+ parser.add_argument("--model-dir", required=True)
+ parser.add_argument("--score-file", required=True)
+ parser.add_argument(
+ "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+ )
+
+ args = parser.parse_args()
+ train_be(**namespace_to_dict(args)) 
diff --git a/egs/lre22/fixed.v1.8k/steps_be/eval_fusion.m b/egs/lre22/fixed.v1.8k/steps_be/eval_fusion.m 
new file mode 100644 
index 00000000..830ee6c8 
--- /dev/null 
+++ b/egs/lre22/fixed.v1.8k/steps_be/eval_fusion.m 
@@ -0,0 +1,17 @@ 
+function eval_fusion(in_files, out_file, model_file)
+
+ load(model_file, 'alpha', 'beta', 'labels');
+ n_files = length(in_files);
+ scores={};
+ for i=1:n_files
+ T_i = readtable(in_files{i}, 'FileType', 'delimitedtext', 'Delimiter','tab', 'ReadRowNames', true, 'VariableNamingRule', 'preserve');
+ T_i = sortrows(T_i, 'RowNames');
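+ % Input tables have one row per segment and one score column per language;
+ % transpose so each matrix is languages x segments, the orientation
+ % apply_nary_lin_fusion works on (scores are transposed back before writing).
+ 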
s_i = T_i.Variables'; + scores{i}=s_i; + end + scores = apply_nary_lin_fusion(scores, alpha, beta); + T_i.Variables = scores'; + %T_i.Properties.VariableNames = T_i.Properties.VariableDescriptions; + writetable(T_i, out_file, 'FileType', 'text', 'Delimiter','tab', 'WriteRowNames', true) + + \ No newline at end of file diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v1.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v1.py new file mode 100755 index 00000000..983d903d --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v1.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.helpers import VectorClassReader as VCR +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("training acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels) + + +def train_be( + v_file, + train_list, + class_name, + do_lnorm, + whiten, + pca, + gbe, + output_dir, + verbose, +): + config_logger(verbose) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + train_reader = DRF.create(v_file) + x_trn = train_reader.read(train_segs["id"], squeeze=True) + del train_reader + class_ids = train_segs[class_name] + labels, y_true = np.unique(class_ids, return_inverse=True) + logging.info("loaded %d samples", x_trn.shape[0]) + + logging.info("PCA args=%s", str(pca)) + pca_var_r = pca["pca_var_r"] + pca_dim = pca["pca_dim"] + if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None: + logging.info("training PCA") + pca = PCA(**pca) + pca.fit(x_trn) + logging.info("PCA dimension: %d", pca.pca_dim) + logging.info("apply PCA") + x_trn = pca(x_trn) + else: + pca = None + + if do_lnorm: + lnorm = LNorm() + if whiten: + logging.info("training whitening") + lnorm.fit(x_trn) + + logging.info("apply lnorm") + x_trn = lnorm(x_trn) + else: + lnorm = None + + logging.info("GBE args=%s", str(gbe)) + gbe = GBE(labels=labels, **gbe) + gbe.fit(x_trn, y_true) + logging.info("trained GBE") + scores = gbe.eval_linear(x_trn) + y_pred = np.argmax(scores, axis=-1) + + compute_metrics(y_true, y_pred, labels) + + logging.info("Saving transforms and Gaussian BE") + transforms = [] + if pca is not None: + transforms.append(pca) + if lnorm is not None: + transforms.append(lnorm) + + if transforms: + transforms = TransformList(transforms) + transforms.save(output_dir / "transforms.h5") + + gbe.save(output_dir / "model_gbe.h5") + + +if __name__ == "__main__": + + parser = ArgumentParser( + 
description="Train linear GBE", + ) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + PCA.add_class_args(parser, prefix="pca") + GBE.add_class_args(parser, prefix="gbe") + parser.add_argument("--class-name", default="class_id") + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v2.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v2.py new file mode 100755 index 00000000..599b55c4 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v2.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.helpers import VectorClassReader as VCR +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.classifiers import LinearSVMC as SVM +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("training acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels) + + +def train_be( + v_file, + train_list, + class_name, + do_lnorm, + whiten, + pca, + svm, + output_dir, + verbose, +): + config_logger(verbose) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + train_reader = DRF.create(v_file) + x_trn = train_reader.read(train_segs["id"], squeeze=True) + del train_reader + class_ids = train_segs[class_name] + labels, y_true = np.unique(class_ids, return_inverse=True) + logging.info("loaded %d samples", x_trn.shape[0]) + + logging.info("PCA args=%s", str(pca)) + pca_var_r = pca["pca_var_r"] + pca_dim = pca["pca_dim"] + if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None: + logging.info("training PCA") + pca = PCA(**pca) + pca.fit(x_trn) + logging.info("PCA dimension: %d", pca.pca_dim) + logging.info("apply PCA") + x_trn = pca(x_trn) + else: + pca = None + + if do_lnorm: + lnorm = LNorm() + if whiten: + logging.info("training whitening") + lnorm.fit(x_trn) + + logging.info("apply lnorm") + x_trn = lnorm(x_trn) + else: + lnorm = None + + logging.info("SVM args=%s", str(svm)) + model = SVM(labels=labels, **svm) + model.fit(x_trn, y_true) + logging.info("trained SVM") + scores = model(x_trn) + y_pred = np.argmax(scores, axis=-1) + + compute_metrics(y_true, y_pred, labels) + + logging.info("Saving transforms and SVM") + transforms = 
[] + if pca is not None: + transforms.append(pca) + if lnorm is not None: + transforms.append(lnorm) + + if transforms: + transforms = TransformList(transforms) + transforms.save(output_dir / "transforms.h5") + + model.save(output_dir / "model_svm.h5") + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Train linear SVM Classifier", + ) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + PCA.add_class_args(parser, prefix="pca") + SVM.add_class_args(parser, prefix="svm") + parser.add_argument("--class-name", default="class_id") + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v3.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v3.py new file mode 100755 index 00000000..87009212 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v3.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.utils.math import softmax +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.helpers import VectorClassReader as VCR +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.classifiers import LinearSVMC as SVM +from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("training acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels, fmt=".2f") + + +def train_be( + v_file, + train_list, + lre17_v_file, + lre17_list, + class_name, + do_lnorm, + whiten, + ary_thr, + num_nons, + pca, + svm, + output_dir, + verbose, +): + config_logger(verbose) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + v_reader = DRF.create(v_file) + x_trn = v_reader.read(train_segs["id"], squeeze=True) + del v_reader + logging.info("loaded %d train samples", x_trn.shape[0]) + + segs_lre17 = SegmentSet.load(lre17_list) + ary_idx = segs_lre17[class_name] == "ara-ary" + # lre17_segs.loc[ara_ary_idx, class_name] = "ara-ayl" # "ara-arq" # "ara-aeb" + segs_ary = segs_lre17.loc[ary_idx] + + logging.info("label maghrebi arabic samples") + v_reader = DRF.create(lre17_v_file) + x_ary = v_reader.read(segs_ary["id"], squeeze=True) + logging.info("loaded %d lre17 ara-ary samples", x_ary.shape[0]) + + ara_idx = 
train_segs[class_name].isin(["ara-ayl", "ara-arq", "ara-aeb"])
+ x_ara = x_trn[ara_idx]
+ class_ids_ara = train_segs.loc[ara_idx, class_name].values
+
+ gbe_ara = GBE()
+ labels_ara, y_true_ara = np.unique(class_ids_ara, return_inverse=True)
+ gbe_ara.fit(x_ara, y_true_ara)
+ scores_ary = gbe_ara(x_ary)
+ y_pred_ary = np.argmax(scores_ary, axis=-1)
+ # max softmax posterior per segment (despite the name, not a log-prob)
+ logp_ary = np.max(softmax(scores_ary, axis=-1), axis=-1)
+ logging.debug("ara-ary posteriors=%s preds=%s", logp_ary, y_pred_ary)
+ # dscores_ary = np.diff(np.sort(scores_ary, axis=-1), axis=-1)[:, -1]
+ # sel_ary = dscores_ary > ary_thr
+ sel_ary = logp_ary > ary_thr
+ segs_ary = segs_ary.loc[sel_ary]
+ y_pred_ary = y_pred_ary[sel_ary]
+ x_ary = x_ary[sel_ary]
+ segs_ary[class_name] = [labels_ara[c] for c in y_pred_ary]
+ logging.info("selected %d ara-ary segments", x_ary.shape[0])
+ segs_ary["logp"] = logp_ary[sel_ary]
+ SegmentSet(segs_ary).save(output_dir / "segs_ary.csv")
+
+ logging.info("selecting non-target segments")
+ segs_non = segs_lre17.loc[~ary_idx].copy()
+ segs_non[class_name] = "zzzzzz"
+ x_non = v_reader.read(segs_non["id"], squeeze=True)
+ logging.info("loaded %d lre17 non-tar samples", x_non.shape[0])
+
+ class_ids = train_segs[class_name].values
+ labels, y_true = np.unique(class_ids, return_inverse=True)
+ gbe = GBE()
+ gbe.fit(x_trn, y_true)
+ scores_non = np.max(gbe(x_non), axis=1)
+ sel_non = np.argsort(scores_non)[-num_nons:]
+ segs_non = segs_non.iloc[sel_non]
+ x_non = x_non[sel_non]
+ logging.info("selected %d non-tar segments", x_non.shape[0])
+
+ class_ids = (
+ list(train_segs[class_name].values)
+ + list(segs_ary[class_name].values)
+ + list(segs_non[class_name].values)
+ )
+ x_trn = np.concatenate((x_trn, x_ary, x_non), axis=0)
+ labels, y_true = np.unique(class_ids, return_inverse=True)
+ logging.info("%d training samples", x_trn.shape[0])
+
+ logging.info("PCA args=%s", str(pca))
+ pca_var_r = pca["pca_var_r"]
+ pca_dim = pca["pca_dim"]
+ if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None:
+ logging.info("training PCA")
+ pca = PCA(**pca)
+ pca.fit(x_trn)
+ logging.info("PCA dimension: %d", pca.pca_dim)
+ logging.info("apply PCA")
+ x_trn = pca(x_trn)
+ else:
+ pca = None
+
+ if do_lnorm:
+ lnorm = LNorm()
+ if whiten:
+ logging.info("training whitening")
+ lnorm.fit(x_trn)
+
+ logging.info("apply lnorm")
+ x_trn = lnorm(x_trn)
+ else:
+ lnorm = None
+
+ logging.info("SVM args=%s", str(svm))
+ model = SVM(labels=labels, **svm)
+ model.fit(x_trn, y_true)
+ logging.info("trained SVM")
+ scores = model(x_trn)
+ y_pred = np.argmax(scores, axis=-1)
+
+ compute_metrics(y_true, y_pred, labels)
+
+ logging.info("Saving transforms and SVM")
+ transforms = []
+ if pca is not None:
+ transforms.append(pca)
+ if lnorm is not None:
+ transforms.append(lnorm)
+
+ if transforms:
+ transforms = TransformList(transforms)
+ transforms.save(output_dir / "transforms.h5")
+
+ # drop the auxiliary non-target class ("zzzzzz", which np.unique sorts
+ # last) so the saved model only scores the target languages
+ model.svm.coef_ = model.svm.coef_[:-1]
+ model.svm.intercept_ = model.svm.intercept_[:-1]
+ model.labels = model.labels[:-1]
+ model.save(output_dir / "model_svm.h5")
+
+
+if __name__ == "__main__":
+
+ parser = ArgumentParser(
+ description="Train linear SVM Classifier",
+ )
+
+ parser.add_argument("--v-file", required=True)
+ parser.add_argument("--train-list", required=True)
+ parser.add_argument("--lre17-v-file", required=True)
+ parser.add_argument("--lre17-list", required=True)
+ PCA.add_class_args(parser, prefix="pca")
+ SVM.add_class_args(parser, prefix="svm")
+ parser.add_argument("--class-name", default="class_id")
+ parser.add_argument("--ary-thr", default=10, type=float) 
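+ # note: sel_ary above compares a softmax posterior in [0, 1] against
+ # --ary-thr, so the default of 10 selects nothing; the recipe overrides it
+ # (e.g., 0.975)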
+ parser.add_argument("--num-nons", default=10000, type=int) + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v4.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v4.py new file mode 100755 index 00000000..986393a8 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v4.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.utils.math import softmax +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.helpers import VectorClassReader as VCR +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("training acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels, fmt=".2f") + + +def train_be( + v_file, + train_list, + lre17_v_file, + lre17_list, + class_name, + do_lnorm, + whiten, + ary_thr, + # num_nons, + pca, + gbe, + output_dir, + verbose, +): + config_logger(verbose) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + v_reader = DRF.create(v_file) + x_trn = v_reader.read(train_segs["id"], squeeze=True) + del v_reader + logging.info("loaded %d train samples", x_trn.shape[0]) + + segs_lre17 = SegmentSet.load(lre17_list) + ary_idx = segs_lre17[class_name] == "ara-ary" + # lre17_segs.loc[ara_ary_idx, class_name] = "ara-ayl" # "ara-arq" # "ara-aeb" + segs_ary = segs_lre17.loc[ary_idx] + + logging.info("label maghrebi arabic samples") + v_reader = DRF.create(lre17_v_file) + x_ary = v_reader.read(segs_ary["id"], squeeze=True) + logging.info("loaded %d lre17 ara-ary samples", x_ary.shape[0]) + + ara_idx = train_segs[class_name].isin(["ara-ayl", "ara-arq", "ara-aeb"]) + x_ara = x_trn[ara_idx] + class_ids_ara = train_segs.loc[ara_idx, class_name].values + + gbe_ara = GBE() + labels_ara, y_true_ara = np.unique(class_ids_ara, return_inverse=True) + gbe_ara.fit(x_ara, y_true_ara) + scores_ary = gbe_ara(x_ary) + y_pred_ary = np.argmax(scores_ary, axis=-1) + p_ary = np.max(softmax(scores_ary, axis=-1), axis=-1) + sel_ary = p_ary > ary_thr + segs_ary = segs_ary.loc[sel_ary] + y_pred_ary = y_pred_ary[sel_ary] + x_ary = x_ary[sel_ary] + segs_ary[class_name] = [labels_ara[c] for c in y_pred_ary] + logging.info("selected %d ara-ary 
segments", x_ary.shape[0]) + segs_ary["p"] = p_ary[sel_ary] + SegmentSet(segs_ary).save(output_dir / "segs_ary.csv") + + # logging.info("selecting non-target segments") + # segs_non = segs_lre17.loc[~ary_idx].copy() + # segs_non[class_name] = "zzzzzz" + # x_non = v_reader.read(segs_non["id"], squeeze=True) + # logging.info("loaded %d lre17 non-tar samples", x_non.shape[0]) + + # class_ids = train_segs[class_name].values + # labels, y_true = np.unique(class_ids, return_inverse=True) + # gbe = GBE() + # gbe.fit(x_trn, y_true) + # scores_non = np.max(gbe(x_non), axis=1) + # sel_non = np.argsort(scores_non)[-num_nons:] + # segs_non = segs_non.iloc[sel_non] + # x_non = x_non[sel_non] + # logging.info("selected %d non-tar segments", x_non.shape[0]) + + # class_ids = ( + # list(train_segs[class_name].values) + # + list(segs_ary[class_name].values) + # + list(segs_non[class_name].values) + # ) + # x_trn = np.concatenate((x_trn, x_ary, x_non), axis=0) + class_ids = list(train_segs[class_name].values) + list(segs_ary[class_name].values) + x_trn = np.concatenate((x_trn, x_ary), axis=0) + labels, y_true = np.unique(class_ids, return_inverse=True) + logging.info("%d training samples", x_trn.shape[0]) + + logging.info("PCA args=%s", str(pca)) + pca_var_r = pca["pca_var_r"] + pca_dim = pca["pca_dim"] + if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None: + logging.info("training PCA") + pca = PCA(**pca) + pca.fit(x_trn) + logging.info("PCA dimension: %d", pca.pca_dim) + logging.info("apply PCA") + x_trn = pca(x_trn) + else: + pca = None + + if do_lnorm: + lnorm = LNorm() + if whiten: + logging.info("training whitening") + lnorm.fit(x_trn) + + logging.info("apply lnorm") + x_trn = lnorm(x_trn) + else: + lnorm = None + + logging.info("GBE args=%s", str(gbe)) + model = GBE(labels=labels, **gbe) + model.fit(x_trn, y_true) + logging.info("trained GBE") + scores = model(x_trn) + y_pred = np.argmax(scores, axis=-1) + + compute_metrics(y_true, y_pred, labels) + + logging.info("Saving transforms and GBE") + transforms = [] + if pca is not None: + transforms.append(pca) + if lnorm is not None: + transforms.append(lnorm) + + if transforms: + transforms = TransformList(transforms) + transforms.save(output_dir / "transforms.h5") + + model.save(output_dir / "model_gbe.h5") + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Train linear GBE Classifier", + ) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--lre17-v-file", required=True) + parser.add_argument("--lre17-list", required=True) + PCA.add_class_args(parser, prefix="pca") + GBE.add_class_args(parser, prefix="gbe") + parser.add_argument("--class-name", default="class_id") + parser.add_argument("--ary-thr", default=10, type=float) + # parser.add_argument("--num-nons", default=10000, type=int) + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v5.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v5.py new file mode 100755 index 00000000..32cfd6c9 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v5.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins 
University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import sys
+import os
+import logging
+from jsonargparse import (
+ ArgumentParser,
+ ActionConfigFile,
+ ActionParser,
+ namespace_to_dict,
+ ActionYesNo,
+)
+import time
+from pathlib import Path
+
+import numpy as np
+
+from hyperion.hyp_defs import config_logger
+from hyperion.utils import SegmentSet
+from hyperion.utils.math import softmax
+from hyperion.io import RandomAccessDataReaderFactory as DRF
+from hyperion.helpers import VectorClassReader as VCR
+from hyperion.np.transforms import TransformList, PCA, LNorm
+from hyperion.np.classifiers import LinearSVMC as SVM
+from hyperion.np.classifiers import GaussianSVMC as GSVM
+from hyperion.np.classifiers import LinearGBE as GBE
+from hyperion.np.metrics import (
+ compute_accuracy,
+ compute_confusion_matrix,
+ print_confusion_matrix,
+)
+
+
+def compute_metrics(y_true, y_pred, labels):
+
+ acc = compute_accuracy(y_true, y_pred)
+ logging.info("training acc: %.2f %%", acc * 100)
+ logging.info("non-normalized confusion matrix:")
+ C = compute_confusion_matrix(y_true, y_pred, normalize=False)
+ print_confusion_matrix(C, labels)
+ logging.info("normalized confusion matrix:")
+ C = compute_confusion_matrix(y_true, y_pred, normalize=True)
+ print_confusion_matrix(C * 100, labels, fmt=".2f")
+
+
+def train_be(
+ v_file,
+ train_list,
+ lre17_v_file,
+ lre17_list,
+ voxlingua_v_file,
+ voxlingua_list,
+ class_name,
+ do_lnorm,
+ whiten,
+ ary_thr,
+ num_nons,
+ pca,
+ svm,
+ output_dir,
+ verbose,
+ do_vl,
+ do_lre17,
+):
+ config_logger(verbose)
+ logging.debug("train_be args=%s", str(locals()))
+ output_dir = Path(output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+ logging.info("loading data")
+ train_segs = SegmentSet.load(train_list)
+ v_reader = DRF.create(v_file)
+ x_trn = v_reader.read(train_segs["id"], squeeze=True)
+ del v_reader
+ logging.info("loaded %d train samples", x_trn.shape[0])
+
+ # empty 2-d arrays so the np.concatenate below also works when
+ # --do-lre17 is false
+ x_ary = np.zeros((0, x_trn.shape[1]))
+ x_non = np.zeros((0, x_trn.shape[1]))
+ y_ary = []
+ y_non = []
+
+ if do_lre17:
+ segs_lre17 = SegmentSet.load(lre17_list)
+ ary_idx = segs_lre17[class_name] == "ara-ary"
+ # lre17_segs.loc[ara_ary_idx, class_name] = "ara-ayl" # "ara-arq" # "ara-aeb"
+ segs_ary = segs_lre17.loc[ary_idx]
+
+ logging.info("label maghrebi arabic samples")
+ v_reader = DRF.create(lre17_v_file)
+ x_ary = v_reader.read(segs_ary["id"], squeeze=True)
+ logging.info("loaded %d lre17 ara-ary samples", x_ary.shape[0])
+
+ ara_idx = train_segs[class_name].isin(["ara-ayl", "ara-arq", "ara-aeb"])
+ x_ara = x_trn[ara_idx]
+ class_ids_ara = train_segs.loc[ara_idx, class_name].values
+
+ gbe_ara = GBE()
+ labels_ara, y_true_ara = np.unique(class_ids_ara, return_inverse=True)
+ gbe_ara.fit(x_ara, y_true_ara)
+ scores_ary = gbe_ara(x_ary)
+ y_pred_ary = np.argmax(scores_ary, axis=-1)
+ # max softmax posterior per segment (despite the name, not a log-prob)
+ logp_ary = np.max(softmax(scores_ary, axis=-1), axis=-1)
+ logging.debug("ara-ary posteriors=%s preds=%s", logp_ary, y_pred_ary)
+ # dscores_ary = np.diff(np.sort(scores_ary, axis=-1), axis=-1)[:, -1]
+ # sel_ary = dscores_ary > ary_thr
+ sel_ary = logp_ary > ary_thr
+ segs_ary = segs_ary.loc[sel_ary]
+ y_pred_ary = y_pred_ary[sel_ary]
+ x_ary = x_ary[sel_ary]
+ segs_ary[class_name] = [labels_ara[c] for c in y_pred_ary]
+ logging.info("selected %d ara-ary segments", x_ary.shape[0])
+ segs_ary["logp"] = logp_ary[sel_ary]
+ SegmentSet(segs_ary).save(output_dir / "segs_ary.csv")
+
+ logging.info("selecting non-target segments")
+ lre17_close_idx = segs_lre17[class_name].isin(
+ ["ara-acm", "ara-apc", "eng-usg", "por-brz"]
+ )
+ segs_non = 
segs_lre17.loc[lre17_close_idx].copy()
+ segs_non[class_name] = "zzzzzz"
+ x_non = v_reader.read(segs_non["id"], squeeze=True)
+ logging.info("loaded %d lre17 non-tar samples", x_non.shape[0])
+
+ y_ary = list(segs_ary[class_name].values)
+ y_non = list(segs_non[class_name].values)
+
+ # class_ids = train_segs[class_name].values
+ # labels, y_true = np.unique(class_ids, return_inverse=True)
+ # gbe = GBE()
+ # gbe.fit(x_trn, y_true)
+ # scores_non = np.max(gbe(x_non), axis=1)
+ # sel_non = np.argsort(scores_non)[-num_nons:]
+ # segs_non = segs_non.iloc[sel_non]
+ # x_non = x_non[sel_non]
+ # logging.info("selected %d non-tar segments", x_non.shape[0])
+
+ if do_vl:
+ v_reader_vl = DRF.create(voxlingua_v_file)
+ segs_voxlingua = SegmentSet.load(voxlingua_list)
+ vl_close_idx = segs_voxlingua[class_name].isin(
+ [
+ "en-en",
+ "am-am",
+ "sn-sn",
+ "fra-mix",
+ "haw-haw",
+ "zho-cmn",
+ "ia-ia",
+ "ceb-ceb",
+ "sa-sa",
+ "su-su",
+ "te-te",
+ "yo-yo",
+ "sw-sw",
+ "pt-pt",
+ "war-war",
+ "km-km",
+ "tr-tr",
+ "gn-gn",
+ ]
+ )
+ segs_vl_close = segs_voxlingua.loc[vl_close_idx].copy()
+ segs_vl_close[class_name] = "zzzzzz"
+ x_non_vl = v_reader_vl.read(segs_vl_close["id"], squeeze=True)
+
+ vl_afk_idx = segs_voxlingua[class_name] == "afr-afr"
+ if not np.any(vl_afk_idx):
+ vl_afk_idx = segs_voxlingua[class_name] == "af-af"
+ segs_vl_afk = segs_voxlingua.loc[vl_afk_idx].copy()
+ segs_vl_afk[class_name] = "afr-afr"
+ x_trn_vl = v_reader_vl.read(segs_vl_afk["id"], squeeze=True)
+
+ y_trn_vl = list(segs_vl_afk[class_name].values)
+ y_non_vl = list(segs_vl_close[class_name].values)
+
+ del v_reader_vl
+ else:
+ x_trn_vl = np.zeros((0, x_trn.shape[1]))
+ x_non_vl = np.zeros((0, x_trn.shape[1]))
+ y_trn_vl = []
+ y_non_vl = []
+
+ class_ids = (
+ list(train_segs[class_name].values) + y_trn_vl + y_ary + y_non + y_non_vl
+ )
+ x_trn = np.concatenate((x_trn, x_trn_vl, x_ary, x_non, x_non_vl), axis=0)
+ labels, y_true = np.unique(class_ids, return_inverse=True)
+ logging.info("%d training samples", x_trn.shape[0])
+
+ logging.info("PCA args=%s", str(pca))
+ pca_var_r = pca["pca_var_r"]
+ pca_dim = pca["pca_dim"]
+ if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None:
+ logging.info("training PCA")
+ pca = PCA(**pca)
+ pca.fit(x_trn)
+ logging.info("PCA dimension: %d", pca.pca_dim)
+ logging.info("apply PCA")
+ x_trn = pca(x_trn)
+ else:
+ pca = None
+
+ if do_lnorm:
+ lnorm = LNorm()
+ if whiten:
+ logging.info("training whitening")
+ lnorm.fit(x_trn)
+
+ logging.info("apply lnorm")
+ x_trn = lnorm(x_trn)
+ else:
+ lnorm = None
+
+ logging.info("Gaussian SVM args=%s", str(svm))
+ model = GSVM(labels=labels, **svm)
+ model.fit(x_trn, y_true)
+ logging.info("trained SVM")
+ scores = model(x_trn)
+ y_pred = np.argmax(scores, axis=-1)
+
+ compute_metrics(y_true, y_pred, labels)
+
+ logging.info("Saving transforms and SVM")
+ transforms = []
+ if pca is not None:
+ transforms.append(pca)
+ if lnorm is not None:
+ transforms.append(lnorm)
+
+ if transforms:
+ transforms = TransformList(transforms)
+ transforms.save(output_dir / "transforms.h5")
+
+ # drop the auxiliary non-target class ("zzzzzz") from the label list so
+ # its scores are not written at eval time
+ model_labels = list(np.copy(model.labels))
+ if "zzzzzz" in model_labels:
+ model_labels.remove("zzzzzz")
+ model.labels = model_labels
+ logging.debug("saving model with %d labels", len(model.labels))
+ model.save(output_dir / "model_svm.h5")
+
+
+if __name__ == "__main__":
+ parser = ArgumentParser(
+ description="Train gaussian SVM Classifier",
+ )
+
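+ # --do-lre17/--do-vl toggle the LRE17 ara-ary relabeling and the
+ # VoxLingua107 target/non-target augmentation blocks in train_be() above
+ 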
parser.add_argument("--v-file", required=True)
+ parser.add_argument("--train-list", required=True)
+ parser.add_argument("--lre17-v-file", required=True)
+ parser.add_argument("--lre17-list", required=True)
+ parser.add_argument("--voxlingua-v-file", required=True)
+ parser.add_argument("--voxlingua-list", required=True)
+ PCA.add_class_args(parser, prefix="pca")
+ GSVM.add_class_args(parser, prefix="svm")
+ parser.add_argument("--class-name", default="class_id")
+ parser.add_argument("--ary-thr", default=10, type=float)
+ parser.add_argument("--num-nons", default=10000, type=int)
+ parser.add_argument("--do-lnorm", default=True, action=ActionYesNo)
+ parser.add_argument("--whiten", default=True, action=ActionYesNo)
+ parser.add_argument("--output-dir", required=True)
+ parser.add_argument("--do-vl", default=True, action=ActionYesNo)
+ parser.add_argument("--do-lre17", default=True, action=ActionYesNo)
+ parser.add_argument(
+ "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+ )
+ # parser.add_argument("--classifier", default="lsvm", choices=["lsvm", "gsvm", "rf"], required=False)
+
+ args = parser.parse_args()
+ train_be(**namespace_to_dict(args)) 
diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v6.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v6.py 
new file mode 100755 
index 00000000..d481a18d 
--- /dev/null 
+++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v6.py 
@@ -0,0 +1,196 @@ 
+#!/usr/bin/env python
+"""
+ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import sys
+import os
+import logging
+from jsonargparse import (
+ ArgumentParser,
+ ActionConfigFile,
+ ActionParser,
+ namespace_to_dict,
+ ActionYesNo,
+)
+import time
+from pathlib import Path
+
+import numpy as np
+
+from hyperion.hyp_defs import config_logger
+from hyperion.utils import SegmentSet
+from hyperion.utils.math import softmax
+from hyperion.io import RandomAccessDataReaderFactory as DRF
+from hyperion.np.transforms import TransformList, PCA, LNorm
+from hyperion.np.classifiers import LinearGBE as GBE
+from hyperion.np.metrics import (
+ compute_accuracy,
+ compute_confusion_matrix,
+ print_confusion_matrix,
+)
+
+tar_langs = (
+ "afr-afr",
+ "ara-aeb",
+ "ara-arq",
+ "ara-ayl",
+ "eng-ens",
+ "eng-iaf",
+ "fra-ntf",
+ "nbl-nbl",
+ "orm-orm",
+ "tir-tir",
+ "tso-tso",
+ "ven-ven",
+ "xho-xho",
+ "zul-zul",
+)
+
+
+def compute_metrics(y_true, y_pred, labels):
+
+ acc = compute_accuracy(y_true, y_pred)
+ logging.info("training acc: %.2f %%", acc * 100)
+ logging.info("non-normalized confusion matrix:")
+ C = compute_confusion_matrix(y_true, y_pred, normalize=False)
+ print_confusion_matrix(C, labels)
+ logging.info("normalized confusion matrix:")
+ C = compute_confusion_matrix(y_true, y_pred, normalize=True)
+ print_confusion_matrix(C * 100, labels, fmt=".2f")
+
+
+def train_be(
+ v_file,
+ train_list,
+ cv_v_file,
+ cv_list,
+ afr_v_file,
+ afr_list,
+ class_name,
+ do_lnorm,
+ whiten,
+ pca,
+ gbe,
+ output_dir,
+ verbose,
+):
+ config_logger(verbose)
+ output_dir = Path(output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+ logging.info("loading data")
+ train_segs = SegmentSet.load(train_list)
+ v_reader = DRF.create(v_file)
+ x_trn = v_reader.read(train_segs["id"], squeeze=True)
+ del v_reader
+ logging.info("loaded %d train samples", x_trn.shape[0])
+
+ segs_cv = 
SegmentSet.load(cv_list) + cv_idx = np.zeros((len(segs_cv),), dtype=bool) + for lang in tar_langs: + cv_idx_i = segs_cv[class_name] == lang + cv_idx = np.logical_or(cv_idx, cv_idx_i) + + segs_cv = segs_cv.loc[cv_idx] + # segs_cv.loc[segs_cv[class_name] == "eng-ine", class_name] = "eng-iaf" + + # v_reader = DRF.create(cv_v_file) + # x_cv = v_reader.read(segs_cv["id"], squeeze=True) + # logging.info("loaded %d cv samples", x_cv.shape[0]) + + segs_afr = SegmentSet.load(afr_list) + afr_idx = np.zeros((len(segs_afr),), dtype=bool) + for lang in tar_langs: + afr_idx_i = segs_afr[class_name] == lang + afr_idx = np.logical_or(afr_idx, afr_idx_i) + + segs_afr = segs_afr.loc[afr_idx] + + v_reader = DRF.create(afr_v_file) + x_afr = v_reader.read(segs_afr["id"], squeeze=True) + logging.info("loaded %d afr samples", x_afr.shape[0]) + + class_ids = ( + list(train_segs[class_name].values) + # + list(segs_cv[class_name].values) + + list(segs_afr[class_name].values) + ) + # x_trn = np.concatenate((x_trn, x_cv, x_afr), axis=0) + x_trn = np.concatenate((x_trn, x_afr), axis=0) + labels, y_true = np.unique(class_ids, return_inverse=True) + logging.info("%d training samples", x_trn.shape[0]) + + logging.info("PCA args=%s", str(pca)) + pca_var_r = pca["pca_var_r"] + pca_dim = pca["pca_dim"] + if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None: + logging.info("training PCA") + pca = PCA(**pca) + pca.fit(x_trn) + logging.info("PCA dimension: %d", pca.pca_dim) + logging.info("apply PCA") + x_trn = pca(x_trn) + else: + pca = None + + if do_lnorm: + lnorm = LNorm() + if whiten: + logging.info("training whitening") + lnorm.fit(x_trn) + + logging.info("apply lnorm") + x_trn = lnorm(x_trn) + else: + lnorm = None + + logging.info("GBE args=%s", str(gbe)) + model = GBE(labels=labels, **gbe) + model.fit(x_trn, y_true) + logging.info("trained GBE") + scores = model(x_trn) + y_pred = np.argmax(scores, axis=-1) + + compute_metrics(y_true, y_pred, labels) + + logging.info("Saving transforms and GBE") + transforms = [] + if pca is not None: + transforms.append(pca) + if lnorm is not None: + transforms.append(lnorm) + + if transforms: + transforms = TransformList(transforms) + transforms.save(output_dir / "transforms.h5") + + model.save(output_dir / "model_gbe.h5") + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Train linear GBE Classifier",) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--cv-v-file", required=True) + parser.add_argument("--cv-list", required=True) + parser.add_argument("--afr-v-file", required=True) + parser.add_argument("--afr-list", required=True) + PCA.add_class_args(parser, prefix="pca") + GBE.add_class_args(parser, prefix="gbe") + parser.add_argument("--class-name", default="class_id") + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v7.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v7.py new file mode 100755 index 00000000..1b37d92e --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v7.py @@ -0,0 +1,315 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.utils.math import softmax +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.classifiers import LinearSVMC as SVM +from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + +tar_langs = ( + "afr-afr", + "ara-aeb", + "ara-arq", + "ara-ayl", + "eng-ens", + "eng-iaf", + "fra-ntf", + "nbl-nbl", + "orm-orm", + "tir-tir", + "tso-tso", + "ven-ven", + "xho-xho", + "zul-zul", +) + +non_langs = ( + "fra-can", + "fra-fra", + "fra-ntf", + "afr-afr", + "ara-acm", + "ara-arz", + "ara-jor", + "ara-ksa", + "ara-kuw", + "ara-leb", + "ara-mau", + "ara-mor", + "ara-oma", + "ara-pal", + "ara-qat", + "ara-sud", + "ara-syr", + "ara-uae", + "ara-yem", + "ara-apc", + "eng-gbr", + "eng-usg", +) + + +def read_ood_data(train_list, v_file, class_name): + v_reader = DRF.create(v_file) + + segs = SegmentSet.load(train_list) + idx = np.zeros((len(segs),), dtype=bool) + for lang in tar_langs: + idx_i = segs[class_name] == lang + idx = np.logical_or(idx, idx_i) + + segs_tar = segs.loc[idx].copy() + if len(segs_tar) > 0: + x_tar = v_reader.read(segs_tar["id"], squeeze=True) + else: + x_tar = None + + idx = np.zeros((len(segs),), dtype=bool) + for lang in non_langs: + idx_i = segs[class_name] == lang + idx = np.logical_or(idx, idx_i) + + segs_non = segs.loc[idx].copy() + segs_non[class_name] = "zzzzzzz" + if len(segs_non) > 0: + x_non = v_reader.read(segs_non["id"], squeeze=True) + else: + x_non = None + + logging.info( + "read %s got ntar: %d nnon: %d", train_list, len(segs_tar), len(segs_non) + ) + return segs_tar, x_tar, segs_non, x_non + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("training acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels, fmt=".2f") + + +def train_be( + v_file, + train_list, + sre_v_file, + sre_list, + lre17_v_file, + lre17_list, + cv_v_file, + cv_list, + afr_v_file, + afr_list, + class_name, + do_lnorm, + whiten, + pca, + svm, + output_dir, + ood_weight, + verbose, +): + config_logger(verbose) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + v_reader = DRF.create(v_file) + x_trn = v_reader.read(train_segs["id"], squeeze=True) + del v_reader + logging.info("loaded %d train samples", x_trn.shape[0]) + + segs_sre_tar, x_sre_tar, segs_sre_non, x_sre_non = read_ood_data( + sre_list, sre_v_file, class_name, + ) + _, _, segs_lre17_non, x_lre17_non = read_ood_data( + lre17_list, lre17_v_file, class_name, + ) + segs_cv_tar, x_cv_tar, segs_cv_non, x_cv_non = read_ood_data( + cv_list, cv_v_file, class_name + ) + segs_afr_tar, x_afr_tar, segs_afr_non, x_afr_non = read_ood_data( + afr_list, afr_v_file, 
class_name, + ) + + # class_ids = train_segs[class_name].values + # labels, y_true = np.unique(class_ids, return_inverse=True) + # gbe = GBE() + # gbe.fit(x_trn, y_true) + # scores_non = np.max(gbe(x_non), axis=1) + # sel_non = np.argsort(scores_non)[-num_nons:] + # segs_non = segs_non.iloc[sel_non] + # x_non = x_non[sel_non] + # logging.info("selected %d non-tar segments", x_non.shape[0]) + + # class_ids = ( + # list(train_segs[class_name].values) + # + list(segs_sre_tar[class_name].values) + # + list(segs_cv_tar[class_name].values) + # + list(segs_afr_tar[class_name].values) + # + list(segs_sre_non[class_name].values) + # + list(segs_lre17_non[class_name].values) + # + list(segs_cv_non[class_name].values) + # + list(segs_afr_non[class_name].values) + # ) + # x_trn = np.concatenate( + # ( + # x_trn, + # x_sre_tar, + # x_cv_tar, + # x_afr_tar, + # x_sre_non, + # x_lre17_non, + # x_cv_non, + # x_afr_non, + # ), + # axis=0, + # ) + class_ids = ( + list(train_segs[class_name].values) + + list(segs_sre_tar[class_name].values) + + list(segs_cv_tar[class_name].values) + + list(segs_afr_tar[class_name].values) + + list(segs_sre_non[class_name].values) + + list(segs_lre17_non[class_name].values) + + list(segs_cv_non[class_name].values) + + list(segs_afr_non[class_name].values) + ) + x = np.concatenate( + ( + x_trn, + x_sre_tar, + x_cv_tar, + x_afr_tar, + x_sre_non, + x_lre17_non, + x_cv_non, + x_afr_non, + ), + axis=0, + ) + sample_weight = np.concatenate( + ( + np.ones((len(train_segs),)), + ood_weight * np.ones((len(segs_sre_tar),)), + ood_weight * np.ones((len(segs_cv_tar),)), + ood_weight * np.ones((len(segs_afr_tar),)), + ood_weight * np.ones((len(segs_sre_non),)), + np.ones((len(segs_lre17_non),)), + ood_weight * np.ones((len(segs_cv_non),)), + ood_weight * np.ones((len(segs_afr_non),)), + ) + ) + + labels, y_true = np.unique(class_ids, return_inverse=True) + logging.info("%d training samples", x_trn.shape[0]) + + logging.info("PCA args=%s", str(pca)) + pca_var_r = pca["pca_var_r"] + pca_dim = pca["pca_dim"] + if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None: + logging.info("training PCA") + pca = PCA(**pca) + pca.fit(x_trn) + logging.info("PCA dimension: %d", pca.pca_dim) + logging.info("apply PCA") + x = pca(x) + else: + pca = None + + if do_lnorm: + lnorm = LNorm() + if whiten: + logging.info("training whitening") + lnorm.fit(x) + + logging.info("apply lnorm") + x = lnorm(x) + else: + lnorm = None + + logging.info("SVM args=%s", str(svm)) + model = SVM(labels=labels, **svm) + model.fit(x, y_true, sample_weight=sample_weight) + logging.info("trained SVM") + scores = model(x) + y_pred = np.argmax(scores, axis=-1) + + compute_metrics(y_true, y_pred, labels) + + logging.info("Saving transforms and SVM") + transforms = [] + if pca is not None: + transforms.append(pca) + if lnorm is not None: + transforms.append(lnorm) + + if transforms: + transforms = TransformList(transforms) + transforms.save(output_dir / "transforms.h5") + + model.svm.coef_ = model.svm.coef_[:-1] + model.svm.intercept_ = model.svm.intercept_[:-1] + model.labels = model.labels[:-1] + model.save(output_dir / "model_svm.h5") + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Train linear SVM Classifier",) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--sre-v-file", required=True) + parser.add_argument("--sre-list", required=True) + parser.add_argument("--lre17-v-file", required=True) + 
parser.add_argument("--lre17-list", required=True) + parser.add_argument("--cv-v-file", required=True) + parser.add_argument("--cv-list", required=True) + parser.add_argument("--afr-v-file", required=True) + parser.add_argument("--afr-list", required=True) + PCA.add_class_args(parser, prefix="pca") + SVM.add_class_args(parser, prefix="svm") + parser.add_argument("--class-name", default="class_id") + # parser.add_argument("--num-nons", default=10000, type=int) + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--ood-weight", default=0.1, type=float) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v8.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v8.py new file mode 100755 index 00000000..ec9d5e56 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v8.py @@ -0,0 +1,317 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.utils.math import softmax +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.classifiers import GaussianSVMC as GSVM +from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + +tar_langs = ( + "afr-afr", + "ara-aeb", + "ara-arq", + "ara-ayl", + "eng-ens", + "eng-iaf", + "fra-ntf", + "nbl-nbl", + "orm-orm", + "tir-tir", + "tso-tso", + "ven-ven", + "xho-xho", + "zul-zul", +) + +non_langs = ( + "fra-can", + "fra-fra", + "fra-ntf", + "afr-afr", + "ara-acm", + "ara-arz", + "ara-jor", + "ara-ksa", + "ara-kuw", + "ara-leb", + "ara-mau", + "ara-mor", + "ara-oma", + "ara-pal", + "ara-qat", + "ara-sud", + "ara-syr", + "ara-uae", + "ara-yem", + "ara-apc", + "eng-gbr", + "eng-usg", +) + + +def read_ood_data(train_list, v_file, class_name): + v_reader = DRF.create(v_file) + + segs = SegmentSet.load(train_list) + idx = np.zeros((len(segs),), dtype=bool) + for lang in tar_langs: + idx_i = segs[class_name] == lang + idx = np.logical_or(idx, idx_i) + + segs_tar = segs.loc[idx].copy() + if len(segs_tar) > 0: + x_tar = v_reader.read(segs_tar["id"], squeeze=True) + else: + x_tar = None + + idx = np.zeros((len(segs),), dtype=bool) + for lang in non_langs: + idx_i = segs[class_name] == lang + idx = np.logical_or(idx, idx_i) + + segs_non = segs.loc[idx].copy() + segs_non[class_name] = "zzzzzz" + if len(segs_non) > 0: + x_non = v_reader.read(segs_non["id"], squeeze=True) + else: + x_non = None + + logging.info( + "read %s got ntar: %d nnon: %d", train_list, len(segs_tar), len(segs_non) + ) + return segs_tar, x_tar, segs_non, x_non + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("training acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = 
compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels, fmt=".2f") + + +def train_be( + v_file, + train_list, + sre_v_file, + sre_list, + lre17_v_file, + lre17_list, + cv_v_file, + cv_list, + afr_v_file, + afr_list, + class_name, + do_lnorm, + whiten, + pca, + svm, + output_dir, + ood_weight, + verbose, +): + config_logger(verbose) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + v_reader = DRF.create(v_file) + x_trn = v_reader.read(train_segs["id"], squeeze=True) + del v_reader + logging.info("loaded %d train samples", x_trn.shape[0]) + + segs_sre_tar, x_sre_tar, segs_sre_non, x_sre_non = read_ood_data( + sre_list, sre_v_file, class_name, + ) + _, _, segs_lre17_non, x_lre17_non = read_ood_data( + lre17_list, lre17_v_file, class_name, + ) + segs_cv_tar, x_cv_tar, segs_cv_non, x_cv_non = read_ood_data( + cv_list, cv_v_file, class_name + ) + segs_afr_tar, x_afr_tar, segs_afr_non, x_afr_non = read_ood_data( + afr_list, afr_v_file, class_name, + ) + + # class_ids = train_segs[class_name].values + # labels, y_true = np.unique(class_ids, return_inverse=True) + # gbe = GBE() + # gbe.fit(x_trn, y_true) + # scores_non = np.max(gbe(x_non), axis=1) + # sel_non = np.argsort(scores_non)[-num_nons:] + # segs_non = segs_non.iloc[sel_non] + # x_non = x_non[sel_non] + # logging.info("selected %d non-tar segments", x_non.shape[0]) + + # class_ids = ( + # list(train_segs[class_name].values) + # + list(segs_sre_tar[class_name].values) + # + list(segs_cv_tar[class_name].values) + # + list(segs_afr_tar[class_name].values) + # + list(segs_sre_non[class_name].values) + # + list(segs_lre17_non[class_name].values) + # + list(segs_cv_non[class_name].values) + # + list(segs_afr_non[class_name].values) + # ) + # x_trn = np.concatenate( + # ( + # x_trn, + # x_sre_tar, + # x_cv_tar, + # x_afr_tar, + # x_sre_non, + # x_lre17_non, + # x_cv_non, + # x_afr_non, + # ), + # axis=0, + # ) + class_ids = ( + list(train_segs[class_name].values) + + list(segs_sre_tar[class_name].values) + + list(segs_cv_tar[class_name].values) + + list(segs_afr_tar[class_name].values) + + list(segs_sre_non[class_name].values) + + list(segs_lre17_non[class_name].values) + + list(segs_cv_non[class_name].values) + + list(segs_afr_non[class_name].values) + ) + x = np.concatenate( + ( + x_trn, + x_sre_tar, + x_cv_tar, + x_afr_tar, + x_sre_non, + x_lre17_non, + x_cv_non, + x_afr_non, + ), + axis=0, + ) + sample_weight = np.concatenate( + ( + np.ones((len(train_segs),)), + ood_weight * np.ones((len(segs_sre_tar),)), + ood_weight * np.ones((len(segs_cv_tar),)), + ood_weight * np.ones((len(segs_afr_tar),)), + ood_weight * np.ones((len(segs_sre_non),)), + np.ones((len(segs_lre17_non),)), + ood_weight * np.ones((len(segs_cv_non),)), + ood_weight * np.ones((len(segs_afr_non),)), + ) + ) + + labels, y_true = np.unique(class_ids, return_inverse=True) + logging.info("%d training samples", x_trn.shape[0]) + + logging.info("PCA args=%s", str(pca)) + pca_var_r = pca["pca_var_r"] + pca_dim = pca["pca_dim"] + if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None: + logging.info("training PCA") + pca = PCA(**pca) + pca.fit(x_trn) + logging.info("PCA dimension: %d", pca.pca_dim) + logging.info("apply PCA") + x = pca(x) + else: + pca = None + + if do_lnorm: + lnorm 
= LNorm() + if whiten: + logging.info("training whitening") + lnorm.fit(x) + + logging.info("apply lnorm") + x = lnorm(x) + else: + lnorm = None + + logging.info("SVM args=%s", str(svm)) + model = GSVM(labels=labels, **svm) + model.fit(x, y_true, sample_weight=sample_weight) + logging.info("trained SVM") + scores = model(x) + y_pred = np.argmax(scores, axis=-1) + + compute_metrics(y_true, y_pred, labels) + + logging.info("Saving transforms and SVM") + transforms = [] + if pca is not None: + transforms.append(pca) + if lnorm is not None: + transforms.append(lnorm) + + if transforms: + transforms = TransformList(transforms) + transforms.save(output_dir / "transforms.h5") + + model_labels = list(np.copy(model.labels)) + if "zzzzzz" in model_labels: + model_labels.remove("zzzzzz") + model.labels = model_labels + print("model.labels before save", np.shape(model.labels)) + model.save(output_dir / "model_svm.h5") + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Train linear SVM Classifier",) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--sre-v-file", required=True) + parser.add_argument("--sre-list", required=True) + parser.add_argument("--lre17-v-file", required=True) + parser.add_argument("--lre17-list", required=True) + parser.add_argument("--cv-v-file", required=True) + parser.add_argument("--cv-list", required=True) + parser.add_argument("--afr-v-file", required=True) + parser.add_argument("--afr-list", required=True) + PCA.add_class_args(parser, prefix="pca") + GSVM.add_class_args(parser, prefix="svm") + parser.add_argument("--class-name", default="class_id") + # parser.add_argument("--num-nons", default=10000, type=int) + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--ood-weight", default=0.1, type=float) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v9.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v9.py new file mode 100755 index 00000000..5c174233 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v9.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.utils.math import softmax +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + +tar_langs = ( + "afr-afr", + "ara-aeb", + "ara-arq", + "ara-ayl", + "eng-ens", + "eng-iaf", + "fra-ntf", + "nbl-nbl", + "orm-orm", + "tir-tir", + "tso-tso", + "ven-ven", + "xho-xho", + "zul-zul", +) + + +def read_ood_data(train_list, v_file, class_name): + v_reader = DRF.create(v_file) + + segs = SegmentSet.load(train_list) + idx = 
np.zeros((len(segs),), dtype=bool) + for lang in tar_langs: + idx_i = segs[class_name] == lang + idx = np.logical_or(idx, idx_i) + + segs_tar = segs.loc[idx].copy() + if len(segs_tar) > 0: + x_tar = v_reader.read(segs_tar["id"], squeeze=True) + else: + x_tar = None + + logging.info( + "read %s got ntar: %d", train_list, len(segs_tar), + ) + return segs_tar, x_tar + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("training acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels, fmt=".2f") + + +def train_be( + v_file, + train_list, + sre_v_file, + sre_list, + cv_v_file, + cv_list, + afr_v_file, + afr_list, + class_name, + do_lnorm, + whiten, + pca, + gbe, + output_dir, + verbose, +): + config_logger(verbose) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + v_reader = DRF.create(v_file) + x_trn = v_reader.read(train_segs["id"], squeeze=True) + del v_reader + logging.info("loaded %d train samples", x_trn.shape[0]) + + segs_sre, x_sre = read_ood_data(sre_list, sre_v_file, class_name,) + segs_cv, x_cv = read_ood_data(cv_list, cv_v_file, class_name) + segs_afr, x_afr = read_ood_data(afr_list, afr_v_file, class_name,) + + class_ids_trn = train_segs[class_name].values + x_ood = np.concatenate((x_sre, x_cv, x_afr), axis=0) + class_ids_ood = ( + list(segs_sre[class_name].values) + + list(segs_cv[class_name].values) + + list(segs_afr[class_name].values) + ) + + labels, y_true_trn = np.unique(class_ids_trn, return_inverse=True) + _, y_true_ood = np.unique( + np.concatenate((labels, class_ids_ood)), return_inverse=True + ) + y_true_ood = y_true_ood[len(labels) :] + + logging.info("%d ood samples", x_ood.shape[0]) + logging.info("%d training samples", x_trn.shape[0]) + + x_ood += np.mean(x_trn, axis=0, keepdims=True) - np.mean( + x_ood, axis=0, keepdims=True + ) + logging.info("PCA args=%s", str(pca)) + pca_var_r = pca["pca_var_r"] + pca_dim = pca["pca_dim"] + if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None: + logging.info("training PCA") + pca = PCA(**pca) + pca.fit(x_trn) + logging.info("PCA dimension: %d", pca.pca_dim) + logging.info("apply PCA") + x_trn = pca(x_trn) + x_ood = pca(x_ood) + else: + pca = None + + if do_lnorm: + lnorm = LNorm() + if whiten: + logging.info("training whitening") + lnorm.fit(x_trn) + + logging.info("apply lnorm") + x_trn = lnorm(x_trn) + x_ood = lnorm(x_ood) + else: + lnorm = None + + prior_0 = GBE( + mu=np.zeros((len(labels), x_trn.shape[1])), + W=np.eye(x_trn.shape[1]), + beta=16, + nu=x_trn.shape[1], + ) + print(prior_0.__dict__) + prior = GBE(prior=prior_0) + prior.fit(x_ood, y_true_ood) + prior.nu = 0.1 * prior.nu + prior.beta = 0.01 * prior.beta + print(prior.__dict__) + model = GBE(labels=labels, prior=prior) + model.fit(x_trn, y_true_trn) + print(model.__dict__, flush=True) + logging.info("trained GBE") + scores = model(x_trn) + y_pred = np.argmax(scores, axis=-1) + + compute_metrics(y_true_trn, y_pred, labels) + + logging.info("Saving transforms and GBE") + transforms = [] + if pca is not None: + transforms.append(pca) + if lnorm is not None: + transforms.append(lnorm) + + if transforms: + transforms = 
TransformList(transforms) + transforms.save(output_dir / "transforms.h5") + + model.save(output_dir / "model_gbe.h5") + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Train linear GBE Classifier",) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--sre-v-file", required=True) + parser.add_argument("--sre-list", required=True) + parser.add_argument("--cv-v-file", required=True) + parser.add_argument("--cv-list", required=True) + parser.add_argument("--afr-v-file", required=True) + parser.add_argument("--afr-list", required=True) + PCA.add_class_args(parser, prefix="pca") + GBE.add_class_args(parser, prefix="gbe") + parser.add_argument("--class-name", default="class_id") + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_fusion.m b/egs/lre22/fixed.v1.8k/steps_be/train_fusion.m new file mode 100644 index 00000000..8f1c3dda --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_fusion.m @@ -0,0 +1,16 @@ +function train_fusion(train_list, in_files, model_file) + + train_list = readtable(train_list, 'FileType', 'delimitedtext', 'Delimiter', ' ', 'ReadVariableNames', false, 'ReadRowNames', true); + train_list = sortrows(train_list, 'RowNames'); + [labels, ia, ic]=unique(train_list); + n_files = length(in_files); + scores={}; + for i=1:n_files + T_i = readtable(in_files{i}, 'FileType', 'delimitedtext', 'Delimiter','tab', 'ReadRowNames', true, 'VariableNamingRule', 'preserve'); + T_i = sortrows(T_i, 'RowNames'); + s_i = T_i.Variables'; + scores{i}=s_i; + end + [alpha, beta] = train_nary_llr_fusion(scores, ic, 0, 1e-6, [], ones(1,1)) + save(model_file, 'alpha', 'beta', 'labels'); + \ No newline at end of file diff --git a/egs/voxceleb/vae.v1/steps_xvec b/egs/lre22/fixed.v1.8k/steps_xvec similarity index 100% rename from egs/voxceleb/vae.v1/steps_xvec rename to egs/lre22/fixed.v1.8k/steps_xvec diff --git a/egs/voxceleb/vae.v1/utils b/egs/lre22/fixed.v1.8k/utils similarity index 100% rename from egs/voxceleb/vae.v1/utils rename to egs/lre22/fixed.v1.8k/utils diff --git a/egs/lre22/open.v1.8k/README.md b/egs/lre22/open.v1.8k/README.md new file mode 100644 index 00000000..d55ced4e --- /dev/null +++ b/egs/lre22/open.v1.8k/README.md @@ -0,0 +1,55 @@ +# LRE22 Open Condition V1 + +Recipe for the NIST LRE22 open condition based on the JHU-MIT Submission. + +## Citing +``` +@inproceedings{villalba23_interspeech, + author={Jesús Villalba and Jonas Borgstrom and Maliha Jahan and Saurabh Kataria and Leibny Paola Garcia and Pedro Torres-Carrasquillo and Najim Dehak}, + title={{Advances in Language Recognition in Low Resource African Languages: The JHU-MIT Submission for NIST LRE22}}, + year=2023, + booktitle={Proc. 
INTERSPEECH 2023}, + pages={521--525}, + doi={10.21437/Interspeech.2023-1094} +} +``` + +## Training Data + + - x-Vector networks trained on: + - VoxLingua107 + - NIST LRE17 Train + Dev + Eval / CTS + AfV without Maghrebi Arabic + - NIST SRE16 + - NIST SRE18 + - NIST SRE19 CMN2 + - NIST SRE21 + - NIST SRE CTS Superset + - IARPA Babel + - Fleurs + - LWAZI 2009 + - NCHLT 2014 + - AMMI 2020 + - CommonVoice Tigrinya, Indian English, French + - ADI 2017 + - AST + - Gaussian back-end trained on: + - NIST LRE22 dev with 2-fold cross-val + x10 augmentations + +## Usage + + - Run the run_0*.sh scripts in sequence + - By default, it uses Res2Net50 + - To change the default network, run the scripts with the `--config-file` argument: +```bash +run_011_train_xvector.sh --config-file global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh +run_030_extract_xvectors.sh --config-file global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh --use-gpu true +run_040_be_final.sh --config-file global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh +``` + +## Results + +| Config | Model Type | Model Details | Back-end | Dev MinCp | Dev ActCp | Eval MinCp | Eval ActCp | +| ------ | ---------- | ------------- | -------- | :-------: | :-------: | :--------: | :--------: | +| config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh | ECAPA-TDNN 2048x4 | Stage-1 | GBE | 0.100 | 0.101 | 0.105 | 0.106 | +| config_fbank64_stmn_fwseres2net50s8_v1.0.sh | fw-SE Res2Net50 scale=8 | Stage-1 | GBE | 0.092 | 0.093 | 0.103 | 0.104 | +| Fusion ECAPA-TDNN + FwSE Res2Net50 | | | FoCal | 0.082 | 0.083 | 0.089 | 0.090 | diff --git a/egs/lre22/open.v1.8k/cmd.sh b/egs/lre22/open.v1.8k/cmd.sh new file mode 100755 index 00000000..f22c66b4 --- /dev/null +++ b/egs/lre22/open.v1.8k/cmd.sh @@ -0,0 +1,28 @@ +# You can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances of 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
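+# For example, to run everything on the local machine without a queue, you +# could point all three commands to run.pl (heed the warning above about +# memory): +# export train_cmd=run.pl +# export cuda_cmd=run.pl +# export cuda_eval_cmd=run.pl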
+ +if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 40G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01][234589]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/lre22/open.v1.8k/conf/clsp.conf b/egs/lre22/open.v1.8k/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/lre22/open.v1.8k/conf/coe_gpu_bigmem.conf b/egs/lre22/open.v1.8k/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/lre22/open.v1.8k/conf/coe_gpu_long.conf b/egs/lre22/open.v1.8k/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/lre22/open.v1.8k/conf/coe_gpu_rtx.conf b/egs/lre22/open.v1.8k/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/lre22/open.v1.8k/conf/coe_gpu_short.conf b/egs/lre22/open.v1.8k/conf/coe_gpu_short.conf new file mode 100644 index 
00000000..81de5cb7 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/lre22/open.v1.8k/conf/coe_gpu_v100.conf b/egs/lre22/open.v1.8k/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/lre22/open.v1.8k/conf/fbank64_specaug1_stmn_8k.yaml b/egs/lre22/open.v1.8k/conf/fbank64_specaug1_stmn_8k.yaml new file mode 100644 index 00000000..fce3804a --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/fbank64_specaug1_stmn_8k.yaml @@ -0,0 +1,24 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 8000 + frame_length: 25 + low_freq: 20 + high_freq: 3700 + num_filters: 64 + snip_edges: false + use_energy: false +spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. 
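+ # note: the frequency mask below mirrors the time mask above: exactly one + # mask per example, width drawn from 0-8 mel filters, filled with the mean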
+ freq_mask_min_width: 0 + freq_mask_max_width: 8 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +mvn: + context: 150 + norm_var: false diff --git a/egs/lre22/open.v1.8k/conf/fbank64_stmn_8k.yaml b/egs/lre22/open.v1.8k/conf/fbank64_stmn_8k.yaml new file mode 100644 index 00000000..dfd0d3e5 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/fbank64_stmn_8k.yaml @@ -0,0 +1,12 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 8000 + frame_length: 25 + low_freq: 20 + high_freq: 3700 + num_filters: 64 + snip_edges: false + use_energy: false +mvn: + context: 150 + norm_var: false diff --git a/egs/lre22/open.v1.8k/conf/other_conf/fbank64_stmn_8k.yaml b/egs/lre22/open.v1.8k/conf/other_conf/fbank64_stmn_8k.yaml new file mode 100644 index 00000000..dfd0d3e5 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/fbank64_stmn_8k.yaml @@ -0,0 +1,12 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 8000 + frame_length: 25 + low_freq: 20 + high_freq: 3700 + num_filters: 64 + snip_edges: false + use_energy: false +mvn: + context: 150 + norm_var: false diff --git a/egs/lre22/open.v1.8k/conf/other_conf/fbank80_stmn_16k.yaml b/egs/lre22/open.v1.8k/conf/other_conf/fbank80_stmn_16k.yaml new file mode 100644 index 00000000..f4091f5d --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/fbank80_stmn_16k.yaml @@ -0,0 +1,12 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 16000 + frame_length: 25 + low_freq: 20 + high_freq: 7600 + num_filters: 80 + snip_edges: false + use_energy: false +mvn: + context: 150 + norm_var: false diff --git a/egs/lre22/open.v1.8k/conf/other_conf/reverb_noise_aug.yaml b/egs/lre22/open.v1.8k/conf/other_conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage1_v2.4.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage1_v2.4.yaml new file mode 100644 index 00000000..16b17c08 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage1_v2.4.yaml @@ -0,0 +1,124 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + 
resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + - 2048 + - 3072 + - 3072 + - 4096 + - 4096 + - 5120 + - 5120 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 2 + - 3 + - 2 + - 3 + - 2 + - 3 + resb_strides: + - 1 + - 1 + - 2 + - 1 + - 2 + - 1 + - 2 + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 7168 + endpoint_layers: + - 2 + - 4 + - 6 + - 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + intertop_margin: 0. + margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 24000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage2_v2.4.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage2_v2.4.yaml new file mode 100644 index 00000000..2bc8675f --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage2_v2.4.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + #margin: 0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. 
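+# note: stage-2 keeps the subcenter arc-softmax head but zeroes both margins +# (the commented 0.4/0.2 and 0.1 values appear to be earlier settings); the +# trainer below also switches from adam to SGD with a shorter decay schedule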
+trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage3_v2.4.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage3_v2.4.yaml new file mode 100644 index 00000000..2bc8675f --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage3_v2.4.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + #margin: 0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. 
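+# note: this stage-3 file has the same blob id (2bc8675f) as the stage-2 +# config above, i.e. the two files are identical; presumably only the +# checkpoint being fine-tuned differs between stages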
+trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..f43b3712 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -0,0 +1,99 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + dropout_rate: 0.1 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + intertop_margin: 0. 
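+ # with margin and intertop_margin both at 0, the warmup below has nothing + # to ramp and the loss effectively reduces to a scaled cosine softmax + # (max over the 2 subcenters)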
+ margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 15000 + use_amp: true + swa_start: 9 + swa_lr: 5e-5 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 256 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.1.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.1.yaml new file mode 100644 index 00000000..5d98e662 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.1.yaml @@ -0,0 +1,95 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 256 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + cos_scale: 30.0 + margin: 0.2 + intertop_margin: 0.1 + margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 24000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.2.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.2.yaml new file mode 100644 index 00000000..038e7207 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.2.yaml @@ -0,0 +1,97 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 256 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + 
resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.2 + intertop_margin: 0.1 + margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 24000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.3.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.3.yaml new file mode 100644 index 00000000..f0200ad2 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.3.yaml @@ -0,0 +1,77 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + margin: 0. + margin_warmup_epochs: 0 + intertop_margin: 0. 
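+# note: despite the stage-1 name, this config only fine-tunes the embedding +# affine layer (train_mode: ft-embed-affine below) with a low SGD lr of 0.001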
+trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + train_mode: ft-embed-affine diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.4.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.4.yaml new file mode 100644 index 00000000..3718b10b --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.4.yaml @@ -0,0 +1,97 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 256 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + intertop_margin: 0. 
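+ # v2.4 appears to be v2.2 with both margins zeroed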
+ margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 24000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..d1c87491 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml @@ -0,0 +1,98 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + dropout_rate: 0.3 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + intertop_margin: 0. 
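+ # v3.0 widens the input conv back to 2048 channels (vs 256 in v2.x) and + # adds dropout of 0.3 inside the encoder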
+ margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.98 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 24000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.1.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.1.yaml new file mode 100644 index 00000000..66c69e8e --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.1.yaml @@ -0,0 +1,98 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + dropout_rate: 0.3 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + intertop_margin: 0. 
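+ # v3.1 appears to match v3.0 except for a larger 192-d embedding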
+ margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.98 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 24000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.2.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.2.yaml new file mode 100644 index 00000000..3a4a81a7 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.2.yaml @@ -0,0 +1,98 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + dropout_rate: 0.3 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + intertop_margin: 0. 
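+ # v3.2 returns to the 128-d embedding and switches the optimizer to adamw + # (see trainer below)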
+ margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adamw + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.98 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 24000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.5.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.5.yaml new file mode 100644 index 00000000..17b1b6cf --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.5.yaml @@ -0,0 +1,98 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + dropout_rate: 0.1 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + intertop_margin: 0. 
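+ # v3.5 lowers the encoder dropout to 0.1 and goes back to adam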
+ margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.98 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 24000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.1.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.1.yaml new file mode 100644 index 00000000..54f76200 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.1.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + #margin: 0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. 
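+# note: v2.1.1 fine-tunes on 3 s chunks with class-balanced sampling; v2.1.2 +# below is the same recipe on 6 s chunks, and stage2_v2.1 further down shares +# this file's blob id (54f76200), so those two files are identical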
+trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.2.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.2.yaml new file mode 100644 index 00000000..d68860be --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.2.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + #margin: 0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. 
+trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.yaml new file mode 100644 index 00000000..54f76200 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + #margin: 0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. 
+trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.3.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.3.yaml new file mode 100644 index 00000000..465d92eb --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.3.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.4 + margin_warmup_epochs: 2 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 8000 + #hold_steps: 10000 + decay_steps: 12000 + hold_steps: 12000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 4000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.4.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.4.yaml new file mode 100644 index 00000000..64e71f65 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.4.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 
30.0 + #margin: 0.4 + #margin: 0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. +trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage3_v2.1.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage3_v2.1.yaml new file mode 100644 index 00000000..64e71f65 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage3_v2.1.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + #margin: 0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. 
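+# note: this stage-3 config reuses blob 64e71f65 from stage2_v2.4 above, and +# stage3_v2.4 below is the same file yet again; presumably the three stages +# differ only in the checkpoint they start from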
+trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage3_v2.4.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage3_v2.4.yaml new file mode 100644 index 00000000..64e71f65 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage3_v2.4.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + #margin: 0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. 
+trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.1.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.1.yaml new file mode 100644 index 00000000..fe0171d1 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.1.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_type: tseres2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + se_r: 512 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + cos_scale: 30.0 + margin: 0.2 + intertop_margin: 0.1 + margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.2.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.2.yaml new file mode 100644 index 00000000..80925cc7 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.2.yaml @@ -0,0 +1,77 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_type: tseres2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + se_r: 512 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.2 + intertop_margin: 0.1 + margin_warmup_epochs: 3.0 + 
dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.3.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.3.yaml new file mode 100644 index 00000000..11997c55 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.3.yaml @@ -0,0 +1,77 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + margin: 0. + margin_warmup_epochs: 0 + intertop_margin: 0. +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + train_mode: ft-embed-affine diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage2_v2.1.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage2_v2.1.yaml new file mode 100644 index 00000000..cde840fe --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage2_v2.1.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + #margin: 
0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. +trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage2_v2.3.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage2_v2.3.yaml new file mode 100644 index 00000000..4f704b29 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage2_v2.3.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 12 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 6 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.4 + margin_warmup_epochs: 2 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 8000 + #hold_steps: 10000 + decay_steps: 12000 + hold_steps: 12000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 4000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/vad_16k.yaml b/egs/lre22/open.v1.8k/conf/other_conf/vad_16k.yaml new file mode 100644 index 00000000..5fb0111c --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/vad_16k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 16000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/vad_8k.yaml b/egs/lre22/open.v1.8k/conf/other_conf/vad_8k.yaml new file mode 100644 index 00000000..7592c9d1 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/vad_8k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 8000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/lre22/open.v1.8k/conf/reverb_noise_aug.yaml b/egs/lre22/open.v1.8k/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + 
max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/lre22/open.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/lre22/open.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..1448df98 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -0,0 +1,105 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_specaug1_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + dropout_rate: 0.2 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.0 + intertop_margin: 0.0 + margin_warmup_epochs: 3.0 + dropout_rate: 0.2 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 65000 + warmup_steps: 15000 + min_lr: 1.0e-06 + #decay_steps: 16000 + #hold_steps: 40000 + #warmup_steps: 5000 + #min_lr: 1.0e-05 + update_lr_on_opt_step: true + use_amp: true + swa_start: 12 + swa_lr: 1e-5 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 15 + #eff_batch_size: 512 + eff_batch_size: 256 diff --git a/egs/lre22/open.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml b/egs/lre22/open.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..e501abdb --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml @@ -0,0 +1,82 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 
24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_specaug1_stmn_8k.yaml +model: + resnet_type: fwseres2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + se_r: 4 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.0 + intertop_margin: 0.0 + margin_warmup_epochs: 3.0 + dropout_rate: 0.05 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 65000 + warmup_steps: 15000 + min_lr: 1.0e-06 + #decay_steps: 16000 + #hold_steps: 40000 + #warmup_steps: 5000 + #min_lr: 1.0e-05 + update_lr_on_opt_step: true + use_amp: true + swa_start: 12 + swa_lr: 1e-5 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 diff --git a/egs/lre22/open.v1.8k/conf/vad_8k.yaml b/egs/lre22/open.v1.8k/conf/vad_8k.yaml new file mode 100644 index 00000000..1cfe34b0 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/vad_8k.yaml @@ -0,0 +1,9 @@ +sample_frequency: 8000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: -4.89 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 +wav_scale: 1 diff --git a/egs/lre22/open.v1.8k/datapath.sh b/egs/lre22/open.v1.8k/datapath.sh new file mode 100644 index 00000000..fec52329 --- /dev/null +++ b/egs/lre22/open.v1.8k/datapath.sh @@ -0,0 +1,87 @@ +# Copyright +# 2022 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + +#paths to databases + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + ldc_root3=/export/fs02/corpora3/LDC + ldc_root5=/export/corpora5/LDC + ldc_root=/export/corpora6/LDC + sre16_dev_root=$ldc_root/LDC2019S20/data/dev/R148_0_0 + sre16_eval_root=$ldc_root/LDC2019S20/data/eval/R149_0_1 + sre18_dev_root=$ldc_root5/LDC2018E46 + sre18_eval_root=$ldc_root3/LDC2018E51 + sre19cmn2_eval_root=$ldc_root3/LDC2019E58 + sre_superset_root=$ldc_root/LDC2021E08 + sre21_dev_root=$ldc_root/LDC2021E09 + sre21_eval_root=$ldc_root/LDC2021E10 + lre17_train_root=$ldc_root/LDC2022E16_2017_NIST_Language_Recognition_Evaluation_Training_and_Development_Sets + lre17_eval_root=$ldc_root/LDC2022E17_2017_NIST_Language_Recognition_Evaluation_Test_Set + lre22_dev_root=$ldc_root/LDC2022E14_2022_NIST_Language_Recognition_Evaluation_Development_Data + lre22_eval_root=/export/corpora6/lre22_test_data_v2 + voxlingua_root=/export/corpora6/voxlingua107 + musan_root=/export/corpora5/JHU/musan + babel_assamese_root=$ldc_root/LDC2016S06 + babel_bengali_root=$ldc_root/LDC2016S08 + babel_pashto_root=$ldc_root/LDC2016S09 + babel_turkish_root=$ldc_root/LDC2016S10 + babel_georgian_root=$ldc_root/LDC2016S12 + babel_vietnam_root=$ldc_root/LDC2017S01 + babel_haitian_root=$ldc_root/LDC2017S03 + babel_lao_root=$ldc_root/LDC2017S08 + babel_tamil_root=$ldc_root/LDC2017S13 + babel_zulu_root=$ldc_root/LDC2017S19 + babel_kurmanji_root=$ldc_root/LDC2017S22 + babel_tok_root=$ldc_root/LDC2018S02 + babel_kazakh_root=$ldc_root/LDC2018S13 + babel_telugu_root=$ldc_root/LDC2018S16 + babel_lithuanian_root=$ldc_root/LDC2019S03 + fleurs_root=/export/corpora6/LRE/FLEURS2022 + lwazi_root=/export/corpora6/LRE/Lwazi2009 + nchlt_root=/export/corpora6/LRE/NCHLT2014 + 
ammi_root=/export/corpora6/LRE/AMMI2020 + cv20_root=/export/corpora5/mozilla-common-voice/cv-corpus-5.1-2020-06-22 + cv22_root=/export/corpora6/LRE/CommonVoice2020/cv-corpus-11.0-2022-09-21 + adi_root=/export/corpora6/ADI17 + ast_root=/export/corpora6/LRE/AST2004 +elif [ "$(hostname --domain)" == "cm.gemini" ];then + ldc_root=/export/common/data/corpora/LDC + sre_root=/export/common/data/corpora/NIST/SRE + my_root=/exp/jvillalba/corpora + sre16_dev_root=/exp/jvillalba/corpora/LDC2019S20/data/dev/R148_0_0 + sre16_eval_root=/exp/jvillalba/corpora/LDC2019S20/data/eval/R149_0_1 + sre18_dev_root=$sre_root/SRE18/LDC2018E46_2018_NIST_Speaker_Recognition_Evaluation_Development_Set + sre18_eval_root=$sre_root/SRE18/Eval/LDC2018E51 + sre19cmn2_eval_root=/exp/jvillalba/corpora/LDC2019E58 + sre_superset_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E08 + sre21_dev_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E09 + sre21_eval_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E10 + lre17_train_root=$my_root/LDC2022E16_2017_NIST_Language_Recognition_Evaluation_Training_and_Development_Sets + lre17_eval_root=$my_root/LDC2022E17_2017_NIST_Language_Recognition_Evaluation_Test_Set + lre22_dev_root=$my_root/LDC2022E14_2022_NIST_Language_Recognition_Evaluation_Development_Data + lre22_eval_root=$my_root/lre22_test_data_v2 + voxlingua_root=$my_root/voxlingua107 + musan_root=/expscratch/dgromero/corpora/musan + babel_assamese_root=$ldc_root/LDC2016S06 + babel_bengali_root=$ldc_root/LDC2016S08 + babel_pashto_root=$ldc_root/LDC2016S09 + babel_turkish_root=$my_root/LDC2016S10 + babel_georgian_root=$my_root/LDC2016S12 + babel_vietnam_root=$my_root/LDC2017S01 + babel_haitian_root=$my_root/LDC2017S03 + babel_lao_root=$ldc_root/LDC2017S08 + babel_tamil_root=$ldc_root/LDC2017S13 + babel_zulu_root=$ldc_root/LDC2017S19 + babel_kurmanji_root=$ldc_root/LDC2017S22 + babel_tok_root=$my_root/LDC2018S02 + babel_kazakh_root=$ldc_root/LDC2018S13 + babel_telugu_root=$ldc_root/LDC2018S16 + babel_lithuanian_root=$my_root/LDC2019S03 + adi_root=/exp/jvillalba/corpora/ADI17 + +else + echo "Put your database paths here" + exit 1 +fi diff --git a/egs/lre22/open.v1.8k/default_config.sh b/egs/lre22/open.v1.8k/default_config.sh new file mode 120000 index 00000000..d1be989f --- /dev/null +++ b/egs/lre22/open.v1.8k/default_config.sh @@ -0,0 +1 @@ +global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh \ No newline at end of file diff --git a/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh b/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh new file mode 100644 index 00000000..1abb3d3f --- /dev/null +++ b/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh @@ -0,0 +1,20 @@ +# acoustic features +feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=open + +# x-vector cfg + +nnet_type=resnet1d +nnet_stages=1 +nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml + +nnet_name=${feat_type}_ecapatdnn2048x4_v1.0 +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0012.pth diff --git a/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh b/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh new file mode 100644 index 00000000..352cd1a6 --- /dev/null +++ b/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh @@ -0,0 +1,20 @@ +# acoustic features 
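+# fbank64_stmn: 64-dim log Mel filter-bank features with short-time mean
+# normalization, computed from 8 kHz audio (see conf/fbank64_stmn_8k.yaml)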
+feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=open + +# x-vector cfg +nnet_type=resnet +nnet_stages=1 +nnet_s1_base_cfg=conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml + +nnet_name=${feat_type}_fwseres2net50s8_v1.0 +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0012.pth + diff --git a/egs/lre22/open.v1.8k/hyp_utils b/egs/lre22/open.v1.8k/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/lre22/open.v1.8k/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/lre22/open.v1.8k/local b/egs/lre22/open.v1.8k/local new file mode 120000 index 00000000..c2a3fdea --- /dev/null +++ b/egs/lre22/open.v1.8k/local @@ -0,0 +1 @@ +../fixed.v1.8k/local \ No newline at end of file diff --git a/egs/lre22/open.v1.8k/path.sh b/egs/lre22/open.v1.8k/path.sh new file mode 100755 index 00000000..6994fdab --- /dev/null +++ b/egs/lre22/open.v1.8k/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. $TOOLS_ROOT/path.sh diff --git a/egs/lre22/open.v1.8k/resources b/egs/lre22/open.v1.8k/resources new file mode 120000 index 00000000..113b3492 --- /dev/null +++ b/egs/lre22/open.v1.8k/resources @@ -0,0 +1 @@ +../fixed.v1.8k/resources \ No newline at end of file diff --git a/egs/lre22/open.v1.8k/run_001_prepare_data.sh b/egs/lre22/open.v1.8k/run_001_prepare_data.sh new file mode 100755 index 00000000..bb64cdbe --- /dev/null +++ b/egs/lre22/open.v1.8k/run_001_prepare_data.sh @@ -0,0 +1,342 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +config_file=default_config.sh +stage=1 + +. parse_options.sh || exit 1; +. 
datapath.sh + + +if [ $stage -le 1 ];then + # Prepares voxlingua 107 for training + hyp_utils/conda_env.sh \ + local/prepare_voxlingua107.py \ + --corpus-dir $voxlingua_root \ + --output-dir data/voxlingua107 \ + --remove-langs en-en es-es ar-ar pt-pt \ + --map-langs-to-lre-codes \ + --target-fs 8000 + +fi + +if [ $stage -le 2 ];then + # Prepare LRE17 Training data + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_train_root \ + --output-dir data/lre17_train \ + --subset train \ + --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_train_root \ + --output-dir data/lre17_dev_cts \ + --subset dev \ + --source mls14 \ + --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_train_root \ + --output-dir data/lre17_dev_afv \ + --subset dev \ + --source vast \ + --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_eval_root \ + --output-dir data/lre17_eval_cts \ + --subset eval \ + --source mls14 \ + --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_eval_root \ + --output-dir data/lre17_eval_afv \ + --subset eval \ + --source vast \ + --target-fs 8000 + +fi + +if [ $stage -le 3 ];then + hyp_utils/conda_env.sh \ + local/prepare_lre22_dev.py \ + --corpus-dir $lre22_dev_root \ + --output-dir data/lre22_dev \ + --target-fs 8000 + +fi + +if [ $stage -le 4 ];then + hyp_utils/conda_env.sh \ + local/prepare_lre22_eval.py \ + --corpus-dir $lre22_eval_root \ + --output-dir data/lre22_eval \ + --target-fs 8000 + +fi + +if [ $stage -le 5 ];then + local/make_sre16_train_dev.sh $sre16_dev_root 8 data + local/make_sre16_train_eval.sh $sre16_eval_root 8 data +fi + +if [ $stage -le 6 ];then + local/make_sre18_dev_unlabeled.sh $sre18_dev_root 8 data + local/make_sre18_train_dev.sh $sre18_dev_root 8 data + local/make_sre18_train_eval.sh $sre18_eval_root 8 data +fi + +if [ $stage -le 7 ];then + # Prepare sre19 + local/make_sre19cmn2_eval.sh $sre19cmn2_eval_root 8 data +fi + +if [ $stage -le 8 ];then + # Prepare SRE21 dev + hyp_utils/conda_env.sh \ + local/prepare_sre21av_dev_audio.py \ + --corpus-dir $sre21_dev_root \ + --target-fs 8000 \ + --output-path data/sre21_audio_dev \ + --av-output-path data/sre21_audio-visual_dev + # Prepare SRE21 eval + hyp_utils/conda_env.sh \ + local/prepare_sre21av_eval_audio.py \ + --corpus-dir $sre21_eval_root \ + --target-fs 8000 \ + --output-path data/sre21_audio_eval \ + --av-output-path data/sre21_audio-visual_eval + +fi + +if [ $stage -le 9 ];then + # Prepare SRE CTS superset + hyp_utils/conda_env.sh \ + local/prepare_sre_cts_superset.py \ + --corpus-dir $sre_superset_root \ + --target-fs 8000 \ + --output-dir data/sre_cts_superset +fi + +if [ $stage -le 10 ];then + # Prepare babel datasets + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_assamese_root \ + --target-fs 8000 \ + --lang-code as-as \ + --output-dir data/babel_assamese + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_bengali_root \ + --target-fs 8000 \ + --lang-code bn-bn \ + --output-dir data/babel_bengali + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_pashto_root \ + --target-fs 8000 \ + --lang-code ps-ps \ + --output-dir data/babel_pashto + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_turkish_root \ + --target-fs 8000 \ + --lang-code tr-tr \ + --output-dir data/babel_turkish + hyp_utils/conda_env.sh \ + 
local/prepare_babel.py \ + --corpus-dir $babel_georgian_root \ + --target-fs 8000 \ + --lang-code ka-ka \ + --output-dir data/babel_georgian + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_vietnam_root \ + --target-fs 8000 \ + --lang-code vi-vi \ + --output-dir data/babel_vietnam + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_haitian_root \ + --target-fs 8000 \ + --lang-code ht-ht \ + --output-dir data/babel_haitian + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_lao_root \ + --target-fs 8000 \ + --lang-code lo-lo \ + --output-dir data/babel_lao + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_tamil_root \ + --target-fs 8000 \ + --lang-code ta-ta \ + --output-dir data/babel_tamil + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_zulu_root \ + --target-fs 8000 \ + --lang-code zul-zul \ + --output-dir data/babel_zulu + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_kurmanji_root \ + --target-fs 8000 \ + --lang-code kur-kur \ + --output-dir data/babel_kurmanji + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_tok_root \ + --target-fs 8000 \ + --lang-code tok-tok \ + --output-dir data/babel_tok + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_kazakh_root \ + --target-fs 8000 \ + --lang-code kk-kk \ + --output-dir data/babel_kazakh + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_telugu_root \ + --target-fs 8000 \ + --lang-code te-te \ + --output-dir data/babel_telugu + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_lithuanian_root \ + --target-fs 8000 \ + --lang-code lt-lt \ + --output-dir data/babel_lithuanian + +fi + +if [ $stage -le 11 ];then + hyp_utils/conda_env.sh \ + local/prepare_some_data_for_lre.py \ + --corpus-dir $fleurs_root \ + --output-dir data/fleurs22 \ + --map-langs-to-lre-codes --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_some_data_for_lre_cat.py \ + --corpus-dir $lwazi_root \ + --output-dir data/lwazi09 \ + --map-langs-to-lre-codes --target-fs 8000 + hyp_utils/conda_env.sh \ + local/prepare_some_data_for_lre_cat.py \ + --corpus-dir $nchlt_root \ + --output-dir data/nchlt14 \ + --map-langs-to-lre-codes --target-fs 8000 + hyp_utils/conda_env.sh \ + local/prepare_some_data_for_lre_cat.py \ + --corpus-dir $ammi_root \ + --output-dir data/ammi20 \ + --map-langs-to-lre-codes --target-fs 8000 +fi + +if [ $stage -le 12 ];then + + hyp_utils/conda_env.sh \ + local/prepare_common_voice_cat.py \ + --corpus-dir $cv22_root \ + --output-dir data/cv22_tir \ + --keep-langs tir-tir \ + --map-langs-to-lre-codes --target-fs 8000 +fi + + +if [ $stage -le 13 ];then + hyp_utils/conda_env.sh \ + local/prepare_common_voice_accents_cat.py \ + --corpus-dir $cv20_root \ + --output-dir data/cv20_eng_ine \ + --lang en \ + --target-fs 8000 + hyp_utils/conda_env.sh \ + local/prepare_common_voice_accents_cat.py \ + --corpus-dir $cv20_root \ + --output-dir data/cv20_fra \ + --lang fr \ + --target-fs 8000 + +fi + +if [ $stage -le 14 ];then + hyp_utils/conda_env.sh \ + local/prepare_adi17.py \ + --corpus-dir $adi_root \ + --output-dir data/adi17 \ + --map-langs-to-lre-codes --target-fs 8000 +fi + +if [ $stage -le 15 ];then + hyp_utils/conda_env.sh \ + local/prepare_ast_cat.py \ + --corpus-dir $ast_root \ + --output-dir data/ast \ + --map-langs-to-lre-codes --target-fs 8000 +fi + +if [ $stage -le 16 ];then + #combine data + 
utils/combine_data.sh \
+ data/babel \
+ data/babel_{a*,b*,g*,k*,l*,p*,t*,v*,zulu}
+
+ utils/combine_data.sh \
+ data/cv \
+ data/cv20_eng_ine data/cv20_fra data/cv22_tir
+
+ utils/combine_data.sh \
+ data/sre16 \
+ data/sre16_train_{dev*,eval*}
+
+ utils/combine_data.sh \
+ data/sre18 \
+ data/sre18_train_{dev*,eval*} data/sre18_dev_unlabeled
+
+ utils/combine_data.sh \
+ data/sre19 \
+ data/sre19_eval_{enroll,test}_cmn2
+
+ utils/combine_data.sh \
+ data/sre21_cts \
+ data/sre21_*_cts
+
+ utils/combine_data.sh \
+ data/sre21_afv \
+ data/sre21_audio*_{dev*,eval*}_afv
+
+ utils/combine_data.sh \
+ data/sre16-21_cts \
+ data/sre1{6,8,9} data/sre21_cts
+
+fi
+
+if [ $stage -le 17 ];then
+ if [ -d ../fixed.v1.8k/lre-scorer ];then
+ ln -s ../fixed.v1.8k/lre-scorer
+ else
+ local/download_lre22_scorer.sh
+ fi
+ if [ -d ../fixed.v1.8k/focal_multiclass ];then
+ ln -s ../fixed.v1.8k/focal_multiclass
+ else
+ local/download_focal.sh
+ fi
+fi
diff --git a/egs/lre22/open.v1.8k/run_002_compute_evad.sh b/egs/lre22/open.v1.8k/run_002_compute_evad.sh
new file mode 100755
index 00000000..f7ccdfa7
--- /dev/null
+++ b/egs/lre22/open.v1.8k/run_002_compute_evad.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Copyright
+# 2018 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+nodes=b1
+storage_name=$(date +'%m_%d_%H_%M')
+vaddir=`pwd`/exp/vad_e
+
+stage=1
+config_file=default_config.sh
+. parse_options.sh || exit 1;
+. $config_file
+
+
+if [ $stage -le 1 ]; then
+ # Prepare to distribute data over multiple machines
+ if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then
+ dir_name=$USER/hyp-data/lre22-fixed-v1.8k-$storage_name/vad/storage
+ if [ "$nodes" == "b0" ];then
+ utils/create_split_dir.pl \
+ /export/b{04,05,06,07}/$dir_name $vaddir/storage
+ elif [ "$nodes" == "b1" ];then
+ utils/create_split_dir.pl \
+ /export/b1{0,1,2,3,4,5,6,7,8,9}/$dir_name $vaddir/storage
+ elif [ "$nodes" == "c0" ];then
+ utils/create_split_dir.pl \
+ /export/c{06,07,08,09}/$dir_name $vaddir/storage
+ elif [ "$nodes" == "fs01" ];then
+ utils/create_split_dir.pl \
+ /export/fs01/$dir_name $vaddir/storage
+ elif [ "$nodes" == "fs05" ];then
+ utils/create_split_dir.pl \
+ /export/fs05/$dir_name $vaddir/storage
+ else
+ echo "we don't distribute data between multiple machines"
+ fi
+ fi
+fi
+
+# VAD Train/Test Datasets
+if [ $stage -le 2 ];then
+ for name in voxlingua107 \
+ lre17_train \
+ lre17_dev_cts lre17_dev_afv \
+ lre17_eval_cts lre17_eval_afv \
+ lre22_dev lre22_eval \
+ babel sre16-21_cts sre21_afv sre_cts_superset \
+ lwazi09 nchlt14 adi17 fleurs22 ammi20 \
+ ast cv
+ do
+ num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}')
+ nj=$(($num_spk < 40 ? $num_spk:40))
+ hyp_utils/feats/make_evad.sh --write-utt2num-frames true \
+ --vad-config $vad_config --nj $nj --cmd "$train_cmd" \
+ data/${name} exp/make_vad/$name $vaddir
+ utils/fix_data_dir.sh data/${name}
+ done
+fi
+
diff --git a/egs/lre22/open.v1.8k/run_003_prepare_noises_rirs.sh b/egs/lre22/open.v1.8k/run_003_prepare_noises_rirs.sh
new file mode 100755
index 00000000..09f01f4d
--- /dev/null
+++ b/egs/lre22/open.v1.8k/run_003_prepare_noises_rirs.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Copyright
+# 2020 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+config_file=default_config.sh
+. parse_options.sh || exit 1;
+. $config_file
+. 
datapath.sh + +# We prepare the noise files and RIR for online speech augmentation +if [ $stage -le 1 ]; then + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root 8 data + + for name in musan_noise musan_music + do + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ + --storage_name lre22-open-v1.8k-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_proc_audio exp/${name}_proc_audio + utils/fix_data_dir.sh data/${name}_proc_audio + done + +fi + +if [ $stage -le 2 ]; then + + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ + --storage_name lre22-open-v1.8k-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_babble exp/${name}_babble + # utils/fix_data_dir.sh data/${name}_babble + done +fi + +if [ $stage -le 3 ]; then + if [ ! -d "RIRS_NOISES" ]; then + if [ -d ../fixed.v1.8k/RIRS_NOISES ];then + ln -s ../fixed.v1.8k/RIRS_NOISES + else + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + fi + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 8 data/rirs_smallroom + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 8 data/rirs_mediumroom + local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 8 data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + #pack all rirs in h5 files + steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs + done + +fi + + diff --git a/egs/lre22/open.v1.8k/run_004_apply_codecs.sh b/egs/lre22/open.v1.8k/run_004_apply_codecs.sh new file mode 100755 index 00000000..6efc016b --- /dev/null +++ b/egs/lre22/open.v1.8k/run_004_apply_codecs.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file + +if [ $stage -le 1 ];then + + for data in voxlingua107 \ + lre17_dev_afv lre17_eval_afv \ + sre21_afv ast cv \ + lwazi09 nchlt14 adi17 fleurs22 ammi20 + do + hyp_utils/conda_env.sh \ + local/apply_tel_codecs_to_kaldi_datadir.py \ + --input-dir data/$data \ + --output-dir data/${data}_codecs + done + +fi diff --git a/egs/lre22/open.v1.8k/run_010_prepare_xvec_train_data.sh b/egs/lre22/open.v1.8k/run_010_prepare_xvec_train_data.sh new file mode 100755 index 00000000..9f3eff6c --- /dev/null +++ b/egs/lre22/open.v1.8k/run_010_prepare_xvec_train_data.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file
+
+if [ $stage -le 1 ]; then
+ # This script preprocesses the audio for x-vector training
+ for name in voxlingua107_codecs \
+ lre17_train \
+ lre17_{dev,eval}_{cts,afv,afv_codecs} \
+ babel sre16-21_cts sre_cts_superset \
+ sre21_afv_codecs cv_codecs adi17_codecs \
+ lwazi09{,_codecs} nchlt14{,_codecs} fleurs22{,_codecs} ammi20{,_codecs} ast{,_codecs}
+ do
+ steps_xvec/preprocess_audios_for_nnet_train.sh \
+ --nj 40 --cmd "$train_cmd" \
+ --storage_name lre22-fixed-v1.8k-$(date +'%m_%d_%H_%M') --use-bin-vad true \
+ data/${name} data/${name}_proc_audio_no_sil exp/${name}_proc_audio_no_sil
+ utils/fix_data_dir.sh data/${name}_proc_audio_no_sil
+ done
+fi
+
+if [ $stage -le 2 ];then
+ utils/combine_data.sh \
+ data/lre17_proc_audio_no_sil \
+ data/lre17_train_proc_audio_no_sil \
+ data/lre17_{dev,eval}_{cts,afv,afv_codecs}_proc_audio_no_sil
+
+ utils/combine_data.sh \
+ data/babel_sre_proc_audio_no_sil \
+ data/{babel,sre16-21_cts,sre21_afv_codecs,sre_cts_superset}_proc_audio_no_sil
+
+ utils/combine_data.sh \
+ data/others_afr_proc_audio_no_sil \
+ data/adi17_proc_audio_no_sil \
+ data/{lwazi09,nchlt14,fleurs22,ammi20,ast}{,_codecs}_proc_audio_no_sil
+fi
+
+if [ $stage -le 3 ]; then
+ # Now, we remove files shorter than 3s
+ hyp_utils/remove_short_audios.sh --min-len 3 data/voxlingua107_codecs_proc_audio_no_sil
+ hyp_utils/remove_short_audios.sh --min-len 3 data/lre17_proc_audio_no_sil
+ hyp_utils/remove_short_audios.sh --min-len 3 data/babel_sre_proc_audio_no_sil
+ hyp_utils/remove_short_audios.sh --min-len 3 data/others_afr_proc_audio_no_sil
+ hyp_utils/remove_short_audios.sh --min-len 3 data/cv_codecs_proc_audio_no_sil
+fi
+
+if [ $stage -le 4 ];then
+ # merge all data
+ utils/combine_data.sh \
+ data/open_proc_audio_no_sil \
+ data/{voxlingua107_codecs,lre17,babel_sre,cv_codecs,others_afr}_proc_audio_no_sil
+fi
+
+
+if [ $stage -le 5 ]; then
+ for name in open_proc_audio_no_sil
+ do
+ hyp_utils/conda_env.sh \
+ local/split_segments_train_val.py \
+ --segments-file data/$name/utt2lang \
+ --recordings-file data/$name/wav.scp \
+ --durations-file data/$name/utt2dur \
+ --val-percent 2. \
+ --remove-langs fra-mix ara-ary en-en es-es pt-pt ar-ar \
+ --output-dir data/$name/train_val_split
+ done
+fi
+
+exit
+
+# if [ $stage -le 6 ]; then
+# awk 'BEGIN{
+# adapt_langs_list="ara-acm ara-aeb ara-apc ara-arq ara-ary ara-arz ara-ayl ara-jor ara-ksa ara-kuw ara-leb ara-mau ara-mor ara-oma ara-pal ara-qat ara-sud ara-syr ara-uae ara-yem fra-can fra-fra fra-ntf eng-ens eng-gbr eng-iaf eng-ine eng-usg eng-zho afr-afr nbl-nbl orm-orm tir-tir tso-tso ven-ven xho-xho zul-zul";
+# nf=split(adapt_langs_list, f, " ");
+# for(i=1;i<=nf;i++){ adapt_langs[f[i]]=1;};
+# FS=","; OFS=",";
+# getline; print $0;
+# }
+# { if ($1 in adapt_langs) { $3="1."} else{ $3="0.01"}; print $0}' \
+# data/open_proc_audio_no_sil/train_val_split/class_file.csv > \
+# data/open_proc_audio_no_sil/train_val_split/class_file_adapt_1.csv
+# fi
diff --git a/egs/lre22/open.v1.8k/run_011_train_xvector.sh b/egs/lre22/open.v1.8k/run_011_train_xvector.sh
new file mode 100755
index 00000000..4b3f9642
--- /dev/null
+++ b/egs/lre22/open.v1.8k/run_011_train_xvector.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+# Copyright
+# 2019 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+ngpu=4
+config_file=default_config.sh
+interactive=false
+num_workers=""
+use_tb=false
+use_wandb=false
+
+. parse_options.sh || exit 1;
+. $config_file
+. 
datapath.sh + +list_dir=data/${nnet_data}_proc_audio_no_sil + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project lre22-fixed-v1.8k --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + mkdir -p $nnet_s1_dir/log + if [ ! -f "$nnet_s0" ];then + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \ + --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu #--master-port 3456 + else + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \ + --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \ + --in-model-file $nnet_s0 \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu + + fi + +fi + + +# Class-balanced Fine-tuning +if [ $stage -le 2 ] && [ $max_stage -le 2 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \ + --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir \ + --num-gpus $ngpu + +fi diff --git a/egs/lre22/open.v1.8k/run_030_extract_xvectors.sh b/egs/lre22/open.v1.8k/run_030_extract_xvectors.sh new file mode 100755 index 00000000..227d1047 --- /dev/null +++ b/egs/lre22/open.v1.8k/run_030_extract_xvectors.sh @@ -0,0 +1,219 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=2 +nnet_stage=1 +config_file=default_config.sh +use_gpu=false +do_tsne=false +split_dev=false +xvec_chunk_length=12800 +. parse_options.sh || exit 1; +. 
$config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu true --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --mem 4G" +else + xvec_cmd="$train_cmd --mem 12G" +fi + +if [ $nnet_stages -lt $nnet_stage ];then + nnet_stage=$nnet_stages +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +xvector_dir=exp/xvectors/$nnet_name + +# if [ $stage -le 1 ]; then +# # Extract xvectors for training +# for name in lre17_proc_audio_no_sil \ +# voxlingua107_codecs_proc_audio_no_sil \ +# babel_sre_proc_audio_no_sil \ +# cv_codecs_proc_audio_no_sil \ +# others_afr_proc_audio_no_sil +# do +# steps_xvec/extract_xvectors_from_wav.sh \ +# --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ +# --use-bin-vad false \ +# --random-utt-length true --min-utt-length 300 --max-utt-length 3000 \ +# --feat-config $feat_config \ +# $nnet data/${name} \ +# $xvector_dir/${name} +# done +# fi + +if [ $stage -le 2 ]; then + # Extract xvectors for training + for name in lre22_dev + do + steps_xvec/extract_xvectors_from_wav.sh \ + --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ + --use-bin-vad true --num-augs 10 --aug-config conf/reverb_noise_aug.yaml \ + --random-utt-length true --min-utt-length 300 --max-utt-length 3000 \ + --feat-config $feat_config \ + $nnet data/${name} \ + $xvector_dir/${name}_aug \ + data/${name}_aug + done +fi + + +if [ $stage -le 3 ]; then + # Extracts x-vectors for dev and eval + for name in lre22_dev lre22_eval + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 100 ? $num_spk:100)) + steps_xvec/extract_xvectors_from_wav.sh \ + --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ + --feat-config $feat_config \ + $nnet data/$name \ + $xvector_dir/$name + done +fi + +if [ $stage -le 4 ]; then + for name in lre22_dev + do + if [ "$do_tsne" == "true" ] || [ "$split_dev" == "true" ];then + $train_cmd \ + $xvector_dir/$name/tsne/tsne.log \ + hyp_utils/conda_env.sh \ + plot_embedding_tsne.py \ + --train-list data/$name/utt2lang \ + --train-v-file scp:$xvector_dir/$name/xvector.scp \ + --output-dir $xvector_dir/$name/tsne \ + --pca-var-r 0.975 \ + --lnorm \ + --prob-plot 1. \ + --tsne.metric cosine \ + --tsne.early-exaggeration 12 --tsne.perplexity 30 + + $train_cmd \ + $xvector_dir/$name/tsne_per_class/tsne.log \ + hyp_utils/conda_env.sh \ + plot_embedding_tsne_per_class.py \ + --train-list data/$name/utt2lang \ + --train-v-file scp:$xvector_dir/$name/xvector.scp \ + --output-dir $xvector_dir/$name/tsne_per_class \ + --pca-var-r 0.975 \ + --lnorm \ + --prob-plot 1. \ + --tsne.metric cosine \ + --tsne.early-exaggeration 12 --tsne.perplexity 30 \ + --do-ahc --cluster-tsne --ahc-thr -5 + + if [ "$split_dev" == "true" ];then + hyp_utils/conda_env.sh \ + local/split_dev.py \ + --segs-file $xvector_dir/$name/tsne_per_class/segments.csv \ + --output-dir ./resources/dev_splits \ + --num-folds 2 + + # delete the split data dirs so they are regenerated later + rm -rf data/lre22_dev_p{1,2} + + fi + fi + done +fi + +if [ $stage -le 5 ]; then + if [ ! 
-d data/lre22_dev_p1 ];then + awk -F "," '$1!="id" { print $1}' \ + ./resources/dev_splits/fold_0/train_segments.csv \ + > p1.lst + awk -F "," '$1!="id" { print $1}' \ + ./resources/dev_splits/fold_0/test_segments.csv \ + > p2.lst + + for p in p1 p2 + do + utils/subset_data_dir.sh \ + --utt-list $p.lst \ + data/lre22_dev data/lre22_dev_$p + done + fi +fi + +if [ $stage -le 6 ]; then + if [ -d data/lre22_dev_aug ] && [ ! -d data/lre22_dev_aug_p1 ];then + awk -v fsegs=./resources/dev_splits/fold_0/train_segments.csv ' +BEGIN{FS=","; +getline; +while(getline < fsegs) +{ + segs[$1] +} +FS=" "; +} +{ if($2 in segs){ print $1}}' data/lre22_dev_aug/augm2clean \ + > p1.lst + + awk -v fsegs=./resources/dev_splits/fold_0/test_segments.csv ' +BEGIN{FS=","; +getline; +while(getline < fsegs) +{ + segs[$1]=1; +} +FS=" "; +} +{ if($2 in segs){ print $1}}' data/lre22_dev_aug/augm2clean \ + > p2.lst + + for p in p1 p2 + do + utils/subset_data_dir.sh \ + --utt-list $p.lst \ + data/lre22_dev_aug data/lre22_dev_aug_$p + done + fi +fi + +if [ $stage -le 7 ];then + if [ -f $xvector_dir/lre22_dev_aug/xvector.scp ];then + mkdir -p $xvector_dir/lre22_dev_aug_clean + cat $xvector_dir/lre22_dev/xvector.scp \ + $xvector_dir/lre22_dev_aug/xvector.scp \ + > $xvector_dir/lre22_dev_aug_clean/xvector.scp + + for p in "" _p1 _p2 + do + if [ ! -d data/lre22_dev_aug_clean$p ]; then + utils/combine_data.sh \ + data/lre22_dev_aug_clean$p \ + data/lre22_dev$p \ + data/lre22_dev_aug$p + fi + done + fi +fi + +exit diff --git a/egs/lre22/open.v1.8k/run_040_be_final.sh b/egs/lre22/open.v1.8k/run_040_be_final.sh new file mode 100755 index 00000000..fe5b6f18 --- /dev/null +++ b/egs/lre22/open.v1.8k/run_040_be_final.sh @@ -0,0 +1,434 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh +. parse_options.sh || exit 1; +. 
$config_file + +if [ $nnet_stages -lt $nnet_stage ];then + nnet_stage=$nnet_stages +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +fi + +xvector_dir=exp/xvectors/$nnet_name +be_base_dir=exp/be/$nnet_name +score_base_dir=exp/scores/$nnet_name + +if [ $stage -le 1 ];then + for r in 1 #0.9999 0.999 #0.99 0.975 0.95 + do + be_name=pca${r}_cw_lnorm_lgbe_lre22_aug + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd \ + $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --pca.pca-var-r $r \ + --do-lnorm --whiten \ + --output-dir $be_dir_p + + $train_cmd \ + ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + + + ) & + + done + + ( + $train_cmd \ + $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --pca.pca-var-r $r \ + --do-lnorm --whiten \ + --output-dir $be_dir + + $train_cmd \ + ${score_dir}_p12/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd \ + ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + local/score_lre22.sh dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + ) & + + + done + wait + +fi + +exit +# Back-ends 
below over-fitted + +if [ $stage -le 2 ];then + for r in 1 + do + for penalty in l2 #l1 + do + for c in 1 #0.1 1 + do + for ary_thr in 0.975 #0.85 0.7 #0.99 0.95 0.9 #15 ##1 5 10 20 + do + be_name=pca${r}_cw_lnorm_lsvm_${penalty}_c${c}_sqhinge_lre22_aug_lre17_aryt${ary_thr} + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd \ + $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v3.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.penalty $penalty --svm.c $c --svm.dual false \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir_p + + $train_cmd \ + ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + ) & + done + ( + $train_cmd \ + $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v3.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.penalty $penalty --svm.c $c --svm.dual false \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir + + $train_cmd \ + ${score_dir}/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd \ + ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + local/score_lre22.sh \ + dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh \ + dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + ) & + done + done + done + done + wait + +fi + +if [ 
$stage -le 3 ];then + for r in 1 # 0.9999 0.99 0.975 0.95 0.9 0.8 + do + for shrinking in true #false + do + for c in 1 10 #0.1 1 10 #0.01 0.1 1 10 # 0.0001 + do + for vl in false #true #false + do + if [ "$vl" == "true" ];then + do_vl="--do-vl" + else + do_vl="--no_do-vl" + fi + ary_thr=0.975 + be_name=pca${r}_cw_lnorm_gsvm_shrinking_${shrinking}_c${c}_lre17_aryt${ary_thr}_vl${vl}_aug_clean + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + #score_dir=$score_base_dir/${be_name}_logpost + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --voxlingua-v-file scp:$xvector_dir/voxlingua107_codecs_proc_audio_no_sil/xvector.scp \ + --voxlingua-list data/voxlingua107_codecs_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.shrinking $shrinking --svm.c $c --svm.break_ties false --svm.max-iter 500\ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir_p \ + --do-lre17 $do_vl + + $train_cmd ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --svm.eval-type cat-log-post \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + ) & + done + ( + $train_cmd $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --voxlingua-v-file scp:$xvector_dir/voxlingua107_codecs_proc_audio_no_sil/xvector.scp \ + --voxlingua-list data/voxlingua107_codecs_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.shrinking $shrinking --svm.c $c --svm.break_ties false --svm.max-iter 500 \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir \ + --do-lre17 $do_vl + + $train_cmd ${score_dir}/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --svm.eval-type cat-log-post \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --svm.eval-type cat-log-post \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + 
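+		  # Score the full-data back-end without calibration (nocal), then
+		  # apply the calibration trained on the 2-fold splits (cal.mat)
+		  # and re-score (cal_v1).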
local/score_lre22.sh \
+		      dev \
+		      ${score_dir}/nocal/lre22_dev_scores.tsv \
+		      ${score_dir}/nocal/lre22_dev_results
+		  local/score_lre22.sh \
+		      eval \
+		      ${score_dir}/nocal/lre22_eval_scores.tsv \
+		      ${score_dir}/nocal/lre22_eval_results
+
+		  local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat
+		  local/score_lre22.sh \
+		      dev \
+		      ${score_dir}/cal_v1/lre22_dev_scores.tsv \
+		      ${score_dir}/cal_v1/lre22_dev_results
+		  local/score_lre22.sh \
+		      eval \
+		      ${score_dir}/cal_v1/lre22_eval_scores.tsv \
+		      ${score_dir}/cal_v1/lre22_eval_results
+
+		  # local/validate_lre22.sh \
+		  #     ${score_dir}/cal_v1/lre22_eval_scores.tsv
+
+
+	      ) &
+	  done
+      done
+      done
+  done
+  wait
+
+fi
 diff --git a/egs/lre22/open.v1.8k/run_050_fusion_v1.sh b/egs/lre22/open.v1.8k/run_050_fusion_v1.sh new file mode 100755 index 00000000..5f9a1624 --- /dev/null +++ b/egs/lre22/open.v1.8k/run_050_fusion_v1.sh @@ -0,0 +1,43 @@
+#!/bin/bash
+# Copyright
+#                2020   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+score_dir_0=exp/scores
+nnet_1=fbank64_stmn_ecapatdnn2048x4_v1.0.s1
+nnet_2=fbank64_stmn_fwseres2net50s8_v1.0.s1
+be_1=pca1_cw_lnorm_lgbe_lre22_aug
+score_dirs="$score_dir_0/$nnet_1/$be_1
+$score_dir_0/$nnet_2/$be_1"
+
+train_score_dirs=$(echo $score_dirs | awk '{ for(i=1;i<=NF;i++){ $i=$i"_p12/cal_v1" }; print $0}')
+test_score_dirs=$(echo $score_dirs | awk '{ for(i=1;i<=NF;i++){ $i=$i"/cal_v1" }; print $0}')
+
+output_dir=exp/fusion/fus_v1.0
+
+local/train_fusion_lre22.sh "$train_score_dirs" $output_dir/train
+local/score_lre22.sh \
+  dev \
+  ${output_dir}/train/lre22_dev_scores.tsv \
+  ${output_dir}/train/lre22_dev_results
+
+local/eval_fusion_lre22.sh "$test_score_dirs" $output_dir/train/fus.mat $output_dir/test
+
+local/score_lre22.sh \
+  dev \
+  ${output_dir}/test/lre22_dev_scores.tsv \
+  ${output_dir}/test/lre22_dev_results
+
+local/score_lre22.sh eval \
+  ${output_dir}/test/lre22_eval_scores.tsv \
+  ${output_dir}/test/lre22_eval_results
+
+
+
+
+
 diff --git a/egs/lre22/open.v1.8k/steps b/egs/lre22/open.v1.8k/steps new file mode 120000 index 00000000..aede39fe --- /dev/null +++ b/egs/lre22/open.v1.8k/steps @@ -0,0 +1 @@ +hyp_utils/kaldi/steps \ No newline at end of file diff --git a/egs/lre22/open.v1.8k/steps_be b/egs/lre22/open.v1.8k/steps_be new file mode 120000 index 00000000..48aedc5a --- /dev/null +++ b/egs/lre22/open.v1.8k/steps_be @@ -0,0 +1 @@ +../fixed.v1.8k/steps_be \ No newline at end of file diff --git a/egs/lre22/open.v1.8k/steps_xvec b/egs/lre22/open.v1.8k/steps_xvec new file mode 120000 index 00000000..af66a94d --- /dev/null +++ b/egs/lre22/open.v1.8k/steps_xvec @@ -0,0 +1 @@ +hyp_utils/xvectors \ No newline at end of file diff --git a/egs/lre22/open.v1.8k/utils b/egs/lre22/open.v1.8k/utils new file mode 120000 index 00000000..3d590a1d --- /dev/null +++ b/egs/lre22/open.v1.8k/utils @@ -0,0 +1 @@ +hyp_utils/kaldi/utils \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/README.md b/egs/lre22/open.v2.8k/README.md new file mode 100644 index 00000000..c500d811 --- /dev/null +++ b/egs/lre22/open.v2.8k/README.md @@ -0,0 +1,58 @@
+# LRE22 Open Condition V2
+
+Recipe for the NIST LRE22 open condition based on the JHU-MIT Submission, using a Hugging Face Wav2Vec2 + x-vector model.
+
+## Citing
+```
+@inproceedings{villalba23_interspeech,
+  author={Jesús Villalba and Jonas Borgstrom and Maliha Jahan and Saurabh Kataria and Leibny Paola Garcia and Pedro Torres-Carrasquillo and Najim Dehak},
+  title={{Advances in Language Recognition in Low Resource African Languages: The JHU-MIT Submission for NIST LRE22}},
+  year=2023,
+  booktitle={Proc. INTERSPEECH 2023},
+  pages={521--525},
+  doi={10.21437/Interspeech.2023-1094}
+}
+```
+
+## Training Data
+
+  - x-Vector networks trained on:
+    - VoxLingua107
+    - NIST LRE17 Train + Dev + Eval / CTS + AfV without Maghrebi Arabic
+    - NIST SRE16
+    - NIST SRE18
+    - NIST SRE19 CMN2
+    - NIST SRE21
+    - NIST SRE CTS Superset
+    - IARPA Babel
+    - Fleurs
+    - LWAZI 2009
+    - NCHLT 2014
+    - AMMI 2020
+    - CommonVoice Tigrinya, Indian English, French
+    - ADI 2017
+    - AST
+  - Gaussian back-end trained on:
+    - NIST LRE22 dev with 2-fold cross-val + x10 augmentations
+
+## Usage
+
+  - Run the run_0*.sh scripts in sequence
+  - By default it uses Wav2Vec2 XLSR 300M
+  - To change the default network, run the scripts with the --config-file argument:
+```bash
+run_011_train_xvector.sh --config-file global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh
+run_030_extract_xvectors.sh --config-file global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh --use-gpu true
+run_040_be_final.sh --config-file global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh
+```
+
+## Results
+
+| Config | Model Type | Model Details | Back-end | Dev MinCp | Dev ActCp | Eval MinCp | Eval ActCp |
+| ------ | ---------- | ------------- | -------- | :-------: | :-------: | :--------: | :--------: |
+| config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh | ECAPA-TDNN 2048x4 | Stage-1 | GBE | 0.100 | 0.101 | 0.105 | 0.106 |
+| config_fbank64_stmn_fwseres2net50s8_v1.0.sh | fw-SE Res2Net50 scale=8 | Stage-1 | GBE | 0.092 | 0.093 | 0.103 | 0.104 |
+| Fusion ECAPA-TDNN + FwSE Res2Net50 | | | FoCal | 0.082 | 0.083 | 0.089 | 0.090 |
+| config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh | Wav2Vec2 XLSR 300M + ECAPA-TDNN 1024x3 | Stage-1 | GBE | 0.088 | 0.089 | 0.106 | 0.107 |
+| " | " | Stage-2 | GBE | 0.083 | 0.085 | 0.089 | 0.090 |
+| Fusion ECAPA-TDNN + FwSE Res2Net50 + Wav2Vec2 | | | FoCal | 0.069 | 0.072 | 0.076 | 0.077 |
 diff --git a/egs/lre22/open.v2.8k/cmd.sh b/egs/lre22/open.v2.8k/cmd.sh new file mode 100755 index 00000000..15e4a015 --- /dev/null +++ b/egs/lre22/open.v2.8k/cmd.sh @@ -0,0 +1,28 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
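+#
+# A minimal sketch for a machine with no queueing system (hypothetical local
+# setup): replace the queue.pl exports below with Kaldi's run.pl, e.g.,
+#   export train_cmd=run.pl
+#   export cuda_cmd=run.pl
+#   export cuda_eval_cmd=run.pl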
+
+if [ "$(hostname -d)" == "cm.gemini" ];then
+  #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G"
+  export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G"
+  #export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G"
+  #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 40G"
+  export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G"
+  export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G"
+  # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G"
+else
+  export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01][234589]*\" -V"
+  export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V"
+  export cuda_eval_cmd="$train_cmd"
+fi
+
+
 diff --git a/egs/lre22/open.v2.8k/conf/clsp.conf b/egs/lre22/open.v2.8k/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/clsp.conf @@ -0,0 +1,11 @@
+
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0 # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1 # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*'
+option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0'
 diff --git a/egs/lre22/open.v2.8k/conf/coe_gpu_bigmem.conf b/egs/lre22/open.v2.8k/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@
+
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V
+option mem=* -l mem_free=$0
+option mem=0 # Do not add anything to qsub_opts
+option num_threads=* -l num_proc=$0
+option num_threads=1 # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]*
+option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]*
 diff --git a/egs/lre22/open.v2.8k/conf/coe_gpu_long.conf b/egs/lre22/open.v2.8k/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@
+
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V
+option mem=* -l mem_free=$0
+option mem=0 # Do not add anything to qsub_opts
+option num_threads=* -l num_proc=$0
+option num_threads=1 # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]*
+option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]*
+
+
 diff --git a/egs/lre22/open.v2.8k/conf/coe_gpu_rtx.conf b/egs/lre22/open.v2.8k/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@
+
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V
+option mem=* -l mem_free=$0
+option mem=0 # Do not add anything to qsub_opts
+option num_threads=* -l num_proc=$0
+option num_threads=1 # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0 -q all.q -l h_rt=100:00:00
+option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx
 diff --git a/egs/lre22/open.v2.8k/conf/coe_gpu_short.conf b/egs/lre22/open.v2.8k/conf/coe_gpu_short.conf new file mode 100644 index 
00000000..81de5cb7 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/lre22/open.v2.8k/conf/coe_gpu_v100.conf b/egs/lre22/open.v2.8k/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/lre22/open.v2.8k/conf/reverb_noise_aug.yaml b/egs/lre22/open.v2.8k/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml b/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml new file mode 100644 index 00000000..d33e30f4 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml @@ -0,0 +1,56 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml +trainer: + optim: + opt_type: sgd + lr: 0.04 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 18000 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 4000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 12 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd diff --git 
a/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage2_v1.0.yaml b/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage2_v1.0.yaml new file mode 100644 index 00000000..090093b3 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage2_v1.0.yaml @@ -0,0 +1,62 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +model: + xvector: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 32.0 + margin: 0.0 + margin_warmup_epochs: 0 + intertop_margin: 0. +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 20000 + min_lr: 1e-6 + warmup_steps: 10000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 6 + eff_batch_size: 512 + train_mode: full diff --git a/egs/lre22/open.v2.8k/conf/vad_8k.yaml b/egs/lre22/open.v2.8k/conf/vad_8k.yaml new file mode 100644 index 00000000..1cfe34b0 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/vad_8k.yaml @@ -0,0 +1,9 @@ +sample_frequency: 8000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: -4.89 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 +wav_scale: 1 diff --git a/egs/lre22/open.v2.8k/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml b/egs/lre22/open.v2.8k/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml new file mode 100644 index 00000000..beb687d2 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml @@ -0,0 +1,63 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +feat_fuser: + feat_fuser: + fuser_type: weighted-avg + mvn: + norm_mean: false + spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. + freq_mask_min_width: 0 + freq_mask_max_width: 10 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +xvector: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + dropout_rate: 0.05 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. 
+ dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_start: 2 diff --git a/egs/lre22/open.v2.8k/datapath.sh b/egs/lre22/open.v2.8k/datapath.sh new file mode 100644 index 00000000..02e2ddd4 --- /dev/null +++ b/egs/lre22/open.v2.8k/datapath.sh @@ -0,0 +1,87 @@ +# Copyright +# 2022 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + +#paths to databases + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + ldc_root3=/export/fs02/corpora3/LDC + ldc_root5=/export/corpora5/LDC + ldc_root=/export/corpora6/LDC + sre16_dev_root=$ldc_root/LDC2019S20/data/dev/R148_0_0 + sre16_eval_root=$ldc_root/LDC2019S20/data/eval/R149_0_1 + sre18_dev_root=$ldc_root5/LDC2018E46 + sre18_eval_root=$ldc_root3/LDC2018E51 + sre19cmn2_eval_root=$ldc_root3/LDC2019E58 + sre_superset_root=$ldc_root/LDC2021E08 + sre21_dev_root=$ldc_root/LDC2021E09 + sre21_eval_root=$ldc_root/LDC2021E10 + lre17_train_root=$ldc_root/LDC2022E16_2017_NIST_Language_Recognition_Evaluation_Training_and_Development_Sets + lre17_eval_root=$ldc_root/LDC2022E17_2017_NIST_Language_Recognition_Evaluation_Test_Set + lre22_dev_root=$ldc_root/LDC2022E14_2022_NIST_Language_Recognition_Evaluation_Development_Data + lre22_eval_root=/export/corpora6/lre22_test_data_v2 + voxlingua_root=/export/corpora6/voxlingua107 + musan_root=/export/corpora5/JHU/musan + babel_assamese_root=$ldc_root/LDC2016S06 + babel_bengali_root=$ldc_root/LDC2016S08 + babel_pashto_root=$ldc_root/LDC2016S09 + babel_turkish_root=$ldc_root/LDC2016S10 + babel_georgian_root=$ldc_root/LDC2016S12 + babel_vietnam_root=$ldc_root/LDC2017S01 + babel_haitian_root=$ldc_root/LDC2017S03 + babel_lao_root=$ldc_root/LDC2017S08 + babel_tamil_root=$ldc_root/LDC2017S13 + babel_zulu_root=$ldc_root/LDC2017S19 + babel_kurmanji_root=$ldc_root/LDC2017S22 + babel_tok_root=$ldc_root/LDC2018S02 + babel_kazakh_root=$ldc_root/LDC2018S13 + babel_telugu_root=$ldc_root/LDC2018S16 + babel_lithuanian_root=$ldc_root/LDC2019S03 + fleurs_root=/export/corpora6/LRE/FLEURS2022 + lwazi_root=/export/corpora6/LRE/Lwazi2009 + nchlt_root=/export/corpora6/LRE/NCHLT2014 + ammi_root=/export/corpora6/LRE/AMMI2020 + cv20_root=/export/corpora5/mozilla-common-voice/cv-corpus-5.1-2020-06-22 + cv22_root=/export/corpora6/LRE/CommonVoice2020/cv-corpus-11.0-2022-09-21 + adi_root=/export/corpora6/ADI17 + ast_root=/export/corpora6/LRE/AST2004 +elif [ "$(hostname --domain)" == "cm.gemini" ];then + ldc_root=/export/common/data/corpora/LDC + sre_root=/export/common/data/corpora/NIST/SRE + my_root=/exp/jvillalba/corpora + sre16_dev_root=/exp/jvillalba/corpora/LDC2019S20/data/dev/R148_0_0 + sre16_eval_root=/exp/jvillalba/corpora/LDC2019S20/data/eval/R149_0_1 + sre18_dev_root=$sre_root/SRE18/LDC2018E46_2018_NIST_Speaker_Recognition_Evaluation_Development_Set + sre18_eval_root=$sre_root/SRE18/Eval/LDC2018E51 + sre19cmn2_eval_root=/exp/jvillalba/corpora/LDC2019E58 + sre_superset_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E08 + sre21_dev_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E09 + sre21_eval_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E10 + lre17_train_root=$my_root/LDC2022E16_2017_NIST_Language_Recognition_Evaluation_Training_and_Development_Sets + lre17_eval_root=$my_root/LDC2022E17_2017_NIST_Language_Recognition_Evaluation_Test_Set + lre22_dev_root=$my_root/LDC2022E14_2022_NIST_Language_Recognition_Evaluation_Development_Data + lre22_eval_root=$my_root/lre22_test_data_v2 + voxlingua_root=$my_root/voxlingua107 + 
musan_root=/export/common/data/corpora/MUSAN/musan
+  babel_assamese_root=$ldc_root/LDC2016S06
+  babel_bengali_root=$ldc_root/LDC2016S08
+  babel_pashto_root=$ldc_root/LDC2016S09
+  babel_turkish_root=$my_root/LDC2016S10
+  babel_georgian_root=$my_root/LDC2016S12
+  babel_vietnam_root=$my_root/LDC2017S01
+  babel_haitian_root=$my_root/LDC2017S03
+  babel_lao_root=$ldc_root/LDC2017S08
+  babel_tamil_root=$ldc_root/LDC2017S13
+  babel_zulu_root=$ldc_root/LDC2017S19
+  babel_kurmanji_root=$ldc_root/LDC2017S22
+  babel_tok_root=$my_root/LDC2018S02
+  babel_kazakh_root=$ldc_root/LDC2018S13
+  babel_telugu_root=$ldc_root/LDC2018S16
+  babel_lithuanian_root=$my_root/LDC2019S03
+  adi_root=/exp/jvillalba/corpora/ADI17
+
+else
+  echo "Put your database paths here"
+  exit 1
+fi
 diff --git a/egs/lre22/open.v2.8k/default_config.sh b/egs/lre22/open.v2.8k/default_config.sh new file mode 120000 index 00000000..94d038cf --- /dev/null +++ b/egs/lre22/open.v2.8k/default_config.sh @@ -0,0 +1 @@ +global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh b/egs/lre22/open.v2.8k/global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh new file mode 100644 index 00000000..910b4bad --- /dev/null +++ b/egs/lre22/open.v2.8k/global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh @@ -0,0 +1,31 @@
+# Wav2Vec2 XLS-R 300M (facebook/wav2vec2-xls-r-300m) + ECAPA-TDNN 1024x3
+
+# hugging face model
+hf_model_name=wav2vec2xlsr300m
+
+#vad
+vad_config=conf/vad_8k.yaml
+
+# x-vector training
+nnet_data=open
+
+# x-vector cfg
+nnet_stages=2
+nnet_type=hf_wav2vec2resnet1d
+
+nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml
+nnet_s1_args=""
+
+nnet_name=${hf_model_name}_ecapatdnn1024x3_v1.0
+nnet_s1_name=$nnet_name.s1
+
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0012.pth
+
+nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage2_v1.0.yaml
+nnet_s2_args=""
+nnet_name=${hf_model_name}_ecapatdnn1024x3_v1.0
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/model_ep0006.pth
+
 diff --git a/egs/lre22/open.v2.8k/hyp_utils b/egs/lre22/open.v2.8k/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/lre22/open.v2.8k/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/local b/egs/lre22/open.v2.8k/local new file mode 120000 index 00000000..c2a3fdea --- /dev/null +++ b/egs/lre22/open.v2.8k/local @@ -0,0 +1 @@ +../fixed.v1.8k/local \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/path.sh b/egs/lre22/open.v2.8k/path.sh new file mode 100644 index 00000000..6994fdab --- /dev/null +++ b/egs/lre22/open.v2.8k/path.sh @@ -0,0 +1,5 @@
+
+export HYP_ROOT=$(readlink -f `pwd -P`/../../..)
+export TOOLS_ROOT=$HYP_ROOT/tools
+
+. $TOOLS_ROOT/path.sh
 diff --git a/egs/lre22/open.v2.8k/resources b/egs/lre22/open.v2.8k/resources new file mode 120000 index 00000000..113b3492 --- /dev/null +++ b/egs/lre22/open.v2.8k/resources @@ -0,0 +1 @@ +../fixed.v1.8k/resources \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/run_001_prepare_data.sh b/egs/lre22/open.v2.8k/run_001_prepare_data.sh new file mode 100755 index 00000000..bb64cdbe --- /dev/null +++ b/egs/lre22/open.v2.8k/run_001_prepare_data.sh @@ -0,0 +1,342 @@
+#!/bin/bash
+# Copyright
+#                2018   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
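+#
+# Prepares the training and evaluation corpora for the open condition:
+# VoxLingua107, LRE17 train/dev/eval, LRE22 dev/eval, SRE16-21 and the
+# SRE CTS superset, IARPA Babel, FLEURS, Lwazi, NCHLT, AMMI, CommonVoice,
+# ADI17 and AST; the final stages combine them into merged data directories
+# and link the LRE22 scorer and FoCal tools.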
+# +. ./cmd.sh +. ./path.sh +set -e + +config_file=default_config.sh +stage=1 + +. parse_options.sh || exit 1; +. datapath.sh + + +if [ $stage -le 1 ];then + # Prepares voxlingua 107 for training + hyp_utils/conda_env.sh \ + local/prepare_voxlingua107.py \ + --corpus-dir $voxlingua_root \ + --output-dir data/voxlingua107 \ + --remove-langs en-en es-es ar-ar pt-pt \ + --map-langs-to-lre-codes \ + --target-fs 8000 + +fi + +if [ $stage -le 2 ];then + # Prepare LRE17 Training data + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_train_root \ + --output-dir data/lre17_train \ + --subset train \ + --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_train_root \ + --output-dir data/lre17_dev_cts \ + --subset dev \ + --source mls14 \ + --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_train_root \ + --output-dir data/lre17_dev_afv \ + --subset dev \ + --source vast \ + --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_eval_root \ + --output-dir data/lre17_eval_cts \ + --subset eval \ + --source mls14 \ + --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_eval_root \ + --output-dir data/lre17_eval_afv \ + --subset eval \ + --source vast \ + --target-fs 8000 + +fi + +if [ $stage -le 3 ];then + hyp_utils/conda_env.sh \ + local/prepare_lre22_dev.py \ + --corpus-dir $lre22_dev_root \ + --output-dir data/lre22_dev \ + --target-fs 8000 + +fi + +if [ $stage -le 4 ];then + hyp_utils/conda_env.sh \ + local/prepare_lre22_eval.py \ + --corpus-dir $lre22_eval_root \ + --output-dir data/lre22_eval \ + --target-fs 8000 + +fi + +if [ $stage -le 5 ];then + local/make_sre16_train_dev.sh $sre16_dev_root 8 data + local/make_sre16_train_eval.sh $sre16_eval_root 8 data +fi + +if [ $stage -le 6 ];then + local/make_sre18_dev_unlabeled.sh $sre18_dev_root 8 data + local/make_sre18_train_dev.sh $sre18_dev_root 8 data + local/make_sre18_train_eval.sh $sre18_eval_root 8 data +fi + +if [ $stage -le 7 ];then + # Prepare sre19 + local/make_sre19cmn2_eval.sh $sre19cmn2_eval_root 8 data +fi + +if [ $stage -le 8 ];then + # Prepare SRE21 dev + hyp_utils/conda_env.sh \ + local/prepare_sre21av_dev_audio.py \ + --corpus-dir $sre21_dev_root \ + --target-fs 8000 \ + --output-path data/sre21_audio_dev \ + --av-output-path data/sre21_audio-visual_dev + # Prepare SRE21 eval + hyp_utils/conda_env.sh \ + local/prepare_sre21av_eval_audio.py \ + --corpus-dir $sre21_eval_root \ + --target-fs 8000 \ + --output-path data/sre21_audio_eval \ + --av-output-path data/sre21_audio-visual_eval + +fi + +if [ $stage -le 9 ];then + # Prepare SRE CTS superset + hyp_utils/conda_env.sh \ + local/prepare_sre_cts_superset.py \ + --corpus-dir $sre_superset_root \ + --target-fs 8000 \ + --output-dir data/sre_cts_superset +fi + +if [ $stage -le 10 ];then + # Prepare babel datasets + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_assamese_root \ + --target-fs 8000 \ + --lang-code as-as \ + --output-dir data/babel_assamese + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_bengali_root \ + --target-fs 8000 \ + --lang-code bn-bn \ + --output-dir data/babel_bengali + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_pashto_root \ + --target-fs 8000 \ + --lang-code ps-ps \ + --output-dir data/babel_pashto + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_turkish_root \ 
+ --target-fs 8000 \ + --lang-code tr-tr \ + --output-dir data/babel_turkish + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_georgian_root \ + --target-fs 8000 \ + --lang-code ka-ka \ + --output-dir data/babel_georgian + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_vietnam_root \ + --target-fs 8000 \ + --lang-code vi-vi \ + --output-dir data/babel_vietnam + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_haitian_root \ + --target-fs 8000 \ + --lang-code ht-ht \ + --output-dir data/babel_haitian + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_lao_root \ + --target-fs 8000 \ + --lang-code lo-lo \ + --output-dir data/babel_lao + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_tamil_root \ + --target-fs 8000 \ + --lang-code ta-ta \ + --output-dir data/babel_tamil + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_zulu_root \ + --target-fs 8000 \ + --lang-code zul-zul \ + --output-dir data/babel_zulu + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_kurmanji_root \ + --target-fs 8000 \ + --lang-code kur-kur \ + --output-dir data/babel_kurmanji + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_tok_root \ + --target-fs 8000 \ + --lang-code tok-tok \ + --output-dir data/babel_tok + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_kazakh_root \ + --target-fs 8000 \ + --lang-code kk-kk \ + --output-dir data/babel_kazakh + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_telugu_root \ + --target-fs 8000 \ + --lang-code te-te \ + --output-dir data/babel_telugu + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_lithuanian_root \ + --target-fs 8000 \ + --lang-code lt-lt \ + --output-dir data/babel_lithuanian + +fi + +if [ $stage -le 11 ];then + hyp_utils/conda_env.sh \ + local/prepare_some_data_for_lre.py \ + --corpus-dir $fleurs_root \ + --output-dir data/fleurs22 \ + --map-langs-to-lre-codes --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_some_data_for_lre_cat.py \ + --corpus-dir $lwazi_root \ + --output-dir data/lwazi09 \ + --map-langs-to-lre-codes --target-fs 8000 + hyp_utils/conda_env.sh \ + local/prepare_some_data_for_lre_cat.py \ + --corpus-dir $nchlt_root \ + --output-dir data/nchlt14 \ + --map-langs-to-lre-codes --target-fs 8000 + hyp_utils/conda_env.sh \ + local/prepare_some_data_for_lre_cat.py \ + --corpus-dir $ammi_root \ + --output-dir data/ammi20 \ + --map-langs-to-lre-codes --target-fs 8000 +fi + +if [ $stage -le 12 ];then + + hyp_utils/conda_env.sh \ + local/prepare_common_voice_cat.py \ + --corpus-dir $cv22_root \ + --output-dir data/cv22_tir \ + --keep-langs tir-tir \ + --map-langs-to-lre-codes --target-fs 8000 +fi + + +if [ $stage -le 13 ];then + hyp_utils/conda_env.sh \ + local/prepare_common_voice_accents_cat.py \ + --corpus-dir $cv20_root \ + --output-dir data/cv20_eng_ine \ + --lang en \ + --target-fs 8000 + hyp_utils/conda_env.sh \ + local/prepare_common_voice_accents_cat.py \ + --corpus-dir $cv20_root \ + --output-dir data/cv20_fra \ + --lang fr \ + --target-fs 8000 + +fi + +if [ $stage -le 14 ];then + hyp_utils/conda_env.sh \ + local/prepare_adi17.py \ + --corpus-dir $adi_root \ + --output-dir data/adi17 \ + --map-langs-to-lre-codes --target-fs 8000 +fi + +if [ $stage -le 15 ];then + hyp_utils/conda_env.sh \ + local/prepare_ast_cat.py \ + --corpus-dir $ast_root \ + --output-dir 
data/ast \
+    --map-langs-to-lre-codes --target-fs 8000
+fi
+
+if [ $stage -le 16 ];then
+  #combine data
+  utils/combine_data.sh \
+    data/babel \
+    data/babel_{a*,b*,g*,k*,l*,p*,t*,v*,zulu}
+
+  utils/combine_data.sh \
+    data/cv \
+    data/cv20_eng_ine data/cv20_fra data/cv22_tir
+
+  utils/combine_data.sh \
+    data/sre16 \
+    data/sre16_train_{dev*,eval*}
+
+  utils/combine_data.sh \
+    data/sre18 \
+    data/sre18_train_{dev*,eval*} data/sre18_dev_unlabeled
+
+  utils/combine_data.sh \
+    data/sre19 \
+    data/sre19_eval_{enroll,test}_cmn2
+
+  utils/combine_data.sh \
+    data/sre21_cts \
+    data/sre21_*_cts
+
+  utils/combine_data.sh \
+    data/sre21_afv \
+    data/sre21_audio*_{dev*,eval*}_afv
+
+  utils/combine_data.sh \
+    data/sre16-21_cts \
+    data/sre1{6,8,9} data/sre21_cts
+
+fi
+
+if [ $stage -le 17 ];then
+  if [ -d ../fixed.v1.8k/lre-scorer ];then
+    ln -s ../fixed.v1.8k/lre-scorer
+  else
+    local/download_lre22_scorer.sh
+  fi
+  if [ -d ../fixed.v1.8k/focal_multiclass ];then
+    ln -s ../fixed.v1.8k/focal_multiclass
+  else
+    local/download_focal.sh
+  fi
+fi
 diff --git a/egs/lre22/open.v2.8k/run_002_compute_evad.sh b/egs/lre22/open.v2.8k/run_002_compute_evad.sh new file mode 100755 index 00000000..f7ccdfa7 --- /dev/null +++ b/egs/lre22/open.v2.8k/run_002_compute_evad.sh @@ -0,0 +1,64 @@
+#!/bin/bash
+# Copyright
+#                2018   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+nodes=b1
+storage_name=$(date +'%m_%d_%H_%M')
+vaddir=`pwd`/exp/vad_e
+
+stage=1
+config_file=default_config.sh
+. parse_options.sh || exit 1;
+. $config_file
+
+
+if [ $stage -le 1 ]; then
+  # Prepare to distribute data over multiple machines
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then
+    dir_name=$USER/hyp-data/lre22-fixed-v1.8k-$storage_name/vad/storage
+    if [ "$nodes" == "b0" ];then
+      utils/create_split_dir.pl \
+	/export/b{04,05,06,07}/$dir_name $vaddir/storage
+    elif [ "$nodes" == "b1" ];then
+      utils/create_split_dir.pl \
+	/export/b1{0,1,2,3,4,5,6,7,8,9}/$dir_name $vaddir/storage
+    elif [ "$nodes" == "c0" ];then
+      utils/create_split_dir.pl \
+	/export/c{06,07,08,09}/$dir_name $vaddir/storage
+    elif [ "$nodes" == "fs01" ];then
+      utils/create_split_dir.pl \
+	/export/fs01/$dir_name $vaddir/storage
+    elif [ "$nodes" == "fs05" ];then
+      utils/create_split_dir.pl \
+	/export/fs05/$dir_name $vaddir/storage
+    else
+      echo "we don't distribute data between multiple machines"
+    fi
+  fi
+fi
+
+# VAD Train/Test Datasets
+if [ $stage -le 2 ];then
+  for name in voxlingua107 \
+	        lre17_train \
+	        lre17_dev_cts lre17_dev_afv \
+	        lre17_eval_cts lre17_eval_afv \
+	        lre22_dev lre22_eval \
+	        babel sre16-21_cts sre21_afv sre_cts_superset \
+	        lwazi09 nchlt14 adi17 fleurs22 ammi20 \
+	        ast cv
+  do
+    num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}')
+    nj=$(($num_spk < 40 ? $num_spk:40))
+    hyp_utils/feats/make_evad.sh --write-utt2num-frames true \
+      --vad-config $vad_config --nj $nj --cmd "$train_cmd" \
+      data/${name} exp/make_vad/$name $vaddir
+    utils/fix_data_dir.sh data/${name}
+  done
+fi
+
 diff --git a/egs/lre22/open.v2.8k/run_003_prepare_noises_rirs.sh b/egs/lre22/open.v2.8k/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..55da7f2a --- /dev/null +++ b/egs/lre22/open.v2.8k/run_003_prepare_noises_rirs.sh @@ -0,0 +1,66 @@
+#!/bin/bash
+# Copyright
+#                2020   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+config_file=default_config.sh
+. parse_options.sh || exit 1;
+. 
$config_file +. datapath.sh + +# We prepare the noise files and RIR for online speech augmentation +if [ $stage -le 1 ]; then + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root 16 data + + for name in musan_noise musan_music + do + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ + --storage_name lre22-open-v2.8k-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_proc_audio exp/${name}_proc_audio + utils/fix_data_dir.sh data/${name}_proc_audio + done + +fi + +if [ $stage -le 2 ]; then + + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ + --storage_name lre22-open-v2.8k-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_babble exp/${name}_babble + # utils/fix_data_dir.sh data/${name}_babble + done +fi + +if [ $stage -le 3 ]; then + if [ ! -d "RIRS_NOISES" ]; then + if [ -d ../fixed.v1.8k/RIRS_NOISES ];then + ln -s ../fixed.v1.8k/RIRS_NOISES + else + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + fi + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 16 data/rirs_mediumroom + local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 16 data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + #pack all rirs in h5 files + steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs + done + +fi + + diff --git a/egs/lre22/open.v2.8k/run_004_apply_codecs.sh b/egs/lre22/open.v2.8k/run_004_apply_codecs.sh new file mode 100755 index 00000000..6efc016b --- /dev/null +++ b/egs/lre22/open.v2.8k/run_004_apply_codecs.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file + +if [ $stage -le 1 ];then + + for data in voxlingua107 \ + lre17_dev_afv lre17_eval_afv \ + sre21_afv ast cv \ + lwazi09 nchlt14 adi17 fleurs22 ammi20 + do + hyp_utils/conda_env.sh \ + local/apply_tel_codecs_to_kaldi_datadir.py \ + --input-dir data/$data \ + --output-dir data/${data}_codecs + done + +fi diff --git a/egs/lre22/open.v2.8k/run_010_prepare_xvec_train_data.sh b/egs/lre22/open.v2.8k/run_010_prepare_xvec_train_data.sh new file mode 100755 index 00000000..98aa9a4c --- /dev/null +++ b/egs/lre22/open.v2.8k/run_010_prepare_xvec_train_data.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file
+
+if [ $stage -le 1 ]; then
+  # This stage preprocesses the audio for x-vector training
+  for name in voxlingua107_codecs \
+	        lre17_train \
+	        lre17_{dev,eval}_{cts,afv,afv_codecs} \
+	        babel sre16-21_cts sre_cts_superset \
+	        sre21_afv_codecs cv_codecs adi17_codecs \
+	        lwazi09{,_codecs} nchlt14{,_codecs} fleurs22{,_codecs} ammi20{,_codecs} ast{,_codecs}
+  do
+    steps_xvec/preprocess_audios_for_nnet_train.sh \
+      --nj 40 --cmd "$train_cmd" \
+      --storage_name lre22-fixed-v1.8k-$(date +'%m_%d_%H_%M') --use-bin-vad true \
+      data/${name} data/${name}_proc_audio_no_sil exp/${name}_proc_audio_no_sil
+    utils/fix_data_dir.sh data/${name}_proc_audio_no_sil
+  done
+fi
+
+if [ $stage -le 2 ];then
+  utils/combine_data.sh \
+    data/lre17_proc_audio_no_sil \
+    data/lre17_train_proc_audio_no_sil \
+    data/lre17_{dev,eval}_{cts,afv,afv_codecs}_proc_audio_no_sil
+
+  utils/combine_data.sh \
+    data/babel_sre_proc_audio_no_sil \
+    data/{babel,sre16-21_cts,sre21_afv_codecs,sre_cts_superset}_proc_audio_no_sil
+
+  utils/combine_data.sh \
+    data/others_afr_proc_audio_no_sil \
+    data/adi17_proc_audio_no_sil \
+    data/{lwazi09,nchlt14,fleurs22,ammi20,ast}{,_codecs}_proc_audio_no_sil
+fi
+
+if [ $stage -le 3 ]; then
+  # Now, we remove files with less than 3s
+  hyp_utils/remove_short_audios.sh --min-len 3 data/voxlingua107_codecs_proc_audio_no_sil
+  hyp_utils/remove_short_audios.sh --min-len 3 data/lre17_proc_audio_no_sil
+  hyp_utils/remove_short_audios.sh --min-len 3 data/babel_sre_proc_audio_no_sil
+  hyp_utils/remove_short_audios.sh --min-len 3 data/others_afr_proc_audio_no_sil
+  hyp_utils/remove_short_audios.sh --min-len 3 data/cv_codecs_proc_audio_no_sil
+fi
+
+if [ $stage -le 4 ];then
+  # merge all data
+  utils/combine_data.sh \
+    data/open_proc_audio_no_sil \
+    data/{voxlingua107_codecs,lre17,babel_sre,cv_codecs,others_afr}_proc_audio_no_sil
+fi
+
+
+if [ $stage -le 5 ]; then
+  for name in open_proc_audio_no_sil
+  do
+    hyp_utils/conda_env.sh \
+      local/split_segments_train_val.py \
+      --segments-file data/$name/utt2lang \
+      --recordings-file data/$name/wav.scp \
+      --durations-file data/$name/utt2dur \
+      --val-percent 2. \
+      --remove-langs fra-mix ara-ary en-en es-es pt-pt ar-ar \
+      --output-dir data/$name/train_val_split
+  done
+fi
 diff --git a/egs/lre22/open.v2.8k/run_011_train_xvector.sh b/egs/lre22/open.v2.8k/run_011_train_xvector.sh new file mode 100755 index 00000000..611a33ca --- /dev/null +++ b/egs/lre22/open.v2.8k/run_011_train_xvector.sh @@ -0,0 +1,125 @@
+#!/bin/bash
+# Copyright
+#                2019   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+ngpu=4
+config_file=default_config.sh
+interactive=false
+num_workers=""
+use_tb=false
+use_wandb=false
+
+. parse_options.sh || exit 1;
+. $config_file
+. 
datapath.sh + +list_dir=data/${nnet_data}_proc_audio_no_sil + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project lre22-open-v2.8k --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \ + --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \ + --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir $args \ + --num-gpus $ngpu \ + +fi +exit +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \ + --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \ + --in-model-file $nnet_s2 \ + --trainer.exp-path $nnet_s3_dir $args \ + --num-gpus $ngpu \ + +fi + +if [ $stage -le 4 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s4_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s4_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s4_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s4_base_cfg $nnet_s4_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \ + 
--data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \ + --in-model-file $nnet_s3 \ + --trainer.exp-path $nnet_s4_dir $args \ + --num-gpus $ngpu \ + +fi + diff --git a/egs/lre22/open.v2.8k/run_030_extract_xvectors.sh b/egs/lre22/open.v2.8k/run_030_extract_xvectors.sh new file mode 100755 index 00000000..d7e2775b --- /dev/null +++ b/egs/lre22/open.v2.8k/run_030_extract_xvectors.sh @@ -0,0 +1,217 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=2 +nnet_stage=1 +config_file=default_config.sh +use_gpu=false +do_tsne=false +split_dev=false +hf_chunk_length=120 #seconds +xvec_chunk_length=120 #seconds +. parse_options.sh || exit 1; +. $config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu true --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" + xvec_cmd="$cuda_eval_cmd --mem 6G" +else + xvec_cmd="$train_cmd --mem 12G" +fi +if [ $nnet_stages -lt $nnet_stage ];then + nnet_stage=$nnet_stages +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +xvector_dir=exp/xvectors/$nnet_name + +# if [ $stage -le 1 ]; then +# # Extract xvectors for training +# for name in lre17_proc_audio_no_sil \ +# voxlingua107_codecs_proc_audio_no_sil \ +# babel_sre_proc_audio_no_sil \ +# cv_codecs_proc_audio_no_sil \ +# others_afr_proc_audio_no_sil +# do +# steps_xvec/extract_wav2vec2xvectors.sh \ +# --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ +# --use-bin-vad false \ +# --random-utt-length true --min-utt-length 3 --max-utt-length 30 \ +# $nnet data/${name} \ +# $xvector_dir/${name} +# done +# fi + +if [ $stage -le 2 ]; then + # Extract xvectors for training + for name in lre22_dev + do + steps_xvec/extract_wav2vec2xvectors.sh \ + --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ + --use-bin-vad true --num-augs 10 --aug-config conf/reverb_noise_aug.yaml \ + --random-utt-length true --min-utt-length 3 --max-utt-length 30 \ + $nnet data/${name} \ + $xvector_dir/${name}_aug \ + data/${name}_aug + done +fi + + +if [ $stage -le 3 ]; then + # Extracts x-vectors for dev and eval + for name in lre22_dev lre22_eval + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 100 ? $num_spk:100)) + steps_xvec/extract_wav2vec2xvectors.sh \ + --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ + $nnet data/$name \ + $xvector_dir/$name + done +fi + + +if [ $stage -le 4 ]; then + for name in lre22_dev + do + if [ "$do_tsne" == "true" ] || [ "$split_dev" == "true" ];then + $train_cmd \ + $xvector_dir/$name/tsne/tsne.log \ + hyp_utils/conda_env.sh \ + plot_embedding_tsne.py \ + --train-list data/$name/utt2lang \ + --train-v-file scp:$xvector_dir/$name/xvector.scp \ + --output-dir $xvector_dir/$name/tsne \ + --pca-var-r 0.975 \ + --lnorm \ + --prob-plot 1. 
\ + --tsne.metric cosine \ + --tsne.early-exaggeration 12 --tsne.perplexity 30 + + $train_cmd \ + $xvector_dir/$name/tsne_per_class/tsne.log \ + hyp_utils/conda_env.sh \ + plot_embedding_tsne_per_class.py \ + --train-list data/$name/utt2lang \ + --train-v-file scp:$xvector_dir/$name/xvector.scp \ + --output-dir $xvector_dir/$name/tsne_per_class \ + --pca-var-r 0.975 \ + --lnorm \ + --prob-plot 1. \ + --tsne.metric cosine \ + --tsne.early-exaggeration 12 --tsne.perplexity 30 \ + --do-ahc --cluster-tsne --ahc-thr -5 + + if [ "$split_dev" == "true" ];then + hyp_utils/conda_env.sh \ + local/split_dev.py \ + --segs-file $xvector_dir/$name/tsne_per_class/segments.csv \ + --output-dir ./resources/dev_splits \ + --num-folds 2 + + # delete the split data dirs so they are regenerated later + rm -rf data/lre22_dev_p{1,2} + + fi + fi + done +fi + +if [ $stage -le 5 ]; then + if [ ! -d data/lre22_dev_p1 ];then + awk -F "," '$1!="id" { print $1}' \ + ./resources/dev_splits/fold_0/train_segments.csv \ + > p1.lst + awk -F "," '$1!="id" { print $1}' \ + ./resources/dev_splits/fold_0/test_segments.csv \ + > p2.lst + + for p in p1 p2 + do + utils/subset_data_dir.sh \ + --utt-list $p.lst \ + data/lre22_dev data/lre22_dev_$p + done + fi +fi + +if [ $stage -le 6 ]; then + if [ -d data/lre22_dev_aug ] && [ ! -d data/lre22_dev_aug_p1 ];then + awk -v fsegs=./resources/dev_splits/fold_0/train_segments.csv ' +BEGIN{FS=","; +getline; +while(getline < fsegs) +{ + segs[$1] +} +FS=" "; +} +{ if($2 in segs){ print $1}}' data/lre22_dev_aug/augm2clean \ + > p1.lst + + awk -v fsegs=./resources/dev_splits/fold_0/test_segments.csv ' +BEGIN{FS=","; +getline; +while(getline < fsegs) +{ + segs[$1]=1; +} +FS=" "; +} +{ if($2 in segs){ print $1}}' data/lre22_dev_aug/augm2clean \ + > p2.lst + + for p in p1 p2 + do + utils/subset_data_dir.sh \ + --utt-list $p.lst \ + data/lre22_dev_aug data/lre22_dev_aug_$p + done + fi +fi + +if [ $stage -le 7 ];then + if [ -f $xvector_dir/lre22_dev_aug/xvector.scp ];then + mkdir -p $xvector_dir/lre22_dev_aug_clean + cat $xvector_dir/lre22_dev/xvector.scp \ + $xvector_dir/lre22_dev_aug/xvector.scp \ + > $xvector_dir/lre22_dev_aug_clean/xvector.scp + + for p in "" _p1 _p2 + do + if [ ! -d data/lre22_dev_aug_clean$p ]; then + utils/combine_data.sh \ + data/lre22_dev_aug_clean$p \ + data/lre22_dev$p \ + data/lre22_dev_aug$p + fi + done + fi +fi + +exit diff --git a/egs/lre22/open.v2.8k/run_040_be_final.sh b/egs/lre22/open.v2.8k/run_040_be_final.sh new file mode 100755 index 00000000..fe5b6f18 --- /dev/null +++ b/egs/lre22/open.v2.8k/run_040_be_final.sh @@ -0,0 +1,434 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh +. parse_options.sh || exit 1; +. 
$config_file + +if [ $nnet_stages -lt $nnet_stage ];then + nnet_stage=$nnet_stages +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +fi + +xvector_dir=exp/xvectors/$nnet_name +be_base_dir=exp/be/$nnet_name +score_base_dir=exp/scores/$nnet_name + +if [ $stage -le 1 ];then + for r in 1 #0.9999 0.999 #0.99 0.975 0.95 + do + be_name=pca${r}_cw_lnorm_lgbe_lre22_aug + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd \ + $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --pca.pca-var-r $r \ + --do-lnorm --whiten \ + --output-dir $be_dir_p + + $train_cmd \ + ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + + + ) & + + done + + ( + $train_cmd \ + $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --pca.pca-var-r $r \ + --do-lnorm --whiten \ + --output-dir $be_dir + + $train_cmd \ + ${score_dir}_p12/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd \ + ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + local/score_lre22.sh dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + ) & + + + done + wait + +fi + +exit +# Back-ends 
below over-fitted + +if [ $stage -le 2 ];then + for r in 1 + do + for penalty in l2 #l1 + do + for c in 1 #0.1 1 + do + for ary_thr in 0.975 #0.85 0.7 #0.99 0.95 0.9 #15 ##1 5 10 20 + do + be_name=pca${r}_cw_lnorm_lsvm_${penalty}_c${c}_sqhinge_lre22_aug_lre17_aryt${ary_thr} + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd \ + $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v3.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.penalty $penalty --svm.c $c --svm.dual false \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir_p + + $train_cmd \ + ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + ) & + done + ( + $train_cmd \ + $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v3.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.penalty $penalty --svm.c $c --svm.dual false \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir + + $train_cmd \ + ${score_dir}/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd \ + ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + local/score_lre22.sh \ + dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh \ + dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + ) & + done + done + done + done + wait + +fi + +if [ 
$stage -le 3 ];then + for r in 1 # 0.9999 0.99 0.975 0.95 0.9 0.8 + do + for shrinking in true #false + do + for c in 1 10 #0.1 1 10 #0.01 0.1 1 10 # 0.0001 + do + for vl in false #true #false + do + if [ "$vl" == "true" ];then + do_vl="--do-vl" + else + do_vl="--no_do-vl" + fi + ary_thr=0.975 + be_name=pca${r}_cw_lnorm_gsvm_shrinking_${shrinking}_c${c}_lre17_aryt${ary_thr}_vl${vl}_aug_clean + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + #score_dir=$score_base_dir/${be_name}_logpost + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --voxlingua-v-file scp:$xvector_dir/voxlingua107_codecs_proc_audio_no_sil/xvector.scp \ + --voxlingua-list data/voxlingua107_codecs_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.shrinking $shrinking --svm.c $c --svm.break_ties false --svm.max-iter 500\ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir_p \ + --do-lre17 $do_vl + + $train_cmd ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --svm.eval-type cat-log-post \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + ) & + done + ( + $train_cmd $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --voxlingua-v-file scp:$xvector_dir/voxlingua107_codecs_proc_audio_no_sil/xvector.scp \ + --voxlingua-list data/voxlingua107_codecs_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.shrinking $shrinking --svm.c $c --svm.break_ties false --svm.max-iter 500 \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir \ + --do-lre17 $do_vl + + $train_cmd ${score_dir}/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --svm.eval-type cat-log-post \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --svm.eval-type cat-log-post \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + 
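+      # Note: the p1/p2 runs above score each dev fold with a back-end trained
+      # on the opposite fold, so the calibration just trained on the merged
+      # p12 scores only ever saw held-out data. Below, the back-end trained on
+      # all of dev is scored without calibration, and that cross-fold cal.mat
+      # is then applied to its dev and eval scores.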
local/score_lre22.sh \ + dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh \ + dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + + ) & + done + done + done + done + wait + +fi diff --git a/egs/lre22/open.v2.8k/run_050_fusion_v1.sh b/egs/lre22/open.v2.8k/run_050_fusion_v1.sh new file mode 100755 index 00000000..056c2f0b --- /dev/null +++ b/egs/lre22/open.v2.8k/run_050_fusion_v1.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +score_dir_fixed=../open.v1.8k/exp/scores +score_dir_0=exp/scores +nnet_1=fbank64_stmn_ecapatdnn2048x4_v1.0.s1 +nnet_2=fbank64_stmn_fwseres2net50s8_v1.0.s1 +nnet_3=wav2vec2xlsr300m_ecapatdnn1024x3_v1.0.s2 +be_1=pca1_cw_lnorm_lgbe_lre22_aug +score_dirs="$score_dir_fixed/$nnet_1/$be_1 +$score_dir_fixed/$nnet_2/$be_1 +$score_dir_0/$nnet_3/$be_1" + +train_score_dirs=$(echo $score_dirs | awk '{ for(i=1;i<=NF;i++){ $i=$i"_p12/cal_v1" }; print $0}') +test_score_dirs=$(echo $score_dirs | awk '{ for(i=1;i<=NF;i++){ $i=$i"/cal_v1" }; print $0}') + +output_dir=exp/fusion/fus_v1.0 + +local/train_fusion_lre22.sh "$train_score_dirs" $output_dir/train +local/score_lre22.sh \ + dev \ + ${output_dir}/train/lre22_dev_scores.tsv \ + ${output_dir}/train/lre22_dev_results + +local/eval_fusion_lre22.sh "$test_score_dirs" $output_dir/train/fus.mat $output_dir/test + +local/score_lre22.sh \ + dev \ + ${output_dir}/test/lre22_dev_scores.tsv \ + ${output_dir}/test/lre22_dev_results + +local/score_lre22.sh eval \ + ${output_dir}/test/lre22_eval_scores.tsv \ + ${output_dir}/test/lre22_eval_results + + + + + + diff --git a/egs/lre22/open.v2.8k/steps b/egs/lre22/open.v2.8k/steps new file mode 120000 index 00000000..aede39fe --- /dev/null +++ b/egs/lre22/open.v2.8k/steps @@ -0,0 +1 @@ +hyp_utils/kaldi/steps \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/steps_be b/egs/lre22/open.v2.8k/steps_be new file mode 120000 index 00000000..48aedc5a --- /dev/null +++ b/egs/lre22/open.v2.8k/steps_be @@ -0,0 +1 @@ +../fixed.v1.8k/steps_be \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/steps_xvec b/egs/lre22/open.v2.8k/steps_xvec new file mode 120000 index 00000000..af66a94d --- /dev/null +++ b/egs/lre22/open.v2.8k/steps_xvec @@ -0,0 +1 @@ +hyp_utils/xvectors \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/utils b/egs/lre22/open.v2.8k/utils new file mode 120000 index 00000000..3d590a1d --- /dev/null +++ b/egs/lre22/open.v2.8k/utils @@ -0,0 +1 @@ +hyp_utils/kaldi/utils \ No newline at end of file diff --git a/egs/sre18/v1.8k/local/score_dcf.py b/egs/sre18/v1.8k/local/score_dcf.py index 1137e049..cba16610 100755 --- a/egs/sre18/v1.8k/local/score_dcf.py +++ b/egs/sre18/v1.8k/local/score_dcf.py @@ -15,7 +15,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer 
as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/sre18/v1.8k/steps_be/eval-calibration-v1.py b/egs/sre18/v1.8k/steps_be/eval-calibration-v1.py index fa16dfce..31b527f7 100755 --- a/egs/sre18/v1.8k/steps_be/eval-calibration-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-calibration-v1.py @@ -19,8 +19,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): diff --git a/egs/sre18/v1.8k/steps_be/eval-tel-be-snorm-v1.py b/egs/sre18/v1.8k/steps_be/eval-tel-be-snorm-v1.py index d3b35fba..954a8a4a 100755 --- a/egs/sre18/v1.8k/steps_be/eval-tel-be-snorm-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-tel-be-snorm-v1.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre18/v1.8k/steps_be/eval-tel-be-v1.py b/egs/sre18/v1.8k/steps_be/eval-tel-be-v1.py index d9668e1a..06b2bc87 100755 --- a/egs/sre18/v1.8k/steps_be/eval-tel-be-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-tel-be-v1.py @@ -19,7 +19,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-snorm-v1.py b/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-snorm-v1.py index c37d450a..af8895b2 100755 --- a/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-snorm-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-snorm-v1.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-v1.py b/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-v1.py index c19dc074..433cbbff 100755 --- a/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-v1.py @@ -21,7 +21,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def combine_diar_scores(ndx, diar_ndx, diar2orig, diar_scores): diff --git a/egs/sre18/v1.8k/steps_be/eval-vid-be-snorm-v1.py b/egs/sre18/v1.8k/steps_be/eval-vid-be-snorm-v1.py index fc94c754..1f1ffc81 100755 --- a/egs/sre18/v1.8k/steps_be/eval-vid-be-snorm-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-vid-be-snorm-v1.py @@ -20,8 +20,8 @@ 
from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre18/v1.8k/steps_be/eval-vid-be-v1.py b/egs/sre18/v1.8k/steps_be/eval-vid-be-v1.py index f7d83d30..19ca8bdf 100755 --- a/egs/sre18/v1.8k/steps_be/eval-vid-be-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-vid-be-v1.py @@ -19,7 +19,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/sre18/v1.8k/steps_be/train-calibration-v1.py b/egs/sre18/v1.8k/steps_be/train-calibration-v1.py index fa1dfcf7..9eaa7187 100755 --- a/egs/sre18/v1.8k/steps_be/train-calibration-v1.py +++ b/egs/sre18/v1.8k/steps_be/train-calibration-v1.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, verbose): diff --git a/egs/sre18/v1.8k/steps_be/train-tel-be-v1.py b/egs/sre18/v1.8k/steps_be/train-tel-be-v1.py index c9f22d83..46710992 100755 --- a/egs/sre18/v1.8k/steps_be/train-tel-be-v1.py +++ b/egs/sre18/v1.8k/steps_be/train-tel-be-v1.py @@ -12,9 +12,9 @@ from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F -from hyperion.clustering import AHC +from hyperion.np.clustering import AHC from hyperion.utils.utt2info import Utt2Info diff --git a/egs/sre18/v1.8k/steps_be/train-vid-be-v1.py b/egs/sre18/v1.8k/steps_be/train-vid-be-v1.py index a1b0cad6..4724a24a 100755 --- a/egs/sre18/v1.8k/steps_be/train-vid-be-v1.py +++ b/egs/sre18/v1.8k/steps_be/train-vid-be-v1.py @@ -13,7 +13,7 @@ from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.scp_list import SCPList diff --git a/egs/sre19-av-v/v0.1/local/score_dcf.py b/egs/sre19-av-v/v0.1/local/score_dcf.py index 514ebf51..772d107a 100755 --- a/egs/sre19-av-v/v0.1/local/score_dcf.py +++ b/egs/sre19-av-v/v0.1/local/score_dcf.py @@ -19,7 +19,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-calibration-v1.py 
b/egs/sre19-av-v/v0.1/steps_be/eval-calibration-v1.py index 8087cac2..576ea3d5 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-calibration-v1.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-calibration-v1.py @@ -22,8 +22,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v1.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v1.py index 1527f514..9b490e72 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v1.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v1.py @@ -14,7 +14,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.score_norm import AdaptSNorm as SNorm from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v2.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v2.py index 5ac23484..40187aa4 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v2.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v2.py @@ -13,7 +13,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.score_norm import AdaptSNorm as SNorm from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v3.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v3.py index 2a7abe08..3d52788e 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v3.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v3.py @@ -13,7 +13,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.score_norm import AdaptSNorm as SNorm from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v4.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v4.py index 9c22cc1f..f18a53f7 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v4.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v4.py @@ -13,7 +13,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.score_norm import AdaptSNorm as SNorm from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v5.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v5.py index 660854e3..af75f526 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v5.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v5.py @@ -15,10 +15,10 @@ from hyperion.hyp_defs import 
float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.score_norm import AdaptSNorm as SNorm # from hyperion.helpers import PLDAFactory as F -# from hyperion.transforms import TransformList +# from hyperion.np.transforms import TransformList from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v6.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v6.py index 86ca6a8f..e23e52a1 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v6.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v6.py @@ -15,7 +15,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.score_norm import AdaptSNorm as SNorm from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v7.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v7.py index 19f78a23..85bd8ee4 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v7.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v7.py @@ -13,7 +13,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.score_norm import AdaptSNorm as SNorm from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v9.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v9.py index a6774a68..d36b91ec 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v9.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v9.py @@ -13,7 +13,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.score_norm import AdaptSNorm as SNorm from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-v8.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-v8.py index aa9539d4..a66794da 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-v8.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-v8.py @@ -18,7 +18,7 @@ from hyperion.utils.trial_scores import TrialScores # from hyperion.helpers import PLDAFactory as F -# from hyperion.transforms import TransformList +# from hyperion.np.transforms import TransformList from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-fusion-v1.py b/egs/sre19-av-v/v0.1/steps_be/eval-fusion-v1.py index 70b0c81b..fe24f947 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-fusion-v1.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-fusion-v1.py @@ -17,7 +17,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF def eval_fusion(in_score_files, ndx_file, model_file, out_score_file, fus_idx): diff --git 
a/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py b/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py index 025d11a3..b6252df7 100644 --- a/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py +++ b/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py @@ -2,18 +2,14 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - import logging import numpy as np from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import softmax +from hyperion.utils.math_funcs import softmax from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.transforms import LNorm -from hyperion.clustering import AHC +from hyperion.np.transforms import LNorm +from hyperion.np.clustering import AHC def lnorm(x): @@ -23,9 +19,6 @@ def lnorm(x): def cosine_scr(x1, x2): - # t = LNorm() - # x1 = t.predict(x1) - # x2 = t.predict(x2) x1 = lnorm(x1) x2 = lnorm(x2) return np.dot(x1, x2.T) diff --git a/egs/sre19-av-v/v0.1/steps_be/face_video_trial_data_reader.py b/egs/sre19-av-v/v0.1/steps_be/face_video_trial_data_reader.py index 091a4ee1..11223607 100644 --- a/egs/sre19-av-v/v0.1/steps_be/face_video_trial_data_reader.py +++ b/egs/sre19-av-v/v0.1/steps_be/face_video_trial_data_reader.py @@ -18,7 +18,7 @@ from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.utils.utt2info import Utt2Info from hyperion.utils import TrialNdx, TrialKey -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList class FaceVideoTrialDataReaderV1(object): diff --git a/egs/sre19-av-v/v0.1/steps_be/train-calibration-v1.py b/egs/sre19-av-v/v0.1/steps_be/train-calibration-v1.py index 35c1a3bc..0d97a4fb 100755 --- a/egs/sre19-av-v/v0.1/steps_be/train-calibration-v1.py +++ b/egs/sre19-av-v/v0.1/steps_be/train-calibration-v1.py @@ -21,8 +21,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): diff --git a/egs/sre19-av-v/v0.1/steps_be/train-calibration-v2.py b/egs/sre19-av-v/v0.1/steps_be/train-calibration-v2.py index b247f264..f1f89bdd 100755 --- a/egs/sre19-av-v/v0.1/steps_be/train-calibration-v2.py +++ b/egs/sre19-av-v/v0.1/steps_be/train-calibration-v2.py @@ -21,8 +21,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): diff --git a/egs/sre19-av-v/v0.1/steps_be/train-fusion-v1.py b/egs/sre19-av-v/v0.1/steps_be/train-fusion-v1.py index e28bfffa..564fccaa 100755 --- a/egs/sre19-av-v/v0.1/steps_be/train-fusion-v1.py +++ b/egs/sre19-av-v/v0.1/steps_be/train-fusion-v1.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import 
float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF def train_fusion( diff --git a/egs/sre19-av-v/v0.1/steps_be/train-fusion-v2.py b/egs/sre19-av-v/v0.1/steps_be/train-fusion-v2.py index 0679eb7c..1f97d189 100755 --- a/egs/sre19-av-v/v0.1/steps_be/train-fusion-v2.py +++ b/egs/sre19-av-v/v0.1/steps_be/train-fusion-v2.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF def train_fusion( diff --git a/egs/sre19-cmn2/v1/conf/ecapatdnn_small.yaml b/egs/sre19-cmn2/v1/conf/ecapatdnn_small.yaml new file mode 100644 index 00000000..fd386500 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/ecapatdnn_small.yaml @@ -0,0 +1,34 @@ +resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 +pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/sre19-cmn2/v1/conf/efficientnet_b4.yaml b/egs/sre19-cmn2/v1/conf/efficientnet_b4.yaml new file mode 100644 index 00000000..f87c1e02 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/efficientnet_b4.yaml @@ -0,0 +1,20 @@ +effnet_type: efficientnet-b4 +in_feats: 80 +in_channels: 1 +in_kernel_size: 3 +in_stride: 1 +se_r: 4 +fix_stem_head: true +mbconv_strides: +- 1 +- 1 +- 2 +- 2 +- 1 +- 2 +- 1 +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/sre19-cmn2/v1/conf/efficientnet_b7.yaml b/egs/sre19-cmn2/v1/conf/efficientnet_b7.yaml new file mode 100644 index 00000000..bae5c7cb --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/efficientnet_b7.yaml @@ -0,0 +1,22 @@ +effnet_type: efficientnet-b7 +in_feats: 80 +in_channels: 1 +in_kernel_size: 3 +in_stride: 1 +se_r: 4 +fix_stem_head: true +mbconv_strides: +- 1 +- 1 +- 2 +- 2 +- 1 +- 2 +- 1 +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 +norm_layer: instance-norm-affine +head_norm_layer: layer-norm diff --git a/egs/sre19-cmn2/v1/conf/fbank40_nomn_16k.pyconf b/egs/sre19-cmn2/v1/conf/fbank40_nomn_16k.pyconf deleted file mode 100644 index d04eb2ec..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank40_nomn_16k.pyconf +++ /dev/null @@ -1,17 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -16000 ---feats-frame-length -25 ---feats-low-freq -20 ---feats-high-freq -7600 ---feats-num-filters -40 ---feats-snip-edges -false ---feats-use-energy -false ---mvn-no-norm-mean diff --git a/egs/sre19-cmn2/v1/conf/fbank40_nope_hammw_stmn_16k.pyconf b/egs/sre19-cmn2/v1/conf/fbank40_nope_hammw_stmn_16k.pyconf deleted file mode 100644 index 
da766d41..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank40_nope_hammw_stmn_16k.pyconf +++ /dev/null @@ -1,22 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -16000 ---feats-frame-length -25 ---feats-preemphasis-coeff -0 ---feats-window-type -hamming ---feats-low-freq -20 ---feats-high-freq -7600 ---feats-num-filters -40 ---feats-snip-edges -false ---feats-use-energy -false ---mvn-context -150 diff --git a/egs/sre19-cmn2/v1/conf/fbank40_stmn_16k.pyconf b/egs/sre19-cmn2/v1/conf/fbank40_stmn_16k.pyconf deleted file mode 100644 index 919efdec..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank40_stmn_16k.pyconf +++ /dev/null @@ -1,18 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -16000 ---feats-frame-length -25 ---feats-low-freq -20 ---feats-high-freq -7600 ---feats-num-filters -40 ---feats-snip-edges -false ---feats-use-energy -false ---mvn-context -150 diff --git a/egs/sre19-cmn2/v1/conf/fbank40_stmvn_16k.pyconf b/egs/sre19-cmn2/v1/conf/fbank40_stmvn_16k.pyconf deleted file mode 100644 index b81e9283..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank40_stmvn_16k.pyconf +++ /dev/null @@ -1,19 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -16000 ---feats-frame-length -25 ---feats-low-freq -20 ---feats-high-freq -7600 ---feats-num-filters -40 ---feats-snip-edges -false ---feats-use-energy -false ---mvn-context -150 ---mvn-norm-var diff --git a/egs/sre19-cmn2/v1/conf/fbank64_8k.pyconf b/egs/sre19-cmn2/v1/conf/fbank64_8k.pyconf deleted file mode 100644 index f5a57052..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank64_8k.pyconf +++ /dev/null @@ -1,14 +0,0 @@ ---sample-frequency -8000 ---frame-length -25 ---low-freq -20 ---high-freq -3700 ---num-filters -64 ---snip-edges -false ---use-energy -false diff --git a/egs/sre19-cmn2/v1/conf/fbank64_mvn_8k.pyconf b/egs/sre19-cmn2/v1/conf/fbank64_mvn_8k.pyconf deleted file mode 100644 index 29ce58a9..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank64_mvn_8k.pyconf +++ /dev/null @@ -1,18 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -8000 ---feats-frame-length -25 ---feats-low-freq -20 ---feats-high-freq -3700 ---feats-num-filters -64 ---feats-snip-edges -false ---feats-use-energy -false ---mvn-context -150 diff --git a/egs/sre19-cmn2/v1/conf/fbank64_stmn_8k.pyconf b/egs/sre19-cmn2/v1/conf/fbank64_stmn_8k.pyconf deleted file mode 100644 index 29ce58a9..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank64_stmn_8k.pyconf +++ /dev/null @@ -1,18 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -8000 ---feats-frame-length -25 ---feats-low-freq -20 ---feats-high-freq -3700 ---feats-num-filters -64 ---feats-snip-edges -false ---feats-use-energy -false ---mvn-context -150 diff --git a/egs/sre19-cmn2/v1/conf/fbank80_16k.pyconf b/egs/sre19-cmn2/v1/conf/fbank80_16k.pyconf deleted file mode 100644 index 3e65fe32..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank80_16k.pyconf +++ /dev/null @@ -1,14 +0,0 @@ ---sample-frequency -16000 ---frame-length -25 ---low-freq -20 ---high-freq -7600 ---num-filters -80 ---snip-edges -false ---use-energy -false diff --git a/egs/sre19-cmn2/v1/conf/fbank80_mvn_16k.pyconf b/egs/sre19-cmn2/v1/conf/fbank80_mvn_16k.pyconf deleted file mode 100644 index ffdbf165..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank80_mvn_16k.pyconf +++ /dev/null @@ -1,18 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -16000 ---feats-frame-length -25 ---feats-low-freq -20 ---feats-high-freq -7600 ---feats-num-filters -80 ---feats-snip-edges -false ---feats-use-energy -false ---mvn-context -150 diff --git 
a/egs/sre19-cmn2/v1/conf/fbank80_stmn_16k.pyconf b/egs/sre19-cmn2/v1/conf/fbank80_stmn_16k.pyconf deleted file mode 100644 index ffdbf165..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank80_stmn_16k.pyconf +++ /dev/null @@ -1,18 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -16000 ---feats-frame-length -25 ---feats-low-freq -20 ---feats-high-freq -7600 ---feats-num-filters -80 ---feats-snip-edges -false ---feats-use-energy -false ---mvn-context -150 diff --git a/egs/sre19-cmn2/v1/conf/linfbank40_stmn_16k.pyconf b/egs/sre19-cmn2/v1/conf/linfbank40_stmn_16k.pyconf deleted file mode 100644 index f80faad2..00000000 --- a/egs/sre19-cmn2/v1/conf/linfbank40_stmn_16k.pyconf +++ /dev/null @@ -1,20 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -16000 ---feats-frame-length -25 ---feats-low-freq -64 ---feats-high-freq -8000 ---feats-num-filters -40 ---feats-snip-edges -false ---feats-use-energy -false ---feats-fb-type -linear ---mvn-context -150 diff --git a/egs/sre19-cmn2/v1/conf/lrsched_exp_default.yaml b/egs/sre19-cmn2/v1/conf/lrsched_exp_default.yaml new file mode 100644 index 00000000..fe08b704 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/lrsched_exp_default.yaml @@ -0,0 +1,7 @@ +lrsch_type: exp_lr +decay_rate: 0.5 +decay_steps: 8000 +hold_steps: 40000 +min_lr: 1.0e-05 +update_lr_on_opt_step: true +warmup_steps: 1000 diff --git a/egs/sre19-cmn2/v1/conf/optim_adam_default.yaml b/egs/sre19-cmn2/v1/conf/optim_adam_default.yaml new file mode 100644 index 00000000..b6620069 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/optim_adam_default.yaml @@ -0,0 +1,6 @@ +opt_type: adam +lr: 0.05 +amsgrad: true +beta1: 0.9 +beta2: 0.95 +weight_decay: 1.0e-05 diff --git a/egs/sre19-cmn2/v1/conf/res2net50.yaml b/egs/sre19-cmn2/v1/conf/res2net50.yaml new file mode 100644 index 00000000..48067a3d --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/res2net50.yaml @@ -0,0 +1,13 @@ +resnet_type: res2net50 +in_channels: 1 +in_feats: 80 +in_kernel_size: 3 +in_stride: 1 +no_maxpool: true +res2net_width_factor: 3.25 +res2net_scale: 8 +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/sre19-cmn2/v1/conf/resnet34.yaml b/egs/sre19-cmn2/v1/conf/resnet34.yaml new file mode 100644 index 00000000..98695823 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/resnet34.yaml @@ -0,0 +1,11 @@ +resnet_type: resnet34 +in_channels: 1 +in_feats: 80 +in_kernel_size: 3 +in_stride: 1 +no_maxpool: true +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/sre19-cmn2/v1/conf/spinenet49.yaml b/egs/sre19-cmn2/v1/conf/spinenet49.yaml new file mode 100644 index 00000000..66b8d517 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/spinenet49.yaml @@ -0,0 +1,11 @@ +spinenet_type: spinenet49 +in_channels: 1 +in_feats: 80 +in_kernel_size: 3 +in_stride: 1 +no_maxpool: true +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/sre19-cmn2/v1/conf/train_data_default.yaml b/egs/sre19-cmn2/v1/conf/train_data_default.yaml new file mode 100644 index 00000000..451ffa35 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/train_data_default.yaml @@ -0,0 +1,10 @@ +dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml +sampler: + batch_size: 32 + iters_per_epoch: 6 +data_loader: + num_workers: 8 + \ No newline at end of file diff --git a/egs/sre19-cmn2/v1/conf/train_ecapatdnn_xvec_default.yaml b/egs/sre19-cmn2/v1/conf/train_ecapatdnn_xvec_default.yaml new file mode 100644 
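(Note: the train_*_xvec_default.yaml files added next compose an experiment from the smaller YAML fragments above; the data, feats, model and trainer entries each point at another file in conf/. For example, model: ecapatdnn_small.yaml pulls in the ECAPA-TDNN encoder config added earlier in this patch, while feats: fbank80_stmn_16k.yaml presumably refers to a YAML replacement for the fbank80_stmn_16k.pyconf deleted here; that YAML file is not shown in this diff.)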
index 00000000..46298946 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/train_ecapatdnn_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: ecapatdnn_small.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/sre19-cmn2/v1/conf/train_effnetb4_xvec_default.yaml b/egs/sre19-cmn2/v1/conf/train_effnetb4_xvec_default.yaml new file mode 100644 index 00000000..1bc74de6 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/train_effnetb4_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: efficientnet_b4.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/sre19-cmn2/v1/conf/train_res2net50_xvec_default.yaml b/egs/sre19-cmn2/v1/conf/train_res2net50_xvec_default.yaml new file mode 100644 index 00000000..1d387790 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/train_res2net50_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: resnet34.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/sre19-cmn2/v1/conf/train_resnet34_xvec_default.yaml b/egs/sre19-cmn2/v1/conf/train_resnet34_xvec_default.yaml new file mode 100644 index 00000000..1d387790 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/train_resnet34_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: resnet34.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/sre19-cmn2/v1/conf/train_spinenet49_xvec_default.yaml b/egs/sre19-cmn2/v1/conf/train_spinenet49_xvec_default.yaml new file mode 100644 index 00000000..07167987 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/train_spinenet49_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: spinenet49.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/sre19-cmn2/v1/conf/trainer_default.yaml b/egs/sre19-cmn2/v1/conf/trainer_default.yaml new file mode 100644 index 00000000..86dcc2e4 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/trainer_default.yaml @@ -0,0 +1,6 @@ +optim: optim_adam_default.yaml +lrsched: lrsched_exp_default.yaml +use_amp: true +log_interval: 1000 +epochs: 70 +eff_batch_size: 512 diff --git a/egs/sre19-cmn2/v1/conf/trainer_swa_default.yaml b/egs/sre19-cmn2/v1/conf/trainer_swa_default.yaml new file mode 100644 index 00000000..0cafad01 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/trainer_swa_default.yaml @@ -0,0 +1,9 @@ +optim: optim_adam_default.yaml +lrsched: lrsched_exp_default.yaml +use_amp: true +log_interval: 1000 +epochs: 80 +eff_batch_size: 512 +swa_start: 60 +swa_lr: 1e-3 +swa_anneal_epochs: 5 diff --git a/egs/sre19-cmn2/v1/conf/vad_16k.pyconf b/egs/sre19-cmn2/v1/conf/vad_16k.pyconf deleted file mode 100644 index b52af74b..00000000 --- a/egs/sre19-cmn2/v1/conf/vad_16k.pyconf +++ /dev/null @@ -1,16 +0,0 @@ ---sample-frequency -16000 ---frame-shift -10 ---frame-length -25 ---snip-edges -false ---vad-energy-threshold -5.5 ---vad-energy-mean-scale -0.5 ---vad-proportion-threshold -0.12 ---vad-frames-context -2 diff --git a/egs/sre19-cmn2/v1/conf/val_data_default.yaml b/egs/sre19-cmn2/v1/conf/val_data_default.yaml new file mode 100644 index 00000000..451ffa35 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/val_data_default.yaml @@ -0,0 +1,10 
@@ +dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml +sampler: + batch_size: 32 + iters_per_epoch: 6 +data_loader: + num_workers: 8 + \ No newline at end of file diff --git a/egs/sre19-cmn2/v1/local/error_analysis.py b/egs/sre19-cmn2/v1/local/error_analysis.py index c4dbba5a..bbdb893d 100755 --- a/egs/sre19-cmn2/v1/local/error_analysis.py +++ b/egs/sre19-cmn2/v1/local/error_analysis.py @@ -16,7 +16,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/sre19-cmn2/v1/local/score_dcf.py b/egs/sre19-cmn2/v1/local/score_dcf.py index deb39682..fd7a3149 100755 --- a/egs/sre19-cmn2/v1/local/score_dcf.py +++ b/egs/sre19-cmn2/v1/local/score_dcf.py @@ -16,7 +16,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/sre19-cmn2/v1/steps_be/eval-calibration-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-calibration-v1.py index fa16dfce..31b527f7 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-calibration-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-calibration-v1.py @@ -19,8 +19,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): diff --git a/egs/sre19-cmn2/v1/steps_be/eval-tel-be-snorm-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-tel-be-snorm-v1.py index d3b35fba..954a8a4a 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-tel-be-snorm-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-tel-be-snorm-v1.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre19-cmn2/v1/steps_be/eval-tel-be-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-tel-be-v1.py index 820c90db..5d77a896 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-tel-be-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-tel-be-v1.py @@ -20,7 +20,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v1.py index c6f62957..0d5c3000 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v1.py +++ 
b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v1.py @@ -22,8 +22,8 @@ MultiTestTrialDataReader as TDR, ) from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v2.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v2.py index ec4addef..e0b29fd4 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v2.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v2.py @@ -17,8 +17,8 @@ from hyperion.utils import TrialScores from hyperion.helpers import MultiTestTrialDataReaderV2 as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v1.py index 20e88a37..ebc77930 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v1.py @@ -21,7 +21,7 @@ MultiTestTrialDataReader as TDR, ) from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def combine_diar_scores(ndx, orig_seg, subseg_scores): diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v2.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v2.py index b77d3595..9ef02a02 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v2.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v2.py @@ -17,7 +17,7 @@ from hyperion.utils import TrialScores from hyperion.helpers import MultiTestTrialDataReaderV2 as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def combine_diar_scores(ndx, orig_seg, subseg_scores): diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-snorm-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-snorm-v1.py index 0c5b31e0..76bf4bcd 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-snorm-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-snorm-v1.py @@ -19,8 +19,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-v1.py index f7d83d30..19ca8bdf 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-v1.py @@ -19,7 +19,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/sre19-cmn2/v1/steps_be/train-calibration-v1.py b/egs/sre19-cmn2/v1/steps_be/train-calibration-v1.py index 779e62af..c57a1162 100755 
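(Note: the remaining hunks repeat the import migration applied throughout this patch. The numpy-based modules (metrics, classifiers, transforms, score_norm, clustering, pdfs) now live under the hyperion.np package, and hyperion.utils.math is renamed hyperion.utils.math_funcs; the typical change is the one-line swap from "from hyperion.transforms import TransformList" to "from hyperion.np.transforms import TransformList".)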
--- a/egs/sre19-cmn2/v1/steps_be/train-calibration-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/train-calibration-v1.py @@ -18,8 +18,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): diff --git a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v1.py b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v1.py index c9f22d83..46710992 100755 --- a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v1.py @@ -12,9 +12,9 @@ from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F -from hyperion.clustering import AHC +from hyperion.np.clustering import AHC from hyperion.utils.utt2info import Utt2Info diff --git a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v2.py b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v2.py index d8d82405..df435852 100755 --- a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v2.py +++ b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v2.py @@ -13,9 +13,9 @@ from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F -from hyperion.clustering import AHC +from hyperion.np.clustering import AHC from hyperion.utils.utt2info import Utt2Info diff --git a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v3.py b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v3.py index 1b039c40..6532b9aa 100755 --- a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v3.py +++ b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v3.py @@ -12,9 +12,9 @@ from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm, CORAL +from hyperion.np.transforms import TransformList, LDA, LNorm, CORAL from hyperion.helpers import PLDAFactory as F -from hyperion.clustering import AHC +from hyperion.np.clustering import AHC from hyperion.utils.utt2info import Utt2Info diff --git a/egs/sre19-cmn2/v1/steps_be/train-vid-be-v1.py b/egs/sre19-cmn2/v1/steps_be/train-vid-be-v1.py index f825d59b..c1087bf4 100755 --- a/egs/sre19-cmn2/v1/steps_be/train-vid-be-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/train-vid-be-v1.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.scp_list import SCPList diff --git a/egs/sre20-cts/v1/local/score_dcf.py b/egs/sre20-cts/v1/local/score_dcf.py index 1137e049..cba16610 100755 --- a/egs/sre20-cts/v1/local/score_dcf.py +++ b/egs/sre20-cts/v1/local/score_dcf.py @@ -15,7 +15,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import 
TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/sre20-cts/v1/steps_be/apply-ahc-v1.py b/egs/sre20-cts/v1/steps_be/apply-ahc-v1.py index a5373bf4..bfa0c7c3 100755 --- a/egs/sre20-cts/v1/steps_be/apply-ahc-v1.py +++ b/egs/sre20-cts/v1/steps_be/apply-ahc-v1.py @@ -25,11 +25,11 @@ # from hyperion.utils.trial_scores import TrialScores # from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm -from hyperion.clustering import AHC +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm +from hyperion.np.clustering import AHC from hyperion.utils import Utt2Info -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.classifiers import BinaryLogisticRegression as LR def apply_ahc( diff --git a/egs/sre20-cts/v1/steps_be/eval-calibration-v1.py b/egs/sre20-cts/v1/steps_be/eval-calibration-v1.py index fb5dd6f9..1cf80177 100755 --- a/egs/sre20-cts/v1/steps_be/eval-calibration-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-calibration-v1.py @@ -18,8 +18,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): diff --git a/egs/sre20-cts/v1/steps_be/eval-calibration-v2.py b/egs/sre20-cts/v1/steps_be/eval-calibration-v2.py index e3d1db91..92d2c2d0 100755 --- a/egs/sre20-cts/v1/steps_be/eval-calibration-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-calibration-v2.py @@ -18,8 +18,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.utils import Utt2Info diff --git a/egs/sre20-cts/v1/steps_be/eval-fusion-v1.py b/egs/sre20-cts/v1/steps_be/eval-fusion-v1.py index 0d67a741..f1d90241 100755 --- a/egs/sre20-cts/v1/steps_be/eval-fusion-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-fusion-v1.py @@ -17,7 +17,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF def eval_fusion(in_score_files, ndx_file, model_file, out_score_file, fus_idx): diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-snorm-v1.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-snorm-v1.py index 651a1b7f..7ab376c1 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-snorm-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-snorm-v1.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import 
TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-v1.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-v1.py index 49ad3b42..50966aeb 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-v1.py @@ -20,7 +20,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda_e( diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v1.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v1.py index ac6710ad..e46f729b 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v1.py @@ -19,8 +19,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py index 7430caf4..c9657a66 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py @@ -16,10 +16,10 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores from hyperion.helpers import TrialDataReader as TDR -from hyperion.utils.math import cosine_scoring -from hyperion.pdfs import PLDA -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.utils.math_funcs import cosine_scoring +from hyperion.np.pdfs import PLDA +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-v1.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-v1.py index fb2904b1..698c0f32 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-v1.py @@ -18,7 +18,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py index 9eaea8b5..24ef731b 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py @@ -15,11 +15,11 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring -from hyperion.pdfs import PLDA +from hyperion.utils.math_funcs import cosine_scoring +from hyperion.np.pdfs import PLDA from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from 
hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/sre20-cts/v1/steps_be/train-calibration-v1.py b/egs/sre20-cts/v1/steps_be/train-calibration-v1.py index 779e62af..c57a1162 100755 --- a/egs/sre20-cts/v1/steps_be/train-calibration-v1.py +++ b/egs/sre20-cts/v1/steps_be/train-calibration-v1.py @@ -18,8 +18,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): diff --git a/egs/sre20-cts/v1/steps_be/train-calibration-v2.py b/egs/sre20-cts/v1/steps_be/train-calibration-v2.py index 16d09e3a..28597899 100755 --- a/egs/sre20-cts/v1/steps_be/train-calibration-v2.py +++ b/egs/sre20-cts/v1/steps_be/train-calibration-v2.py @@ -19,8 +19,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils import Utt2Info -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration_cond(cond, scr, key, model_file, prior, lambda_reg, verbose): diff --git a/egs/sre20-cts/v1/steps_be/train-fusion-v1.py b/egs/sre20-cts/v1/steps_be/train-fusion-v1.py index a76b2b6c..9c7f5315 100755 --- a/egs/sre20-cts/v1/steps_be/train-fusion-v1.py +++ b/egs/sre20-cts/v1/steps_be/train-fusion-v1.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF def train_fusion( diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py index a024281a..bdef3fc3 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py @@ -13,11 +13,11 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.helpers import VectorClassReader as VCR -from hyperion.pdfs import PLDA -from hyperion.transforms import TransformList, PCA, LDA, LNorm +from hyperion.np.pdfs import PLDA +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from numpy.linalg import matrix_rank diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py index 568e7edf..51795676 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py @@ -13,11 +13,11 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.io import 
RandomAccessDataReaderFactory as DRF from hyperion.helpers import VectorClassReader as VCR -from hyperion.pdfs import PLDA -from hyperion.transforms import TransformList, PCA, LDA, LNorm +from hyperion.np.pdfs import PLDA +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from numpy.linalg import matrix_rank, svd diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py index 7633cf17..79c1cd6f 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py @@ -13,11 +13,11 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.helpers import VectorClassReader as VCR -from hyperion.pdfs import PLDA, SPLDA -from hyperion.transforms import TransformList, PCA, LDA, LNorm +from hyperion.np.pdfs import PLDA, SPLDA +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from numpy.linalg import matrix_rank, svd diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-v1.py b/egs/sre20-cts/v1/steps_be/train-tel-be-v1.py index a388fb88..01d38b65 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-v1.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-v1.py @@ -11,7 +11,7 @@ import numpy as np from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList, PCA, LDA, LNorm +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-v3.py b/egs/sre20-cts/v1/steps_be/train-tel-be-v3.py index ac5bfa7e..e29da60b 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-v3.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-v3.py @@ -12,7 +12,7 @@ from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, PCA, LDA, LNorm +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils import Utt2Info diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-v4.py b/egs/sre20-cts/v1/steps_be/train-tel-be-v4.py index 7326d649..baef33f1 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-v4.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-v4.py @@ -12,7 +12,7 @@ from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, PCA, LDA, LNorm, CORAL +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm, CORAL from hyperion.helpers import PLDAFactory as F from hyperion.utils import Utt2Info from numpy.linalg import matrix_rank diff --git a/egs/sre21-av-a/v1.16k/README.md b/egs/sre21-av-a/v1.16k/README.md index e35577d7..d90dc0a4 100644 --- a/egs/sre21-av-a/v1.16k/README.md +++ b/egs/sre21-av-a/v1.16k/README.md @@ -7,6 +7,20 @@ The systems runs at 16 kHz, telephone data is upsampled to 16k using SoX This recipe is based on these works ``` +@inproceedings{Villalba2022, +author = {Jes\'us Villalba and Bengt J 
Borgstrom and Saurabh Kataria and Magdalena Rybicka and Carlos D Castillo and Jaejin Cho and L. Paola García-Perera and Pedro A. Torres-Carrasquillo and Najim Dehak},
+booktitle = {The Speaker and Language Recognition Workshop (Odyssey 2022)},
+doi = {10.21437/Odyssey.2022-30},
+month = {6},
+pages = {213-220},
+publisher = {ISCA},
+title = {Advances in Cross-Lingual and Cross-Source Audio-Visual Speaker Recognition: The JHU-MIT System for NIST SRE21},
+url = {https://www.isca-speech.org/archive/odyssey_2022/villalba22b_odyssey.html},
+year = {2022},
+}
+
 @inproceedings{Villalba2020,
 address = {Tokyo, Japan},
 author = {Villalba, Jes{\'{u}}s and Garcia-Romero, Daniel and Chen, Nanxin and Sell, Gregory and Borgstrom, Jonas and McCree, Alan and {Garcia Perera}, Leibny Paola and Kataria, Saurabh and Nidadavolu, Phani Sankar and Torres-Carrasquillo, Pedro and Dehak, Najim},
@@ -88,8 +102,6 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs
 - `run_011_train_xvector.sh`
   - Trains the x-vector network on 4sec chunks
-
-
 - `run_012_finetune_xvector.sh`
   - Fine-tune x-vector network on 10-15 secs utts
 - `run_013_prepare_langid_train_data.sh`
@@ -110,8 +122,8 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs
 - `run_040_eval_be_v1.sh, run_041_eval_be_v2.sh, run_042_eval_be_v3.sh, run_042b_eval_be_v3.sh`
   - Evals different back-end versions:
   - V1: Back-end trained on all data without adaptation
-  - V2: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and langauge, global PLDA adapted to SRE-Vox-CHN
-  - V3: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and langauge, source dependent PLDA adapted to SRE-CHN or Vox-CHN
+  - V2: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and language, global PLDA adapted to SRE-Vox-CHN
+  - V3: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and language, source dependent PLDA adapted to SRE-CHN or Vox-CHN
   - V3b: V3 with hyperparameters tuned for x-vectors trained on VoxCeleb only
 - `run_fus*.sh`
@@ -120,4 +132,39 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs
 ## Results
-TODO
+The back-end used for these results is:
+- back-end V2 (run_041_eval_be_v2.sh)
+- Without S-Norm
+- Scores are calibrated as indicated in the paper.
+
+## SRE16 Eval40% YUE
+
+| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary |
+| ------ | ---------- | ------------- | ------ | ------------- | ------------- |
+| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs<br>AAM-Softmax margin=0.5 | 1.57 | 0.135 | 0.237 |
+| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs<br>AAM-Softmax margin=0.5 | 1.23 | 0.136 | 0.187 |
+| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs<br>AAM-Softmax margin=0.5 | 1.38 | 0.147 | 0.189 |
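In these tables, Min. Cprimary is the detection cost at the best threshold in hindsight, while Act. Cprimary uses the fixed decision threshold implied by the target prior, so the minimum can never exceed the actual value. Below is a minimal NumPy sketch of that relation; the recipes compute it with `hyperion.np.metrics` (`compute_min_dcf`, `compute_act_dcf`), and the official Cprimary additionally averages two operating points, which this sketch omits. The prior value is illustrative.

```python
import numpy as np

def norm_dcf(p_miss, p_fa, prior):
    # normalized detection cost for unit miss/false-alarm costs
    return (prior * p_miss + (1 - prior) * p_fa) / min(prior, 1 - prior)

def min_and_act_dcf(tar, non, prior=0.05):
    # minimum cost: sweep every candidate threshold
    thr = np.sort(np.concatenate([tar, non]))
    p_miss = (tar[None, :] < thr[:, None]).mean(axis=1)
    p_fa = (non[None, :] >= thr[:, None]).mean(axis=1)
    min_dcf = norm_dcf(p_miss, p_fa, prior).min()
    # actual cost: hard decisions at the Bayes threshold for LLR scores
    bayes_thr = np.log((1 - prior) / prior)
    act_dcf = norm_dcf(float((tar < bayes_thr).mean()),
                       float((non >= bayes_thr).mean()), prior)
    return min_dcf, act_dcf
```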
+
+## SRE-CTS Superset dev set
+
+| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary |
+| ------ | ---------- | ------------- | ------ | ------------- | ------------- |
+| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs<br>AAM-Softmax margin=0.5 | 1.37 | 0.076 | 0.106 |
+| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs<br>AAM-Softmax margin=0.5 | 1.19 | 0.064 | 0.089 |
+| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs<br>AAM-Softmax margin=0.5 | 1.15 | 0.061 | 0.102 |
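The scores behind these tables come from the V2 back-end described above: centering (adapted to source and language), PCA, length normalization, then PLDA. The sketch below walks that chain with plain cosine scoring standing in for the PLDA log-likelihood ratio; the recipes build the real chain from `hyperion.np.transforms` (`PCA`, `LNorm`, `TransformList`) and `hyperion.np.pdfs.PLDA`, and every function name here is a local stand-in.

```python
import numpy as np

def fit_backend(X, pca_dim):
    """X: [n_segments, xvector_dim] back-end training x-vectors."""
    mu = X.mean(axis=0)              # centering; V2 adapts this per source/language
    _, _, Vt = np.linalg.svd(X - mu, full_matrices=False)
    return mu, Vt[:pca_dim].T        # PCA projection matrix

def transform(x, mu, P):
    y = (x - mu) @ P
    return y / np.linalg.norm(y)     # length normalization (LNorm)

def score(enroll, test, mu, P):
    # cosine stand-in; the recipes score with PLDA and optionally apply S-Norm
    return float(transform(enroll, mu, P) @ transform(test, mu, P))
```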
+
+## SRE21 Audio Dev (official scoring tool)
+
+| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary |
+| ------ | ---------- | ------------- | ------ | ------------- | ------------- |
+| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs<br>AAM-Softmax margin=0.5 | 5.91 | 0.393 | 0.409 |
+| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs<br>AAM-Softmax margin=0.5 | 5.22 | 0.370 | 0.377 |
+| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs<br>AAM-Softmax margin=0.5 | 4.79 | 0.309 | 0.325 |
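As noted above, all scores are calibrated before the actual costs are computed. In the back-end scripts touched by this diff, calibration is linear logistic regression on the raw scores (`hyperion.np.classifiers.BinaryLogisticRegression` in the train-calibration steps). A bare-bones gradient-descent version of that idea, fitting a scale and offset by prior-weighted cross-entropy, is sketched below; it is illustrative only, not the recipe code.

```python
import numpy as np

def train_calibration(tar, non, prior=0.05, lr=1e-3, steps=5000):
    """Learn a, b so that a*score + b behaves like a log-likelihood ratio."""
    a, b = 1.0, 0.0
    logit_prior = np.log(prior / (1 - prior))
    for _ in range(steps):
        # prior-weighted logistic-regression gradients
        g_tar = 1 / (1 + np.exp(a * tar + b + logit_prior))      # 1 - sigmoid
        g_non = 1 / (1 + np.exp(-(a * non + b + logit_prior)))   # sigmoid
        grad_a = -prior * (g_tar * tar).mean() + (1 - prior) * (g_non * non).mean()
        grad_b = -prior * g_tar.mean() + (1 - prior) * g_non.mean()
        a, b = a - lr * grad_a, b - lr * grad_b
    return a, b
```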
+
+## SRE21 Audio Eval (official scoring tool)
+
+| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary |
+| ------ | ---------- | ------------- | ------ | ------------- | ------------- |
+| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs<br>AAM-Softmax margin=0.5 | 5.68 | 0.395 | 0.401 |
+| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs<br>AAM-Softmax margin=0.5 | 4.92 | 0.405 | 0.412 |
+| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.80 | 0.357 | 0.360 | diff --git a/egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml b/egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml deleted file mode 100644 index 5451702f..00000000 --- a/egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml +++ /dev/null @@ -1,59 +0,0 @@ -min_chunk_length: 4.0 -max_chunk_length: 4.0 -return_fullseqs: false -wav_scale: 32767 -batch_size: 512 -var_batch_size: false -iters_per_epoch: 6.0 -train_aug_cfg: conf/reverb_noise_aug.yaml -val_aug_cfg: conf/reverb_noise_aug.yaml -feats: fbank64_stmn_nb_16k.yaml -pool_net: - pool_type: ch-wise-att-mean+stddev - inner_feats: 32 -embed_dim: 32 -num_embed_layers: 1 -hid_act: relu6 -loss_type: arc-softmax -s: 30.0 -margin: 0.3 -margin_warmup_epochs: 30.0 -dropout_rate: 0.0 -in_feats: 64 -resnet_type: lresnet34 -in_channels: 1 -conv_channels: 64 -base_channels: 64 -in_kernel_size: 3 -in_stride: 1 -in_norm: false -no_maxpool: true -optim: - opt_type: adam - lr: 0.02 - # lr: 0.01 - beta1: 0.9 - beta2: 0.95 - amsgrad: true - weight_decay: 1e-5 -lrsched: - lrsch_type: exp_lr - decay_rate: 0.5 - decay_steps: 8000 - hold_steps: 10000 - min_lr: 1.0e-05 - warmup_steps: 1000 - update_lr_on_opt_step: true -grad_acc_steps: 1 -epochs: 70 -log_interval: 100 -use_tensorboard: false -use_wandb: false -wandb: - mode: online -ddp_type: ddp -use_amp: true -swa_start: 0 -swa_lr: 0.001 -swa_anneal_epochs: 10 -num_gpus: 4 diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..d68ea26e --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -0,0 +1,104 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + multilayer: true + multilayer_concat: true + endpoint_channels: 8192 + dropout_rate: 0.0 + hid_act: relu6 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 35000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + 
log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..30483a8b --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml b/egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml new file mode 100644 index 00000000..c46365db --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml @@ -0,0 +1,78 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + data_loader: + num_workers: 8 +feats: fbank64_stmn_nb_16k.yaml +model: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 64 + conv_channels: 64 + in_kernel_size: 3 + in_stride: 1 + in_norm: false + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 32 + embed_dim: 32 + num_embed_layers: 1 + hid_act: relu6 + loss_type: arc-softmax + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 30.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.02 + beta1: 0.9 + beta2: 0.95 + amsgrad: true + weight_decay: 1e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 10000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + epochs: 70 + log_interval: 100 + use_amp: true + diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml 
new file mode 100644 index 00000000..7a9234b6 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml @@ -0,0 +1,80 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + pool_net: + pool_type: mean+stddev + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.05 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 50 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..49f84a6a --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 21 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..4c427202 --- /dev/null +++ 
b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml @@ -0,0 +1,82 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..c85c0e7b --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..10607607 --- /dev/null +++ 
b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml @@ -0,0 +1,83 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: tseres2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + se_r: 256 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..c85c0e7b --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh 
index c8732c36..1da68697 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,53 +9,19 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 lr=0.02 nnet_type=resnet1d -block_type=seres2bn # squeeze-excitation res2net bottleneck -channels=2048 -ep_channels=8192 -width_factor=1 -scale=8 -se_r=16 dropout=0 -attstats_inner=128 embed_dim=256 s=30 margin_warmup=20 margin=0.3 -nnet_opt="--resnet_enc.in-feats 80 \ - --resnet_enc.in-conv-channels $channels \ - --resnet_enc.in-kernel-size 5 \ - --resnet_enc.in-stride 1 \ - --resnet_enc.resb-type $block_type \ - --resnet_enc.resb-repeats 1 1 1 1 \ - --resnet_enc.resb-channels $channels \ - --resnet_enc.resb-kernel-sizes 3 \ - --resnet_enc.resb-dilations 2 3 4 5 \ - --resnet_enc.resb-strides 1 \ - --resnet_enc.res2net-width-factor $width_factor \ - --resnet_enc.res2net-scale $scale \ - --resnet_enc.se-r $se_r \ - --resnet_enc.multilayer \ - --resnet_enc.multilayer-concat \ - --resnet_enc.endpoint-channels $ep_channels \ - --pool_net.pool-type ch-wise-att-mean+stddev \ - --pool_net.inner-feats $attstats_inner \ - --embed-dim $embed_dim" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 35000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - +nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml nnet_name=${feat_type}_ecapatdnn2048x4_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_num_epochs=75 nnet_dir=exp/xvector_nnets/$nnet_name @@ -63,18 +29,14 @@ nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=15 -ft_ipe=1 ft_lr=0.01 ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth @@ -88,7 +50,4 @@ else plda_data=voxceleb2cat_train_augx${plda_num_augs} fi plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh index 1903369e..6d14f27d 100644 --- 
a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh @@ -1,4 +1,4 @@ -# LResNet34 x-vector with mixed precision training +# Res2Net50 w26s4 x-vector with mixed precision training # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,50 +9,33 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=24 eff_batch_size=512 # effective batch size -ipe=1 min_chunk=4 max_chunk=4 lr=0.05 -nnet_type=res2net50 +nnet_type=resnet dropout=0 embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 s=30 margin_warmup=20 margin=0.3 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 50 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1.$nnet_data -nnet_num_epochs=60 +nnet_base_cfg=conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_res2net50w26s4_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1.$nnet_data nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0071.pth - +nnet=$nnet_dir/model_ep0061.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=15 -ft_ipe=1 ft_lr=0.01 -ft_nnet_num_epochs=21 ft_margin=0.5 -ft_margin_warmup=5 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0021.pth @@ -61,7 +44,4 @@ ft_nnet=$ft_nnet_dir/model_ep0021.pth plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh deleted file mode 100644 index 344e1288..00000000 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh +++ /dev/null @@ -1,67 +0,0 @@ -# LResNet34 x-vector with mixed precision training - -# acoustic features -feat_config=conf/fbank80_stmn_16k.yaml -feat_type=fbank80_stmn - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxcelebcat 
-aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=24 -eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 -lr=0.02 - -nnet_type=res2net50 -dropout=0 -embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 - -s=30 -margin_warmup=20 -margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 50 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1.$nnet_data -nnet_num_epochs=60 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth -#nnet=$nnet_dir/swa_model_ep0061.pth - -# xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 -ft_eff_batch_size=128 # effective batch size -ft_min_chunk=10 -ft_max_chunk=10 -ft_ipe=1 -ft_lr=0.01 -ft_nnet_num_epochs=15 -ft_margin=0.5 -ft_margin_warmup=3 - -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 -ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name -ft_nnet=$ft_nnet_dir/model_ep0007.pth - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index cae32b57..0b62008e 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,103 +9,40 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=1 min_chunk=4 max_chunk=4 lr=0.02 -nnet_type=res2net50 +nnet_type=resnet dropout=0 embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 s=30 margin_warmup=20 margin=0.3 attstats_inner=128 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" 
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 +nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_res2net50w26s8_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=10 -ft_ipe=1 ft_lr=0.01 -ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth - -# xvector last-layer finetuning in-domain -reg_layers_classif=0 -reg_layers_enc="0 1 2 3 4" -nnet_adapt_data=voxcelebcat_sre_alllangs_mixfs_chnspks - -# ft2_batch_size_1gpu=4 -# ft2_eff_batch_size=128 # effective batch size -# ft2_ipe=4 -# ft2_lr=0.01 -# ft2_nnet_num_epochs=12 -# ft2_margin_warmup=3 -# ft2_reg_weight_embed=0.1 -# ft2_min_chunk=10 -# ft2_max_chunk=60 - -# ft2_opt_opt="--optim.opt-type sgd --optim.lr $ft2_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -# ft2_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -# ft2_nnet_name=${ft_nnet_name}.ft_eaffine_rege_w${ft2_reg_weigth_embed}_${ft2_min_chunk}_${ft2_max_chunk}_sgdcos_lr${ft2_lr}_b${ft2_eff_batch_size}_amp.v2 -# ft2_nnet_dir=exp/xvector_nnets/$ft2_nnet_name -# ft2_nnet=$ft2_nnet_dir/model_ep0010.pth - - -# xvector full nnet finetuning -ft2_batch_size_1gpu=6 -ft2_eff_batch_size=128 # effective batch size -ft2_ipe=1 -ft2_lr=0.01 -ft2_nnet_num_epochs=15 -ft2_margin=0.5 -ft2_margin_warmup=3 -ft2_reg_weight_embed=0.1 -ft2_reg_weight_enc=0.1 -ft2_min_chunk=10 -ft2_max_chunk=10 - -ft2_opt_opt="--optim.opt-type sgd --optim.lr $ft2_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft2_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft2_nnet_name=${ft_nnet_name}.ft_reg_wenc${ft2_reg_weight_enc}_we${ft2_reg_weight_embed}_${ft2_min_chunk}_${ft2_max_chunk}_sgdcos_lr${ft2_lr}_b${ft2_eff_batch_size}_amp.v1 -ft2_nnet_dir=exp/xvector_nnets/$ft2_nnet_name -ft2_nnet=$ft2_nnet_dir/model_ep0012.pth - - # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git 
a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index 96475c53..a57f16d9 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,21 +9,15 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=24 eff_batch_size=512 # effective batch size -ipe=1 min_chunk=4 max_chunk=4 lr=0.02 -nnet_type=tseres2net50 +nnet_type=resnet dropout=0 embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 se_r=256 s=30 @@ -31,13 +25,8 @@ margin_warmup=20 margin=0.3 attstats_inner=128 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 +nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_tseres2net50w26s4_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0075.pth nnet=$nnet_dir/swa_model_ep0076.pth @@ -49,12 +38,9 @@ ft_min_chunk=10 ft_max_chunk=15 ft_ipe=1 ft_lr=0.01 -ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth @@ -69,7 +55,4 @@ else plda_data=voxceleb2cat_train_augx${plda_num_augs} fi plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh similarity index 100% rename from egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh rename to egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh diff 
--git a/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh new file mode 100644 index 00000000..b5863308 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh @@ -0,0 +1,49 @@ +# Res2Net50 w26s8 x-vector with mixed precision training + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxcelebcat + +eff_batch_size=512 # effective batch size +min_chunk=4 +max_chunk=4 +lr=0.02 + +nnet_type=resnet +dropout=0 +embed_dim=256 + +s=30 +margin_warmup=20 +margin=0.3 +attstats_inner=128 + +nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml +nnet_args="--model.pool_net.pool-type mean+stddev" +nnet_name=${feat_type}_res2net50w26s8_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0070.pth +#nnet=$nnet_dir/swa_model_ep0076.pth + +# xvector full net finetuning with out-of-domain +ft_eff_batch_size=128 # effective batch size +ft_min_chunk=10 +ft_max_chunk=10 +ft_lr=0.01 +ft_margin=0.5 + +ft_nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml +ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 +ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name +ft_nnet=$ft_nnet_dir/model_ep0007.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +plda_type=splda diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh similarity index 100% rename from egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh rename to egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh diff --git a/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl b/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl index 27b1f152..18b6d40c 100755 --- a/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl +++ b/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl @@ -31,7 +31,7 @@ my $meta_path = "$data_base/vox1_meta.csv"; if (! -e "$meta_path") { $meta_path = "$out_dir/vox1_meta.csv"; - system("wget -O $meta_path $meta_url"); + system("wget --no-check-certificate -O $meta_path $meta_url"); } open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path"; @@ -53,7 +53,7 @@ my $lid_path = "$data_base/lang_vox1_final.csv"; if (! 
-e "$lid_path") { $lid_path = "$out_dir/lang_vox1_final.csv"; - system("wget -O $lid_path $lid_url"); + system("wget --no-check-certificate -O $lid_path $lid_url"); } open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; my %utt2lang = (); diff --git a/egs/sre21-av-a/v1.16k/local/plot-tsne-cts.py b/egs/sre21-av-a/v1.16k/local/plot-tsne-cts.py index 25cddea8..46769568 100755 --- a/egs/sre21-av-a/v1.16k/local/plot-tsne-cts.py +++ b/egs/sre21-av-a/v1.16k/local/plot-tsne-cts.py @@ -16,7 +16,7 @@ from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.transforms import PCA, SklTSNE, LNorm +from hyperion.np.transforms import PCA, SklTSNE, LNorm colors = ["b", "g", "r", "c", "m", "y", "k"] markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] diff --git a/egs/sre21-av-a/v1.16k/local/score_sre16.py b/egs/sre21-av-a/v1.16k/local/score_sre16.py index 4064b64f..af44fb53 100755 --- a/egs/sre21-av-a/v1.16k/local/score_sre16.py +++ b/egs/sre21-av-a/v1.16k/local/score_sre16.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_file): diff --git a/egs/sre21-av-a/v1.16k/local/score_sre21.py b/egs/sre21-av-a/v1.16k/local/score_sre21.py index 986aa3f6..72fc1a13 100755 --- a/egs/sre21-av-a/v1.16k/local/score_sre21.py +++ b/egs/sre21-av-a/v1.16k/local/score_sre21.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score(key_file, score_file, sre21_subset, output_file): diff --git a/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh b/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh index a5bc03eb..e56906f6 100755 --- a/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh +++ b/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh @@ -18,7 +18,7 @@ echo "Score SRE21 ${track} ${subset} for $score_dir" soft_dir=./sre21/scoring_software -if [ ! -f $s_dir/sre_scorer.py ];then +if [ ! 
-f $soft_dir/sre_scorer.py ];then echo "downloading scoring tool" local/download_sre21_scoring_tool.sh fi diff --git a/egs/sre21-av-a/v1.16k/local/score_sre_cts_superset.py b/egs/sre21-av-a/v1.16k/local/score_sre_cts_superset.py index 3f2223a4..bb61ca18 100755 --- a/egs/sre21-av-a/v1.16k/local/score_sre_cts_superset.py +++ b/egs/sre21-av-a/v1.16k/local/score_sre_cts_superset.py @@ -14,7 +14,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score(key_file, score_file, output_file): diff --git a/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh b/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh index f7aa7828..08f655ea 100755 --- a/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh +++ b/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh @@ -9,7 +9,6 @@ set -e nodes=fs01 storage_name=$(date +'%m_%d_%H_%M') vaddir=`pwd`/exp/vad_e -vad_config=conf/vad_16k.yaml stage=1 config_file=default_config.sh @@ -75,41 +74,3 @@ if [ $stage -le 3 ];then done fi -# #Enroll multi-speaker Datasets with time marks -# if [ $stage -le 3 ];then -# for name in sre18_dev_enroll_vast sre18_eval_enroll_vast sre19_av_a_dev_enroll sre19_av_a_eval_enroll -# do -# num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') -# nj=$(($num_spk < 40 ? $num_spk:40)) -# # we just run energy vad to get the utt2num_frames file -# hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ -# --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ -# data/${name} exp/make_vad/$name $vaddir -# utils/fix_data_dir.sh data/${name} -# local/sre18_diar_to_vad.sh data/${name} exp/make_vad $vaddir -# utils/fix_data_dir.sh data/${name} -# done -# fi - -# #Dihard Datasets -# if [ $stage -le 4 ];then -# for name in dihard2_train_dev dihard2_train_eval -# do -# num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') -# nj=$(($num_spk < 40 ? $num_spk:40)) -# # we just run energy vad to get the utt2num_frames file -# hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ -# --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ -# data/${name} exp/make_vad/$name $vaddir -# hyp_utils/rttm_to_bin_vad.sh --nj 5 data/$name/vad.rttm data/$name $vaddir -# utils/fix_data_dir.sh data/${name} -# done - -# fi - -# if [ $stage -le 5 ];then -# utils/combine_data.sh --extra-files "utt2num_frames" data/dihard2_train data/dihard2_train_dev data/dihard2_train_eval -# utils/fix_data_dir.sh data/dihard2_train -# fi - - diff --git a/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh b/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh index 0608929c..d7ea8ed0 100755 --- a/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh +++ b/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh @@ -10,28 +10,66 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 +num_workers="" . parse_options.sh || exit 1; . $config_file . 
datapath.sh -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/${nnet_data}_proc_audio_no_sil -args="" -if [ "$resume" == "true" ];then - args="--resume" +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$interactive" == "true" ];then export cuda_cmd=run.pl fi +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $nnet_type \ + --cfg $nnet_base_cfg $nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir \ + --num-gpus $ngpu \ + +fi + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + mkdir -p $ft_nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $ft_nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_xvector_from_wav.py $nnet_type \ + --cfg $ft_nnet_base_cfg $ft_nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --in-model-file $nnet \ + --trainer.exp-path $ft_nnet_dir \ + --num-gpus $ngpu \ + +fi +exit + # Network Training if [ $stage -le 1 ]; then diff --git a/egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh b/egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh deleted file mode 100755 index 58a3fdc9..00000000 --- a/egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# Copyright -# 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -ngpu=4 -config_file=default_config.sh -resume=false -interactive=false -num_workers=3 - -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -batch_size=$(($ft_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $ft_eff_batch_size $ft_min_chunk $ft_max_chunk | awk '{ print int($2/($1*$4*2/($3+$4))+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${nnet_data}_proc_audio_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi - -if [ "$interactive" == "true" ];then - export cuda_cmd=run.pl -fi - -# Network Training -if [ $stage -le 1 ]; then - mkdir -p $ft_nnet_dir/log - $cuda_cmd --gpu $ngpu $ft_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-finetune-xvec-from-wav.py --feats $feat_config $aug_opt \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --min-chunk-length $ft_min_chunk --max-chunk-length $ft_max_chunk \ - --iters-per-epoch $ft_ipe \ - --batch-size $batch_size \ - --num-workers $num_workers $ft_opt_opt $ft_lrs_opt \ - --grad-acc-steps $grad_acc_steps \ - --epochs $ft_nnet_num_epochs \ - --cos-scale $s --margin $ft_margin --margin-warmup-epochs $ft_margin_warmup \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --in-model-path $nnet \ - --train-mode ft-full \ - --exp-path $ft_nnet_dir $args - -fi - - diff --git a/egs/sre21-av-a/v1.16k/run_014_train_lid.sh b/egs/sre21-av-a/v1.16k/run_014_train_lid.sh index 6251de97..35d2c0bc 100755 --- a/egs/sre21-av-a/v1.16k/run_014_train_lid.sh +++ b/egs/sre21-av-a/v1.16k/run_014_train_lid.sh @@ -10,19 +10,17 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 -lid_ipe=1 +num_workers="" + . parse_options.sh || exit 1; . $config_file . 
datapath.sh list_dir=data/train_lid_proc_audio_no_sil -args="" -if [ "$resume" == "true" ];then - args="--resume" +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$interactive" == "true" ];then @@ -33,22 +31,20 @@ lid_nnet_dir=exp/lid_nnets/lresnet34_lid_v1 # Network Training if [ $stage -le 1 ]; then - train_exec=torch-train-resnet-xvec-from-wav.py mkdir -p $lid_nnet_dir/log $cuda_cmd \ --gpu $ngpu $lid_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - $train_exec --cfg conf/lresnet34_lid_v1.yaml \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_train_lid/train.scp \ - --val-list $list_dir/lists_train_lid/val.scp \ - --class-file $list_dir/lists_train_lid/class2int \ - --iters-per-epoch $lid_ipe \ - --num-workers $num_workers \ - --num-gpus $ngpu \ - --exp-path $lid_nnet_dir $args - + train_xvector_from_wav.py resnet \ + --cfg conf/train_lresnet34_lid_v1.yaml \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_train_lid/train.scp \ + --data.train.dataset.class-file $list_dir/lists_train_lid/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_train_lid/val.scp \ + --trainer.exp-path $lid_nnet_dir $extra_args \ + --num-gpus $ngpu fi -exit diff --git a/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh b/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh index 0941951f..73cb9a3d 100755 --- a/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh +++ b/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh @@ -195,7 +195,7 @@ if [ $stage -le 5 ]; then #SRE superset and 16 echo "SRE Superset Dev" steps_be/eval_be_plda_snorm_v2_cts.sh \ - --cmd "$train_cmd --mem 8G" \ + --cmd "$train_cmd --mem 12G" \ --plda_type $plda_type --ncoh $ncoh --num-parts 100 \ data/sre_cts_superset_16k_dev/trials \ data/sre_cts_superset_16k_dev/utt2enroll \ diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-snorm-v1.py b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-snorm-v1.py index f265ca30..51d21312 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-snorm-v1.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-snorm-v1.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-snorm-v2.py b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-snorm-v2.py index 35b2d501..a9e7ee03 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-snorm-v2.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-snorm-v2.py @@ -20,8 +20,8 @@ from hyperion.utils import TrialNdx, TrialScores, Utt2Info from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.io import RandomAccessDataReaderFactory as DRF diff 
--git a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v1.py b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v1.py index d122d14c..1e45f560 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v1.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v1.py @@ -20,7 +20,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v2.py b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v2.py index 3051383b..2eda0f47 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v2.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v2.py @@ -19,7 +19,7 @@ from hyperion.utils import TrialNdx, TrialScores, Utt2Info from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList conds = [ "cts_eng", diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v3.py b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v3.py index ee0cb558..8cceb387 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v3.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v3.py @@ -19,7 +19,7 @@ from hyperion.utils import TrialNdx, TrialScores, Utt2Info from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList conds = [ "cts_eng", diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre-superset.py b/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre-superset.py index 0781f9f2..21d2337b 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre-superset.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre-superset.py @@ -18,8 +18,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialScores, TrialKey, TrialNdx, Utt2Info from hyperion.utils.list_utils import ismember -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def read_ndx_and_scores(ndx_file, score_file): diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre16.py b/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre16.py index 7880e358..6b2da927 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre16.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre16.py @@ -18,7 +18,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialScores, TrialKey, TrialNdx, Utt2Info from hyperion.utils.list_utils import ismember -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.classifiers import BinaryLogisticRegression as LR def read_ndx_and_scores(ndx_file, score_file): diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre21-dev.py b/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre21-dev.py index ce2be18c..240baf82 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre21-dev.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre21-dev.py @@ -18,8 +18,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import 
TrialScores, TrialKey, TrialNdx, Utt2Info from hyperion.utils.list_utils import ismember -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def read_ndx_and_scores(ndx_file, score_file): diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre21-eval.py b/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre21-eval.py index 407d78dc..50ce6943 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre21-eval.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre21-eval.py @@ -18,8 +18,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialScores, TrialKey, TrialNdx, Utt2Info from hyperion.utils.list_utils import ismember -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def read_ndx_and_scores(ndx_file, score_file): diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v1.py b/egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v1.py index 205a73d3..933f8864 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v1.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v1.py @@ -17,7 +17,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF def eval_fusion(in_score_files, ndx_file, model_file, out_score_file, fus_idx): diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v2.py b/egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v2.py index 0fa1ee59..081d8f23 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v2.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v2.py @@ -16,7 +16,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialScores, TrialKey, TrialNdx, Utt2Info from hyperion.utils.list_utils import ismember -from hyperion.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF def read_ndx(ndx_file): diff --git a/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v1.py b/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v1.py index 42d5d927..d7ba9129 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v1.py +++ b/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v1.py @@ -18,7 +18,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList, PCA, LDA, LNorm +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.scp_list import SCPList diff --git a/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v2.py b/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v2.py index 082f69a6..f38445c5 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v2.py +++ b/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v2.py @@ -18,7 +18,7 @@ import pandas as pd from hyperion.hyp_defs import float_cpu, config_logger -from hyperion.transforms import TransformList, PCA, LDA, LNorm +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers 
import PLDAFactory as F from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF diff --git a/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v3.py b/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v3.py index 423ab265..febda665 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v3.py +++ b/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v3.py @@ -18,7 +18,7 @@ import pandas as pd from hyperion.hyp_defs import float_cpu, config_logger -from hyperion.transforms import TransformList, PCA, LDA, LNorm +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF diff --git a/egs/sre21-av-a/v1.16k/steps_be/train-calibration-v1.py b/egs/sre21-av-a/v1.16k/steps_be/train-calibration-v1.py index 2c5fa488..01a26410 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/train-calibration-v1.py +++ b/egs/sre21-av-a/v1.16k/steps_be/train-calibration-v1.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialScores, TrialKey, Utt2Info from hyperion.utils.list_utils import ismember -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def read_key_and_scores(key_file, score_file): diff --git a/egs/sre21-av-a/v1.16k/steps_be/train-fusion-v1.py b/egs/sre21-av-a/v1.16k/steps_be/train-fusion-v1.py index 8935b431..65c78b41 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/train-fusion-v1.py +++ b/egs/sre21-av-a/v1.16k/steps_be/train-fusion-v1.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF def train_fusion( diff --git a/egs/sre21-av-a/v1.16k/steps_be/train-fusion-v2.py b/egs/sre21-av-a/v1.16k/steps_be/train-fusion-v2.py index 1c51111c..b447b81e 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/train-fusion-v2.py +++ b/egs/sre21-av-a/v1.16k/steps_be/train-fusion-v2.py @@ -17,9 +17,9 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import GreedyFusionBinaryLR as GF -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_fusion_condition( diff --git a/egs/sre21-av-a/v1.8k/README.md b/egs/sre21-av-a/v1.8k/README.md index a105128c..b55f9bf0 100644 --- a/egs/sre21-av-a/v1.8k/README.md +++ b/egs/sre21-av-a/v1.8k/README.md @@ -10,6 +10,20 @@ copy the utt2est_lang files from the 16k data dirs to the VoxCeleb and SRE21 dat This recipe is based on these works ``` +@inproceedings{Villalba2022, +author = {Jes\'us Villalba and Bengt J Borgstrom and Saurabh Kataria and Magdalena Rybicka and Carlos D Castillo and Jaejin Cho and L. 
Paola García-Perera and Pedro A. Torres-Carrasquillo and Najim Dehak}, +city = {ISCA}, +doi = {10.21437/Odyssey.2022-30}, +issue = {July}, +journal = {The Speaker and Language Recognition Workshop (Odyssey 2022)}, +month = {6}, +pages = {213-220}, +publisher = {ISCA}, +title = {Advances in Cross-Lingual and Cross-Source Audio-Visual Speaker Recognition: The JHU-MIT System for NIST SRE21}, +url = {https://www.isca-speech.org/archive/odyssey_2022/villalba22b_odyssey.html}, +year = {2022}, +} + @inproceedings{Villalba2020, address = {Tokyo, Japan}, author = {Villalba, Jes{\'{u}}s and Garcia-Romero, Daniel and Chen, Nanxin and Sell, Gregory and Borgstrom, Jonas and McCree, Alan and {Garcia Perera}, Leibny Paola and Kataria, Saurabh and Nidadavolu, Phani Sankar and Torres-Carrasquiilo, Pedro and Dehak, Najim}, @@ -91,8 +105,6 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs - `run_011_train_xvector.sh` - Trains the x-vector network on 4sec chunks - - - `run_012_finetune_xvector.sh` - Fine-tune x-vector network on 10-15 secs utts - `run_030_extract_xvectors.sh` @@ -111,4 +123,39 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs ## Results -TODO +The back-end used for these results is: +- back-end V2 (run_041_eval_be_v2.sh) +- Without S-Norm +- Scores are calibrated as indicated in the paper. + +## SRE16 Eval40% YUE + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs<br>AAM-Softmax margin=0.5 | 1.922 | 0.154 | 0.200 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs<br>AAM-Softmax margin=0.5 | 1.168 | 0.127 | 0.134 | + + +## SRE-CTS Superset dev set + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs<br>AAM-Softmax margin=0.5 | 1.39 | 0.072 | 0.095 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs<br>AAM-Softmax margin=0.5 | 1.175 | 0.057 | 0.069 | + + +## SRE21 Audio Dev (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs<br>AAM-Softmax margin=0.5 | 6.65 | 0.418 | 0.436 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs<br>AAM-Softmax margin=0.5 | 3.73 | 0.319 | 0.325 | + + +## SRE21 Audio Eval (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs<br>AAM-Softmax margin=0.5 | 5.44 | 0.388 | 0.390 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs<br>AAM-Softmax margin=0.5 | 4.21 | 0.356 | 0.377 | + 
diff --git a/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..bc311234 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -0,0 +1,104 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + multilayer: true + multilayer_concat: true + endpoint_channels: 8192 + dropout_rate: 0.0 + hid_act: relu6 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 30000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 
diff --git a/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..72dec1b7 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + 
weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..416926d0 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml @@ -0,0 +1,82 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 35000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..1b917e2c --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + 
t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 diff --git a/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..2d74799c --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml @@ -0,0 +1,83 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_type: tseres2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + se_r: 256 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..c85c0e7b --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + 
gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.8k/default_config.sh b/egs/sre21-av-a/v1.8k/default_config.sh index 91a20745..74b76b0a 120000 --- a/egs/sre21-av-a/v1.8k/default_config.sh +++ b/egs/sre21-av-a/v1.8k/default_config.sh @@ -1 +1 @@ -global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh \ No newline at end of file +global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh \ No newline at end of file diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index 69ad025b..65c2c924 100644 --- a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,53 +9,19 @@ vad_config=conf/vad_8k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 lr=0.02 nnet_type=resnet1d -block_type=seres2bn # squeeze-excitation res2net bottleneck -channels=2048 -ep_channels=8192 -width_factor=1 -scale=8 -se_r=16 dropout=0 -attstats_inner=128 embed_dim=256 s=30 margin_warmup=20 margin=0.3 -nnet_opt="--resnet_enc.in-feats 64 \ - --resnet_enc.in-conv-channels $channels \ - --resnet_enc.in-kernel-size 5 \ - --resnet_enc.in-stride 1 \ - --resnet_enc.resb-type $block_type \ - --resnet_enc.resb-repeats 1 1 1 1 \ - --resnet_enc.resb-channels $channels \ - --resnet_enc.resb-kernel-sizes 3 \ - --resnet_enc.resb-dilations 2 3 4 5 \ - --resnet_enc.resb-strides 1 \ - --resnet_enc.res2net-width-factor $width_factor \ - --resnet_enc.res2net-scale $scale \ - --resnet_enc.se-r $se_r \ - --resnet_enc.multilayer \ - --resnet_enc.multilayer-concat \ - --resnet_enc.endpoint-channels $ep_channels \ - --pool_net.pool-type ch-wise-att-mean+stddev \ - --pool_net.inner-feats $attstats_inner \ - --embed-dim $embed_dim" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 30000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - +nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml nnet_name=${feat_type}_ecapatdnn2048x4_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_num_epochs=75 nnet_dir=exp/xvector_nnets/$nnet_name @@ -63,18 +29,14 @@ nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=15 -ft_ipe=1 ft_lr=0.01 ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr 
--lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth @@ -82,8 +44,10 @@ ft_nnet=$ft_nnet_dir/model_ep0007.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh new file mode 100644 index 00000000..824361d0 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -0,0 +1,48 @@ +# Res2Net50 w26s8 x-vector with mixed precision training + +# acoustic features +feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=voxcelebcat_sre_alllangs_mixfs + +eff_batch_size=512 # effective batch size +min_chunk=4 +max_chunk=4 +lr=0.02 + +nnet_type=resnet +dropout=0 +embed_dim=256 + +s=30 +margin_warmup=20 +margin=0.3 +attstats_inner=128 + +nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_res2net50w26s8_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0070.pth +nnet=$nnet_dir/swa_model_ep0076.pth + +# xvector full net finetuning with out-of-domain +ft_eff_batch_size=128 # effective batch size +ft_min_chunk=10 +ft_max_chunk=10 +ft_lr=0.01 +ft_margin=0.5 + +ft_nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml +ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 +ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name +ft_nnet=$ft_nnet_dir/model_ep0007.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +plda_type=splda diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh deleted file mode 100644 index e1a923d7..00000000 --- a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ /dev/null @@ -1,68 +0,0 @@ -# LResNet34 x-vector with mixed precision training - -# acoustic features -feat_config=conf/fbank64_stmn_8k.yaml -feat_type=fbank64_stmn - -#vad -vad_config=conf/vad_8k.yaml - -# x-vector training -nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=16 -eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 -lr=0.02 - -nnet_type=res2net50 -dropout=0 -embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 - -s=30 -margin_warmup=20 -margin=0.3 -attstats_inner=128 - -nnet_opt="--resnet-type $nnet_type 
--in-feats 64 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 35000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth -nnet=$nnet_dir/swa_model_ep0076.pth - -# xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 -ft_eff_batch_size=128 # effective batch size -ft_min_chunk=10 -ft_max_chunk=10 -ft_ipe=1 -ft_lr=0.01 -ft_nnet_num_epochs=15 -ft_margin=0.5 -ft_margin_warmup=3 - -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 -ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name -ft_nnet=$ft_nnet_dir/model_ep0007.pth - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh new file mode 100644 index 00000000..58010842 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -0,0 +1,58 @@ +# Time SE Res2Net50 w26s4 x-vector with mixed precision training + +# acoustic features +feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=voxcelebcat_sre_alllangs_mixfs + +eff_batch_size=512 # effective batch size +min_chunk=4 +max_chunk=4 +lr=0.02 + +nnet_type=resnet +dropout=0 +embed_dim=256 +se_r=256 + +s=30 +margin_warmup=20 +margin=0.3 +attstats_inner=128 + +nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_tseres2net50w26s4_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0075.pth +nnet=$nnet_dir/swa_model_ep0076.pth + +# xvector full net finetuning with out-of-domain +ft_batch_size_1gpu=8 +ft_eff_batch_size=128 # effective batch size +ft_min_chunk=10 +ft_max_chunk=10 +ft_ipe=1 +ft_lr=0.01 +ft_margin=0.5 + +ft_nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml +ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 +ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name +ft_nnet=$ft_nnet_dir/model_ep0007.pth + + +# back-end 
+plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda + diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh deleted file mode 100644 index 9f5c8e70..00000000 --- a/egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ /dev/null @@ -1,76 +0,0 @@ -# Time SE Res2Net50 w26s4 x-vector with mixed precision training - -# acoustic features -feat_config=conf/fbank80_stmn_8k.yaml -feat_type=fbank80_stmn - -#vad -vad_config=conf/vad_8k.yaml - -# x-vector training -nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=24 -eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 -lr=0.02 - -nnet_type=tseres2net50 -dropout=0 -embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 -se_r=256 - -s=30 -margin_warmup=20 -margin=0.3 -attstats_inner=128 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0075.pth -nnet=$nnet_dir/swa_model_ep0076.pth - -# xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 -ft_eff_batch_size=128 # effective batch size -ft_min_chunk=10 -ft_max_chunk=15 -ft_ipe=1 -ft_lr=0.01 -ft_nnet_num_epochs=21 -ft_nnet_num_epochs=45 -ft_margin=0.5 -ft_margin_warmup=3 - -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 -ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name -ft_nnet=$ft_nnet_dir/model_ep0014.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh b/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh index 9891e812..1ffd35a8 100755 --- a/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh +++ b/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh @@ 
-10,22 +10,17 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 +num_workers="" . parse_options.sh || exit 1; . $config_file . datapath.sh -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/${nnet_data}_proc_audio_no_sil -args="" -if [ "$resume" == "true" ];then - args="--resume" +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$interactive" == "true" ];then @@ -35,45 +30,41 @@ fi # Network Training if [ $stage -le 1 ]; then - if [[ ${nnet_type} =~ resnet1d ]]; then - train_exec=torch-train-resnet1d-xvec-from-wav.py - elif [[ ${nnet_type} =~ resnet ]] || [[ ${nnet_type} =~ resnext ]] || [[ ${nnet_type} =~ res2net ]] || [[ ${nnet_type} =~ res2next ]]; then - train_exec=torch-train-resnet-xvec-from-wav.py - elif [[ ${nnet_type} =~ efficientnet ]]; then - train_exec=torch-train-efficientnet-xvec-from-wav.py - elif [[ ${nnet_type} =~ tdnn ]]; then - train_exec=torch-train-tdnn-xvec-from-wav.py - elif [[ ${nnet_type} =~ transformer ]]; then - train_exec=torch-train-transformer-xvec-v1-from-wav.py - else - echo "$nnet_type not supported" - exit 1 - fi - mkdir -p $nnet_dir/log $cuda_cmd \ --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - $train_exec --feats $feat_config $aug_opt \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --min-chunk-length $min_chunk --max-chunk-length $max_chunk \ - --iters-per-epoch $ipe \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --embed-dim $embed_dim $nnet_opt $opt_opt $lrs_opt \ - --epochs $nnet_num_epochs \ - --cos-scale $s --margin $margin --margin-warmup-epochs $margin_warmup \ - --dropout-rate $dropout \ + train_xvector_from_wav.py $nnet_type \ + --cfg $nnet_base_cfg $nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir \ --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $nnet_dir $args fi - -exit +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + mkdir -p $ft_nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $ft_nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_xvector_from_wav.py $nnet_type \ + --cfg $ft_nnet_base_cfg $ft_nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --in-model-file $nnet \ + --trainer.exp-path $ft_nnet_dir \ + --num-gpus $ngpu \ + +fi 
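Both training stages above follow the same pattern: a base YAML passed via `--cfg` plus dotted command-line overrides such as `--data.train.dataset.recordings-file`, replacing the long per-architecture flag lists of the deleted scripts. The snippet below is a minimal, hypothetical sketch of how dotted overrides can be merged onto a nested YAML config; it is not Hyperion's actual argument parser, and `merge_override`/`load_config` are names invented for this illustration.

```python
# Hypothetical sketch of the dotted-override pattern used by the new
# recipe CLIs (e.g. --data.train.dataset.recordings-file overriding a
# field of the YAML passed with --cfg). This is NOT Hyperion's parser.
import sys
import yaml


def merge_override(cfg, dotted_key, value):
    """Set cfg["data"]["train"]...[leaf] = value for a key like data.train.leaf."""
    keys = dotted_key.split(".")
    node = cfg
    for k in keys[:-1]:
        node = node.setdefault(k, {})
    node[keys[-1]] = value


def load_config(argv):
    cfg, overrides, i = {}, [], 0
    while i + 1 < len(argv):
        if argv[i] == "--cfg":
            with open(argv[i + 1]) as f:
                cfg = yaml.safe_load(f)
        elif argv[i].startswith("--"):
            # CLI options use dashes; config keys use underscores
            overrides.append((argv[i][2:].replace("-", "_"), argv[i + 1]))
        i += 2
    for key, value in overrides:
        merge_override(cfg, key, value)
    return cfg


if __name__ == "__main__":
    # e.g. python sketch.py --cfg train.yaml \
    #        --data.train.dataset.recordings-file data/wav.scp
    print(yaml.safe_dump(load_config(sys.argv[1:])))
```

Because overrides are applied after the YAML is loaded, the stage-1 and stage-2 invocations can share the same dataset file arguments while differing only in their base configs.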
diff --git a/egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh b/egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh deleted file mode 100755 index 58a3fdc9..00000000 --- a/egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# Copyright -# 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -ngpu=4 -config_file=default_config.sh -resume=false -interactive=false -num_workers=3 - -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -batch_size=$(($ft_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $ft_eff_batch_size $ft_min_chunk $ft_max_chunk | awk '{ print int($2/($1*$4*2/($3+$4))+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${nnet_data}_proc_audio_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi - -if [ "$interactive" == "true" ];then - export cuda_cmd=run.pl -fi - -# Network Training -if [ $stage -le 1 ]; then - mkdir -p $ft_nnet_dir/log - $cuda_cmd --gpu $ngpu $ft_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-finetune-xvec-from-wav.py --feats $feat_config $aug_opt \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --min-chunk-length $ft_min_chunk --max-chunk-length $ft_max_chunk \ - --iters-per-epoch $ft_ipe \ - --batch-size $batch_size \ - --num-workers $num_workers $ft_opt_opt $ft_lrs_opt \ - --grad-acc-steps $grad_acc_steps \ - --epochs $ft_nnet_num_epochs \ - --cos-scale $s --margin $ft_margin --margin-warmup-epochs $ft_margin_warmup \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --in-model-path $nnet \ - --train-mode ft-full \ - --exp-path $ft_nnet_dir $args - -fi - - diff --git a/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh b/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh index a55761ae..92cbd887 100755 --- a/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh +++ b/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh @@ -153,7 +153,7 @@ fi if [ $stage -le 4 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 diff --git a/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh b/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh index f8eae0a1..6890eba9 100755 --- a/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh +++ b/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh @@ -187,7 +187,7 @@ fi if [ $stage -le 4 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 
local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 @@ -311,7 +311,7 @@ fi if [ $stage -le 7 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 diff --git a/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh b/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh index 263d7bbe..35afbb27 100755 --- a/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh +++ b/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh @@ -185,7 +185,7 @@ fi if [ $stage -le 4 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 diff --git a/egs/sre21-av-v/v0.1/local/score_dcf.py b/egs/sre21-av-v/v0.1/local/score_dcf.py index 514ebf51..772d107a 100755 --- a/egs/sre21-av-v/v0.1/local/score_dcf.py +++ b/egs/sre21-av-v/v0.1/local/score_dcf.py @@ -19,7 +19,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/sre21-av/v1/local/score_sre21.py b/egs/sre21-av/v1/local/score_sre21.py index 986aa3f6..72fc1a13 100755 --- a/egs/sre21-av/v1/local/score_sre21.py +++ b/egs/sre21-av/v1/local/score_sre21.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score(key_file, score_file, sre21_subset, output_file): diff --git a/egs/voices_challenge/v0/steps_be/eval-be-snorm-v1.py b/egs/voices_challenge/v0/steps_be/eval-be-snorm-v1.py index 78231ba1..b280ab0e 100755 --- a/egs/voices_challenge/v0/steps_be/eval-be-snorm-v1.py +++ b/egs/voices_challenge/v0/steps_be/eval-be-snorm-v1.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/voices_challenge/v0/steps_be/eval-be-v1.py b/egs/voices_challenge/v0/steps_be/eval-be-v1.py index dc3e3f87..19d582e4 100755 --- 
a/egs/voices_challenge/v0/steps_be/eval-be-v1.py +++ b/egs/voices_challenge/v0/steps_be/eval-be-v1.py @@ -19,7 +19,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/voices_challenge/v0/steps_be/eval-calibration-v1.py b/egs/voices_challenge/v0/steps_be/eval-calibration-v1.py index fa16dfce..31b527f7 100755 --- a/egs/voices_challenge/v0/steps_be/eval-calibration-v1.py +++ b/egs/voices_challenge/v0/steps_be/eval-calibration-v1.py @@ -19,8 +19,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): diff --git a/egs/voices_challenge/v0/steps_be/train-be-v1.py b/egs/voices_challenge/v0/steps_be/train-be-v1.py index 44f93a57..ed1b5f09 100755 --- a/egs/voices_challenge/v0/steps_be/train-be-v1.py +++ b/egs/voices_challenge/v0/steps_be/train-be-v1.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F diff --git a/egs/voices_challenge/v0/steps_be/train-be-v2.py b/egs/voices_challenge/v0/steps_be/train-be-v2.py index cd4d4470..fbb961b2 100755 --- a/egs/voices_challenge/v0/steps_be/train-be-v2.py +++ b/egs/voices_challenge/v0/steps_be/train-be-v2.py @@ -18,7 +18,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info diff --git a/egs/voices_challenge/v0/steps_be/train-calibration-v1.py b/egs/voices_challenge/v0/steps_be/train-calibration-v1.py index fa1dfcf7..9eaa7187 100755 --- a/egs/voices_challenge/v0/steps_be/train-calibration-v1.py +++ b/egs/voices_challenge/v0/steps_be/train-calibration-v1.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, verbose): diff --git a/egs/voices_challenge/v1/steps_be/eval-be-snorm-v1.py b/egs/voices_challenge/v1/steps_be/eval-be-snorm-v1.py index 78231ba1..b280ab0e 100755 --- a/egs/voices_challenge/v1/steps_be/eval-be-snorm-v1.py +++ b/egs/voices_challenge/v1/steps_be/eval-be-snorm-v1.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers 
import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/voices_challenge/v1/steps_be/eval-be-v1.py b/egs/voices_challenge/v1/steps_be/eval-be-v1.py index dc3e3f87..19d582e4 100755 --- a/egs/voices_challenge/v1/steps_be/eval-be-v1.py +++ b/egs/voices_challenge/v1/steps_be/eval-be-v1.py @@ -19,7 +19,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/voices_challenge/v1/steps_be/eval-calibration-v1.py b/egs/voices_challenge/v1/steps_be/eval-calibration-v1.py index fa16dfce..31b527f7 100755 --- a/egs/voices_challenge/v1/steps_be/eval-calibration-v1.py +++ b/egs/voices_challenge/v1/steps_be/eval-calibration-v1.py @@ -19,8 +19,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): diff --git a/egs/voices_challenge/v1/steps_be/train-be-v1.py b/egs/voices_challenge/v1/steps_be/train-be-v1.py index 44f93a57..ed1b5f09 100755 --- a/egs/voices_challenge/v1/steps_be/train-be-v1.py +++ b/egs/voices_challenge/v1/steps_be/train-be-v1.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F diff --git a/egs/voices_challenge/v1/steps_be/train-be-v2.py b/egs/voices_challenge/v1/steps_be/train-be-v2.py index 36fbc341..fda28dc7 100755 --- a/egs/voices_challenge/v1/steps_be/train-be-v2.py +++ b/egs/voices_challenge/v1/steps_be/train-be-v2.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info diff --git a/egs/voices_challenge/v1/steps_be/train-calibration-v1.py b/egs/voices_challenge/v1/steps_be/train-calibration-v1.py index fa1dfcf7..9eaa7187 100755 --- a/egs/voices_challenge/v1/steps_be/train-calibration-v1.py +++ b/egs/voices_challenge/v1/steps_be/train-calibration-v1.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers 
import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, verbose): diff --git a/egs/voxceleb/adv.v1.1/README.md b/egs/voxceleb/adv.v1.1/README.md index bccc494b..49801544 100644 --- a/egs/voxceleb/adv.v1.1/README.md +++ b/egs/voxceleb/adv.v1.1/README.md @@ -1,25 +1,15 @@ # VoxCeleb Adversarial Attacks Version 1.1 -Last update 2021/05/17 +Last update 2023/04/10 Recipe to evaluate Adversarial Attacks on x-Vector Speaker Verification Systems -## Differences w.r.t VoxCeleb adv.v1 recipe - -In recipe version V1: - - We compute speech augmentations and acoustic features offline and dump them to disk. - - Augmentation is performed using Kaldi scripts and wav-reverbate tool - - Babble noise is created on-the-fly when computing features by mixing 3-7 single speaker files. - -In this recipe: - - We compute speech augmentations and acoustic features are computed always on-the-fly, - we don't dump any features to disk. - - Augmentation is performed using Hyperin SpeechAugment class. - - The behavior of this class is controlled - by the the configuration file `conf/reverb_noise_aug.yml`, - which mimics the proportions of noise and RIR types, and SNRs used in the V1 or the recipe. - - Babble noise is created offline by mixing 3-10 single speaker files. +## Setup +To run attacks with the Adversarial Robustness Toolbox (ART), install it in the environment: +``` +pip install adversarial-robustness-toolbox[pytorch] +``` ## Threat Model @@ -92,48 +82,45 @@ run_0*.sh --config-file global_conf/config_victim_resnet34_transfer_lresnet.v1.s - `run_002_compute_evad.sh` - Computes Energy VAD for all datasets - - `run_002b_compute_fbank.sh` - - Computes log-filter-banks acoustic features for all datasets - - `run_003_prepare_noises_rirs.sh` - Prepares MUSAN noises, music to be used by SpeechAugment class. - Creates Babble noise from MUSAN speech to be used by SpeechAugment class. - Prepares RIRs by compacting them into HDF5 files, to be used by SpeechAugment class. - - `run_010_prepare_xvec_train_data.sh` + - `run_004_prepare_victim_xvec_train_data.sh` - Prepares audio to train the victim x-vector model - Transforms all the audios that we are going to use to train the x-vector into a common format, e.g., .flac. - Removes silence from the audios - Removes utterances shorter than 4 secs and speakers with fewer than 8 utterances.
- Creates training and validation lists for x-vector training - - `run_011_train_victim_xvector.sh` + - `run_005_train_victim_xvector.sh` - Trains the victim x-vector network - - `run_012_prepare_transfer_xvec_train_data.sh` + - `run_006_prepare_transfer_xvec_train_data.sh` - Prepares audio to train the transfer white-box x-vector model - If training data for victim and transfer models is the same, it does nothing - - `run_013_train_transfer_xvector.sh` + - `run_007_train_transfer_xvector.sh` - Trains the transfer white-box x-vector network - - `run_030_extract_xvectors_victim_model.sh` + - `run_009_extract_xvectors_victim_model.sh` - Extracts x-vectors for VoxCeleb1 test set using the victim model - - `run_031_extract_xvectors_transfer_model.sh` + - `run_010_extract_xvectors_transfer_model.sh` - Extracts x-vectors for VoxCeleb1 test set using the transfer model - - `run_040_eval_be_victim_model.sh` + - `run_011_eval_be_victim_model.sh` - Eval cosine scoring back-end without attack on victim model x-vectors - Trains calibration for the victim model scores - Results are left in `exp/scores/$nnet_name/cosine/voxceleb1_o_clean_results` - - `run_041_eval_be_tranfer_model.sh` + - `run_012_eval_be_tranfer_model.sh` - Eval cosine scoring back-end without attack on transfer model x-vectors - Trains calibration for the transfer model scores - Results are left in `exp/scores/$transfer_nnet_name/cosine/voxceleb1_o_clean_results` - - `run_043_eval_whitebox_attacks.sh` + - `run_013_eval_whitebox_attacks.sh` - Eval white box attacks implemented in Hyperion toolkit: FGSM, Iter-FGSM, PGD, Carlini-Wagner - Results are left in `exp/scores/$nnet_name/cosine_${attack_related_label}/voxceleb1_o_clean_results` - When using option `--do-analysis true` it calculates curves: SNR vs EER, SNR vs actual DCF, Linf vs EER, Linf vs actual DCF @@ -141,7 +128,7 @@ run_0*.sh --config-file global_conf/config_victim_resnet34_transfer_lresnet.v1.s - When using `--save-wav true`, it writes adversarial wavs of successful attacks to disk - Wavs are saved to `exp/scores/$nnet_name/cosine_${attack_related_label}/wav` - - `run_044_eval_transfer_blackbox_attacks.sh` + - `run_014_eval_transfer_blackbox_attacks.sh` - Eval transfer black box attacks implemented in Hyperion toolkit: FGSM, Iter-FGSM, PGD, Carlini-Wagner - Results are left in `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_${attack_related_label}/voxceleb1_o_clean_results` - When using option `--do-analysis true` it calculates curves: SNR vs EER, SNR vs actual DCF, Linf vs EER, Linf vs actual DCF @@ -149,11 +136,11 @@ run_0*.sh --config-file global_conf/config_victim_resnet34_transfer_lresnet.v1.s - When using `--save-wav true`, it writes adversarial wavs of successful attacks to disk - Wavs are saved to `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_${attack_related_label}/wav` - - `run_045_eval_whitebox_attacks_with_randsmooth_defense.sh` + - `run_015_eval_whitebox_attacks_with_randsmooth_defense.sh` - Eval white box attacks with Gaussian randomized smoothing defense.
- Results are left in `exp/scores/$nnet_name/cosine_${attack_related_label}_randsmooth${smooth_sigma}/voxceleb1_o_clean_results` - - `run_053_eval_art_whitebox_attacks.sh` + - `run_017_eval_art_whitebox_attacks.sh` - Eval white box attacks implemented in IBM's Adversarial Robustness Toolbox (ART): FGSM, Iter-FGSM, PGD, Carlini-Wagner - Results are left in `exp/scores/$nnet_name/cosine_art_${attack_related_label}/voxceleb1_o_clean_results` - When using option `--do-analysis true` it calculates curves: SNR vs EER, SNR vs actual DCF, Linf vs EER, Linf vs actual DCF @@ -161,7 +148,7 @@ run_0*.sh --config-file global_conf/config_victim_resnet34_transfer_lresnet.v1.s - When using `--save-wav true`, it writes adversarial wavs of successful attacks to disk - Wavs are saved to `exp/scores/$nnet_name/cosine_art_${attack_related_label}/wav` - - `run_054_eval_art_transfer_blackbox_attacks.sh` + - `run_018_eval_art_transfer_blackbox_attacks.sh` - Eval transfer black box attacks implemented in IBM's Adversarial Robustness Toolbox (ART): FGSM, Iter-FGSM, PGD, Carlini-Wagner - Results are left in `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_art_${attack_related_label}/voxceleb1_o_clean_results` - When using option `--do-analysis true` it calculates curves: SNR vs EER, SNR vs actual DCF, Linf vs EER, Linf vs actual DCF diff --git a/egs/voxceleb/adv.v1.1/conf b/egs/voxceleb/adv.v1.1/conf deleted file mode 120000 index 7dfe9dce..00000000 --- a/egs/voxceleb/adv.v1.1/conf +++ /dev/null @@ -1 +0,0 @@ -../../sre19-cmn2/v1/conf \ No newline at end of file diff --git a/egs/voxceleb/adv.v1.1/conf/advft_resnet34_xvec.yaml b/egs/voxceleb/adv.v1.1/conf/advft_resnet34_xvec.yaml new file mode 100644 index 00000000..fd9c95e1 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/advft_resnet34_xvec.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 +attack: + attack_type: pgd + max_iters: 10 + eps: 0.004 + alpha: 0.0008 + random_eps: true + p_attack: 0.5 +trainer: + optim: + opt_type: adam + lr: 0.05 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: lrsched_exp_default.yaml + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 8000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 1000 + use_amp: true + log_interval: 1000 + epochs: 20 + eff_batch_size: 512 diff --git a/egs/voxceleb/adv.v1.1/conf/clsp.conf b/egs/voxceleb/adv.v1.1/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add
anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/voxceleb/adv.v1.1/conf/coe_gpu_long.conf b/egs/voxceleb/adv.v1.1/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/voxceleb/adv.v1.1/conf/coe_gpu_rtx.conf b/egs/voxceleb/adv.v1.1/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/voxceleb/adv.v1.1/conf/coe_gpu_short.conf b/egs/voxceleb/adv.v1.1/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/voxceleb/adv.v1.1/conf/fbank80_stmn_16k.yaml b/egs/voxceleb/adv.v1.1/conf/fbank80_stmn_16k.yaml new file mode 100644 index 00000000..f4091f5d --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/fbank80_stmn_16k.yaml @@ -0,0 +1,12 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 16000 + frame_length: 25 + low_freq: 20 + high_freq: 7600 + num_filters: 80 + snip_edges: false + use_energy: false +mvn: + context: 150 + norm_var: false diff --git a/egs/voxceleb/adv.v1.1/conf/reverb_noise_aug.yaml b/egs/voxceleb/adv.v1.1/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + 
min_snr: 3 + max_snr: 18 + diff --git a/egs/voxceleb/adv.v1.1/conf/train_lresnet34_xvec.yaml b/egs/voxceleb/adv.v1.1/conf/train_lresnet34_xvec.yaml new file mode 100644 index 00000000..609f6829 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/train_lresnet34_xvec.yaml @@ -0,0 +1,68 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.05 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: lrsched_exp_default.yaml + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 1000 + use_amp: true + log_interval: 1000 + epochs: 70 + eff_batch_size: 512 diff --git a/egs/voxceleb/adv.v1.1/conf/train_resetdnn_xvec.yaml b/egs/voxceleb/adv.v1.1/conf/train_resetdnn_xvec.yaml new file mode 100644 index 00000000..c379ee76 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/train_resetdnn_xvec.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + tdnn_type: resetdnn + in_feats: 80 + num_enc_blocks: 5 + enc_hid_units: 512 + enc_expand_units: 1536 + kernel_size: + - 5 + - 3 + - 3 + - 3 + - 1 + dilation: + - 1 + - 2 + - 3 + - 4 + - 1 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.1 +trainer: + optim: + opt_type: adam + lr: 0.05 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: lrsched_exp_default.yaml + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 1000 + use_amp: true + log_interval: 1000 + epochs: 70 + eff_batch_size: 512 diff --git a/egs/voxceleb/adv.v1.1/conf/train_resnet34_xvec.yaml b/egs/voxceleb/adv.v1.1/conf/train_resnet34_xvec.yaml new file mode 100644 index 00000000..73ddcb68 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/train_resnet34_xvec.yaml @@ -0,0 +1,68 @@ +data: + train: + dataset: + class_names: + - 
class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: resnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.05 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: lrsched_exp_default.yaml + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 1000 + use_amp: true + log_interval: 1000 + epochs: 70 + eff_batch_size: 512 diff --git a/egs/voxceleb/adv.v1.1/conf/vad_16k.yaml b/egs/voxceleb/adv.v1.1/conf/vad_16k.yaml new file mode 100644 index 00000000..5fb0111c --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/vad_16k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 16000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1.sh b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1.sh index d102a77a..b569604d 100644 --- a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1.sh +++ b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1.sh @@ -3,93 +3,41 @@ # Both models use the same features: 80 fbanks # Both models use the same training data.
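The `global_conf` hunks that follow all apply the same refactor: the per-hyperparameter shell variables and long `*_opt` flag strings are dropped, and each network is now fully described by `nnet_type` plus a `nnet_cfg` YAML file like the ones added above. As a rough sketch of how a numbered run script consumes the new variables (the real invocation appears in `run_005_train_victim_xvector.sh` near the end of this patch; the sourced path here is just one of the configs):

```bash
# Sketch only: source a global config, then hand the YAML to the trainer.
. global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1.sh
# nnet_type selects the model family; nnet_cfg now carries everything that
# used to be encoded in the nnet_opt/opt_opt/lrs_opt flag strings.
train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg --trainer.exp-path $nnet_dir
```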
-# victim acoustic features +# acoustic features feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +#vad +vad_config=conf/vad_16k.yaml + # victim x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=4 -max_chunk=4 -ipe=$nnet_num_augs -lr=0.05 - -nnet_type=lresnet34 -dropout=0 -embed_dim=256 -s=30 -margin_warmup=20 -margin=0.3 +# victim x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_lresnet34 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 +nnet_cfg=conf/train_lresnet34_xvec.yaml nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - -# transfer model acoustic features +# transfer feature extractor transfer_feat_config=$feat_config transfer_feat_type=$feat_type # transfer model training -transfer_nnet_data=voxceleb2cat_train #this can be voxceleb2cat or voxceleb1cat -transfer_nnet_num_augs=6 -transfer_aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -transfer_batch_size_1gpu=128 -transfer_eff_batch_size=512 # effective batch size -transfer_min_chunk=4 -transfer_max_chunk=4 -transfer_ipe=$transfer_nnet_num_augs -transfer_lr=0.05 +transfer_nnet_data=voxceleb2cat_train transfer_nnet_type=resetdnn -transfer_num_layers=5 -transfer_layer_dim=512 -transfer_expand_dim=1536 -transfer_dilation="1 2 3 4 1" -transfer_kernel_sizes="5 3 3 3 1" -transfer_dropout=0.1 -transfer_embed_dim=256 - -transfer_s=30 -transfer_margin_warmup=20 -transfer_margin=0.3 - -transfer_nnet_opt="--tdnn-type $transfer_nnet_type --in-feats 80 --num-enc-blocks $transfer_num_layers --enc-hid-units $transfer_layer_dim --enc-expand-units $transfer_expand_dim --kernel-size $transfer_kernel_sizes --dilation $transfer_dilation" -transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -transfer_nnet_name=${transfer_feat_type}_${transfer_nnet_type}_nl${transfer_num_layers}ld${transfer_layer_dim}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1 -transfer_nnet_num_epochs=70 - +transfer_nnet_name=${transfer_feat_type}_resetdnn5x512 +transfer_nnet_cfg=conf/train_resetdnn_xvec.yaml transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name transfer_nnet=$transfer_nnet_dir/model_ep0070.pth +# adversarial finetuning +advft_nnet_name=${nnet_name}_advft +advft_nnet_cfg=conf/advft_lresnet34_xvec.yaml +advft_nnet_dir=exp/xvector_nnets/$advft_nnet_name +advft_nnet=$advft_nnet_dir/model_ep0070.pth -# options for adversarial finetuning of the victim model 
-advft_batch_size_1gpu=32 -advft_eff_batch_size=128 # effective batch size -advft_margin=0.3 -advft_margin_warmup=20 -advft_nnet_num_epochs=20 -advft_eps=0.004 -advft_eps_step=$(echo $advft_eps | awk '{ print $1/5}') -advft_p=0.5 -advft_lr=0.05 -advft_iters=10 -advft_attack_opts="--attack.attack-type pgd --attack.max-iter $advft_iters --attack.eps $advft_eps --attack.alpha $advft_eps_step --attack.random-eps --p-attack $advft_p" -advft_opt_opt="--optim.opt-type adam --optim.lr $advft_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -advft_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 8000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -advft_nnet_name=$nnet_name.advft_p${advft_p}_pgd${advft_iters}e${advft_eps}step${advft_eps_step}_arcm${advft_margin}wup${advft_margin_warmup}_optv1_adam_lr${advft_lr} -advft_nnet_dir=exp/xvector_nnets/$advft_nnet_name -advft_nnet=$advft_nnet_dir/model_ep0020.pth diff --git a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1_wavegan_defense.sh b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1_wavegan_defense.sh index 09d6b993..8105df2c 100644 --- a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1_wavegan_defense.sh +++ b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1_wavegan_defense.sh @@ -3,96 +3,42 @@ # Both models use the same features: 80 fbanks # Both models use the same training data. -# victim acoustic features +# acoustic features feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +#vad +vad_config=conf/vad_16k.yaml + # victim x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=4 -max_chunk=4 -ipe=$nnet_num_augs -lr=0.05 - -nnet_type=lresnet34 -dropout=0 -embed_dim=256 -s=30 -margin_warmup=20 -margin=0.3 +# victim x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_lresnet34 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 +nnet_cfg=conf/train_lresnet34_xvec.yaml nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - -# transfer model acoustic features +# transfer feature extractor transfer_feat_config=$feat_config transfer_feat_type=$feat_type # transfer model training -transfer_nnet_data=voxceleb2cat_train #this can be voxceleb2cat or voxceleb1cat -transfer_nnet_num_augs=6 -transfer_aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -transfer_batch_size_1gpu=128 -transfer_eff_batch_size=512 # effective batch size -transfer_min_chunk=4 -transfer_max_chunk=4 -transfer_ipe=$transfer_nnet_num_augs -transfer_lr=0.05 +transfer_nnet_data=voxceleb2cat_train
transfer_nnet_type=resetdnn -transfer_num_layers=5 -transfer_layer_dim=512 -transfer_expand_dim=1536 -transfer_dilation="1 2 3 4 1" -transfer_kernel_sizes="5 3 3 3 1" -transfer_dropout=0.1 -transfer_embed_dim=256 - -transfer_s=30 -transfer_margin_warmup=20 -transfer_margin=0.3 - -transfer_nnet_opt="--tdnn-type $transfer_nnet_type --in-feats 80 --num-enc-blocks $transfer_num_layers --enc-hid-units $transfer_layer_dim --enc-expand-units $transfer_expand_dim --kernel-size $transfer_kernel_sizes --dilation $transfer_dilation" -transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -transfer_nnet_name=${transfer_feat_type}_${transfer_nnet_type}_nl${transfer_num_layers}ld${transfer_layer_dim}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1 -transfer_nnet_num_epochs=70 - +transfer_nnet_name=${transfer_feat_type}_resetdnn5x512 +transfer_nnet_cfg=conf/train_resetdnn_xvec.yaml transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name transfer_nnet=$transfer_nnet_dir/model_ep0070.pth - -# options for adversarial finetuning of the victim model -advft_batch_size_1gpu=32 -advft_eff_batch_size=128 # effective batch size -advft_margin=0.3 -advft_margin_warmup=20 -advft_nnet_num_epochs=20 -advft_eps=0.004 -advft_eps_step=$(echo $advft_eps | awk '{ print $1/5}') -advft_p=0.5 -advft_lr=0.05 -advft_iters=10 -advft_attack_opts="--attack.attack-type pgd --attack.max-iter $advft_iters --attack.eps $advft_eps --attack.alpha $advft_eps_step --attack.random-eps --p-attack $advft_p" -advft_opt_opt="--optim.opt-type adam --optim.lr $advft_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -advft_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 8000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -advft_nnet_name=$nnet_name.advft_p${advft_p}_pgd${advft_iters}e${advft_eps}step${advft_eps_step}_arcm${advft_margin}wup${advft_margin_warmup}_optv1_adam_lr${advft_lr} +# adversarial finetuning +advft_nnet_name=${nnet_name}_advft +advft_nnet_cfg=conf/advft_lresnet34_xvec.yaml advft_nnet_dir=exp/xvector_nnets/$advft_nnet_name -advft_nnet=$advft_nnet_dir/model_ep0020.pth +advft_nnet=$advft_nnet_dir/model_ep0070.pth # WaveGAN configs smoothing_after_wavegan=true diff --git a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_fbank40_stmn_resetdnn.v1.sh b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_fbank40_stmn_resetdnn.v1.sh index 54e47a29..3e7739d0 100644 --- a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_fbank40_stmn_resetdnn.v1.sh +++ b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_fbank40_stmn_resetdnn.v1.sh @@ -3,7 +3,7 @@ # Both models use the same features: 80 fbanks # Both models use the same training data.
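Note that the removed `advft_*` variables and the new `conf/advft_*.yaml` files encode the same PGD adversarial-finetuning setup; only the packaging changed. A side-by-side sketch, with all values taken from the hunks in this patch (the awk division is how the old configs derived the step size from eps):

```bash
# Old style (removed): attack described by flag strings.
advft_eps=0.004
advft_eps_step=$(echo $advft_eps | awk '{ print $1/5}')   # 0.0008
advft_attack_opts="--attack.attack-type pgd --attack.max-iter 10 \
    --attack.eps $advft_eps --attack.alpha $advft_eps_step \
    --attack.random-eps --p-attack 0.5"
# New style: the same values appear literally in the attack: section of
# conf/advft_resnet34_xvec.yaml (attack_type: pgd, max_iters: 10,
# eps: 0.004, alpha: 0.0008, random_eps: true, p_attack: 0.5).
```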
-# victim acoustic features +# acoustic features feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn @@ -12,70 +12,32 @@ vad_config=conf/vad_16k.yaml # victim x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=4 -max_chunk=4 -ipe=1 -lr=0.05 +# victim x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_resnet34 -nnet_type=resnet34 -dropout=0 -embed_dim=256 - -s=30 -margin_warmup=20 -margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 -num_augs=5 +nnet_cfg=conf/train_resnet34_xvec.yaml nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - -# transfer model acoustic features +# transfer feature extractor transfer_feat_config=conf/fbank40_stmn_16k.yaml transfer_feat_type=fbank40_stmn # transfer model training -transfer_nnet_data=voxceleb2cat_train #this can be voxceleb2cat or voxceleb1cat -transfer_nnet_num_augs=6 -transfer_aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -transfer_batch_size_1gpu=128 -transfer_eff_batch_size=512 # effective batch size -transfer_min_chunk=4 -transfer_max_chunk=4 -transfer_ipe=$transfer_nnet_num_augs -transfer_lr=0.05 +transfer_nnet_data=voxceleb2cat_train transfer_nnet_type=resetdnn -transfer_num_layers=5 -transfer_layer_dim=512 -transfer_expand_dim=1536 -transfer_dilation="1 2 3 4 1" -transfer_kernel_sizes="5 3 3 3 1" -transfer_dropout=0.1 -transfer_embed_dim=256 - -transfer_s=30 -transfer_margin_warmup=20 -transfer_margin=0.3 - -transfer_nnet_opt="--tdnn-type $transfer_nnet_type --in-feats 40 --num-enc-blocks $transfer_num_layers --enc-hid-units $transfer_layer_dim --enc-expand-units $transfer_expand_dim --kernel-size $transfer_kernel_sizes --dilation $transfer_dilation" -transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -transfer_nnet_name=${transfer_feat_type}_${transfer_nnet_type}_nl${transfer_num_layers}ld${transfer_layer_dim}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1 -transfer_nnet_num_epochs=70 - +transfer_nnet_name=${transfer_feat_type}_resetdnn5x512 +transfer_nnet_cfg=conf/train_resetdnn_xvec.yaml transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name transfer_nnet=$transfer_nnet_dir/model_ep0070.pth +# adversarial finetuning +advft_nnet_name=${nnet_name}_advft +advft_nnet_cfg=conf/advft_resnet34_xvec.yaml +advft_nnet_dir=exp/xvector_nnets/$advft_nnet_name +advft_nnet=$advft_nnet_dir/model_ep0070.pth + diff --git 
a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_lresnet.v1.sh b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_lresnet.v1.sh index 777b8b5d..00dfd4ff 100644 --- a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_lresnet.v1.sh +++ b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_lresnet.v1.sh @@ -3,70 +3,39 @@ # Both models use the same features: 80 fbanks # Both models use the same training data. -# victim acoustic features +# Victim model ResNet34 x-vector +# For the black-box attacks we use Residual E-TDNN to generate the attack and transfer them to the ResNet34 +# Both models use the same features: 80 fbanks +# Both models use the same training data. + +# acoustic features feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn -# victim x-vector training -nnet_data=voxceleb2cat -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=4 -max_chunk=4 -ipe=$nnet_num_augs -lr=0.05 +#vad +vad_config=conf/vad_16k.yaml -nnet_type=resnet34 -dropout=0 -embed_dim=256 +# victim x-vector training +nnet_data=voxceleb2cat_train -s=30 -margin_warmup=20 -margin=0.3 +# victim x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_resnet34 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 -num_augs=5 +nnet_cfg=conf/train_resnet34_xvec.yaml nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - -# transfer model acoustic features +# transfer feature extractor transfer_feat_config=$feat_config transfer_feat_type=$feat_type # transfer model training -transfer_nnet_data=voxceleb2cat #this can be voxceleb2cat or voxceleb1cat -transfer_nnet_num_augs=6 -transfer_aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -transfer_batch_size_1gpu=128 -transfer_eff_batch_size=512 # effective batch size -transfer_min_chunk=4 -transfer_max_chunk=4 -transfer_ipe=$transfer_nnet_num_augs -transfer_lr=0.05 - -transfer_nnet_type=lresnet34 -transfer_dropout=0 -transfer_embed_dim=256 - -transfer_s=30 -transfer_margin_warmup=20 -transfer_margin=0.3 - -transfer_nnet_opt="--resnet-type $transfer_nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" -transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -transfer_nnet_name=${transfer_feat_type}_${transfer_nnet_type}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1 -transfer_nnet_num_epochs=70
+transfer_nnet_data=voxceleb2cat_train +transfer_nnet_type=resnet +transfer_nnet_name=${transfer_feat_type}_lresnet34 +transfer_nnet_cfg=conf/train_lresnet34_xvec.yaml transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name transfer_nnet=$transfer_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_resetdnn.v1.sh b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_resetdnn.v1.sh index 482f3b7b..6570f4a2 100644 --- a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_resetdnn.v1.sh +++ b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_resetdnn.v1.sh @@ -3,75 +3,41 @@ # Both models use the same features: 80 fbanks # Both models use the same training data. -# victim acoustic features +# acoustic features feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +#vad +vad_config=conf/vad_16k.yaml + # victim x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=4 -max_chunk=4 -ipe=$nnet_num_augs -lr=0.05 - -nnet_type=resnet34 -dropout=0 -embed_dim=256 -s=30 -margin_warmup=20 -margin=0.3 +# victim x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_resnet34 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 +nnet_cfg=conf/train_resnet34_xvec.yaml nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - -# transfer model acoustic features +# transfer feature extractor transfer_feat_config=$feat_config transfer_feat_type=$feat_type # transfer model training -transfer_nnet_data=voxceleb2cat_train #this can be voxceleb2cat or voxceleb1cat -transfer_nnet_num_augs=6 -transfer_aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -transfer_batch_size_1gpu=128 -transfer_eff_batch_size=512 # effective batch size -transfer_min_chunk=4 -transfer_max_chunk=4 -transfer_ipe=$transfer_nnet_num_augs -transfer_lr=0.05 +transfer_nnet_data=voxceleb2cat_train transfer_nnet_type=resetdnn -transfer_num_layers=5 -transfer_layer_dim=512 -transfer_expand_dim=1536 -transfer_dilation="1 2 3 4 1" -transfer_kernel_sizes="5 3 3 3 1" -transfer_dropout=0.1 -transfer_embed_dim=256 - -transfer_s=30 -transfer_margin_warmup=20 -transfer_margin=0.3 - -transfer_nnet_opt="--tdnn-type $transfer_nnet_type --in-feats 80 --num-enc-blocks $transfer_num_layers --enc-hid-units $transfer_layer_dim --enc-expand-units $transfer_expand_dim --kernel-size $transfer_kernel_sizes --dilation $transfer_dilation" -transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5
--lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -transfer_nnet_name=${transfer_feat_type}_${transfer_nnet_type}_nl${transfer_num_layers}ld${transfer_layer_dim}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1 -transfer_nnet_num_epochs=70 - +transfer_nnet_name=${transfer_feat_type}_resetdnn5x512 +transfer_nnet_cfg=conf/train_resetdnn_xvec.yaml transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name transfer_nnet=$transfer_nnet_dir/model_ep0070.pth +# adversarial finetuning +advft_nnet_name=${nnet_name}_advft +advft_nnet_cfg=conf/advft_resnet34_xvec.yaml +advft_nnet_dir=exp/xvector_nnets/$advft_nnet_name +advft_nnet=$advft_nnet_dir/model_ep0070.pth + diff --git a/egs/voxceleb/adv.v1.1/local b/egs/voxceleb/adv.v1.1/local deleted file mode 120000 index ce1cbf90..00000000 --- a/egs/voxceleb/adv.v1.1/local +++ /dev/null @@ -1 +0,0 @@ -../v1/local \ No newline at end of file diff --git a/egs/voxceleb/adv.v1.1/local/attack_analysis.py b/egs/voxceleb/adv.v1.1/local/attack_analysis.py new file mode 100755 index 00000000..2e0fdb42 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/local/attack_analysis.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + + +import sys +import os +import argparse +import time +import logging + +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.np.metrics.verification_evaluator import ( + VerificationAdvAttackEvaluator as Eval, +) + + +def evaluate_attacks( + key_file, + clean_score_file, + attack_score_files, + attack_stats_files, + output_path, + prior, +): + + output_dir = os.path.dirname(output_path) + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + + evaluator = Eval( + key_file, clean_score_file, attack_score_files, attack_stats_files, prior + ) + + # performance vs SNR + logging.info("compute perf vs snr for all trials") + df_clean = evaluator.compute_dcf_eer(return_df=True) + df_clean.insert(0, "snr", np.inf) + + df = evaluator.compute_dcf_eer_vs_stats( + "snr", + [-10, 0, 10, 20, 30, 40, 50, 60], + "all", + higher_better=True, + return_df=True, + ) + file_path = "%s_attack_all_snr_results.csv" % (output_path) + df = pd.concat([df_clean, df], ignore_index=True) + df.to_csv(file_path) + file_path = "%s_attack_all_snr" % (output_path) + evaluator.plot_dcf_eer_vs_stat_v1( + df, "snr", file_path, clean_ref=0, xlabel="SNR(dB)", higher_better=True + ) + + logging.info("compute perf vs snr for tar trials") + df = evaluator.compute_dcf_eer_vs_stats( + "snr", + [-10, 0, 10, 20, 30, 40, 50, 60], + "tar", + higher_better=True, + return_df=True, + ) + file_path = "%s_attack_tar_snr_results.csv" % (output_path) + df = pd.concat([df_clean, df], ignore_index=True) + df.to_csv(file_path) + file_path = "%s_attack_tar_snr" % (output_path) + evaluator.plot_dcf_eer_vs_stat_v1( + df, "snr", file_path, clean_ref=0, xlabel="SNR(dB)", higher_better=True + ) + + logging.info("compute perf vs snr for non trials") + df = evaluator.compute_dcf_eer_vs_stats( + "snr", + [-10, 0, 10, 20, 30, 40, 50, 60], + "non", + higher_better=True, + return_df=True, + ) + file_path = "%s_attack_non_snr_results.csv" % (output_path) + df = pd.concat([df_clean, df], ignore_index=True) + df.to_csv(file_path) + file_path = "%s_attack_non_snr" % (output_path) + evaluator.plot_dcf_eer_vs_stat_v1( + df, "snr", 
file_path, clean_ref=0, xlabel="SNR(dB)", higher_better=True + ) + + logging.info("find best attacks from snr point of view") + for i in range(len(attack_score_files)): + file_path = "%s_best_snr_tar_attacks_%d.csv" % (output_path, i) + evaluator.save_best_attacks( + file_path, + "snr", + "tar", + num_best=10, + min_delta=1, + attack_idx=i, + higher_better=True, + ) + + file_path = "%s_best_snr_non_attacks_%d.csv" % (output_path, i) + evaluator.save_best_attacks( + file_path, + "snr", + "non", + num_best=10, + min_delta=1, + attack_idx=i, + higher_better=True, + ) + + # performance vs Linf + logging.info("compute perf vs linf for all trials") + eps = np.ceil(np.asarray([0, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]) * 2 ** 15) + df = evaluator.compute_dcf_eer_vs_stats( + "n_linf", eps, "all", higher_better=False, return_df=True + ) + file_path = "%s_attack_all_linf_results.csv" % (output_path) + df.to_csv(file_path) + file_path = "%s_attack_all_linf" % (output_path) + evaluator.plot_dcf_eer_vs_stat_v1( + df, "n_linf", file_path, clean_ref=0, xlabel=r"$L_{\infty}$", log_x=True + ) + + logging.info("compute perf vs linf for tar trials") + df = evaluator.compute_dcf_eer_vs_stats( + "n_linf", eps, "tar", higher_better=False, return_df=True + ) + file_path = "%s_attack_tar_linf_results.csv" % (output_path) + df.to_csv(file_path) + file_path = "%s_attack_tar_linf" % (output_path) + evaluator.plot_dcf_eer_vs_stat_v1( + df, "n_linf", file_path, clean_ref=0, xlabel=r"$L_{\infty}$", log_x=True + ) + + logging.info("compute perf vs linf for non trials") + df = evaluator.compute_dcf_eer_vs_stats( + "n_linf", eps, "non", higher_better=False, return_df=True + ) + file_path = "%s_attack_non_linf_results.csv" % (output_path) + df.to_csv(file_path) + file_path = "%s_attack_non_linf" % (output_path) + evaluator.plot_dcf_eer_vs_stat_v1( + df, "n_linf", file_path, clean_ref=0, xlabel=r"$L_{\infty}$", log_x=True + ) + + # find the best attacks in terms of linf + logging.info("find best attacks from linf point of view") + for i in range(len(attack_score_files)): + file_path = "%s_best_linf_tar_attacks_%d.csv" % (output_path, i) + evaluator.save_best_attacks( + file_path, + "n_linf", + "tar", + num_best=10, + min_delta=1, + attack_idx=i, + higher_better=False, + ) + + file_path = "%s_best_linf_non_attacks_%d.csv" % (output_path, i) + evaluator.save_best_attacks( + file_path, + "n_linf", + "non", + num_best=10, + min_delta=1, + attack_idx=i, + higher_better=False, + ) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Analyses performance of adversarial attacks for spk. 
verif.", + ) + + parser.add_argument("--key-file", required=True) + parser.add_argument("--clean-score-file", required=True) + parser.add_argument("--attack-score-files", required=True, nargs="+") + parser.add_argument("--attack-stats-files", required=True, nargs="+") + parser.add_argument("--output-path", required=True) + parser.add_argument("--prior", default=0.05, type=float) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + evaluate_attacks(**vars(args)) diff --git a/egs/voxceleb/v1/local/attack_analysis.sh b/egs/voxceleb/adv.v1.1/local/attack_analysis.sh similarity index 100% rename from egs/voxceleb/v1/local/attack_analysis.sh rename to egs/voxceleb/adv.v1.1/local/attack_analysis.sh diff --git a/egs/voxceleb/v1/local/calibrate_voxceleb1_o_clean.sh b/egs/voxceleb/adv.v1.1/local/calibrate_voxceleb1_o_clean.sh similarity index 100% rename from egs/voxceleb/v1/local/calibrate_voxceleb1_o_clean.sh rename to egs/voxceleb/adv.v1.1/local/calibrate_voxceleb1_o_clean.sh diff --git a/egs/voxceleb/adv.v1.1/local/make_musan.py b/egs/voxceleb/adv.v1.1/local/make_musan.py new file mode 100755 index 00000000..b0ae6846 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/local/make_musan.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +# Copyright 2015 David Snyder +# Copyright 2019 Johns Hopkins University (Jesus Villalba) (added fs support) +# Apache 2.0. +# +# This file is meant to be invoked by make_musan.sh. + +import os, sys + + +def process_music_annotations(path): + utt2spk = {} + utt2vocals = {} + lines = open(path, "r").readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musican ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals + + +def prepare_music(root_dir, fs, use_vocals): + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In music directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def prepare_speech(root_dir, fs): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str 
= "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In speech directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def prepare_noise(root_dir, fs): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In noise directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def main(): + in_dir = sys.argv[1] + fs = int(sys.argv[2]) + out_dir = sys.argv[3] + use_vocals = sys.argv[4] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), "w") + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), "w") + utt2spk_fi.write(utt2spk) + + +if __name__ == "__main__": + main() diff --git a/egs/voxceleb/adv.v1.1/local/make_musan.sh b/egs/voxceleb/adv.v1.1/local/make_musan.sh new file mode 100755 index 00000000..4a6d30f9 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/local/make_musan.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Copyright 2019 Johns Hopkins University (Jesus Villalba) (added fs support) +# Apache 2.0. +# +# This script, called by ../run.sh, creates the MUSAN +# data directory. The required dataset is freely available at +# http://www.openslr.org/17/ + +set -e +use_vocals='Y' + +. parse_options.sh || exit 1; + +if [ $# -ne 3 ];then + echo "Usage: $0 [options] "; + echo "e.g.: $0 /export/corpora/JHU/musan 8 data" + exit 1; +fi + +in_dir=$1 +fs=$2 +data_dir=$3 + +mkdir -p $data_dir/musan.tmp + +echo "Preparing ${data_dir}/musan..." 
+mkdir -p ${data_dir}/musan +local/make_musan.py ${in_dir} $fs ${data_dir}/musan ${use_vocals} + +utils/fix_data_dir.sh ${data_dir}/musan + +grep "music" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_music +grep "speech" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_speech +grep "noise" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_noise +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_music \ + ${data_dir}/musan ${data_dir}/musan_music +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_speech \ + ${data_dir}/musan ${data_dir}/musan_speech +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_noise \ + ${data_dir}/musan ${data_dir}/musan_noise + +utils/fix_data_dir.sh ${data_dir}/musan_music +utils/fix_data_dir.sh ${data_dir}/musan_speech +utils/fix_data_dir.sh ${data_dir}/musan_noise + +rm -rf $data_dir/musan.tmp + diff --git a/egs/voxceleb/adv.v1.1/local/make_rirs_data.sh b/egs/voxceleb/adv.v1.1/local/make_rirs_data.sh new file mode 100755 index 00000000..c6652eda --- /dev/null +++ b/egs/voxceleb/adv.v1.1/local/make_rirs_data.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# +# Copyright 2020 Johns Hopkins University (Jesus Villalba) +# +# Apache 2.0. +set -e + +if [ $# != 3 ]; then + echo "Usage: $0 <rir-dir> <fs> <data-dir>" + echo "e.g.: $0 RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom" +fi + +rir_dir=$1 +fs=$2 +data_dir=$3 + +mkdir -p $data_dir + +rir_list=$rir_dir/rir_list +if [ "$fs" -eq 16 ];then + awk '{ key=$5; sub(/.*\//,"",key); print key,$5 }' $rir_list > $data_dir/wav.scp +else + awk '{ +key=$5; sub(/.*\//,"",key); +print key,"sox "$5" -r 8000 -t wav -b 16 -e signed-integer - |" }' \ + $rir_list > $data_dir/wav.scp +fi +awk '{ key=$5; sub(/.*\//,"",key); print key,$4 }' $rir_list > $data_dir/rir2room + diff --git a/egs/voxceleb/v1/local/make_trials_subset.py b/egs/voxceleb/adv.v1.1/local/make_trials_subset.py similarity index 100% rename from egs/voxceleb/v1/local/make_trials_subset.py rename to egs/voxceleb/adv.v1.1/local/make_trials_subset.py diff --git a/egs/voxceleb/v1/local/make_voxceleb1_o.pl b/egs/voxceleb/adv.v1.1/local/make_voxceleb1_o.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_o.pl rename to egs/voxceleb/adv.v1.1/local/make_voxceleb1_o.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb2cat.pl b/egs/voxceleb/adv.v1.1/local/make_voxceleb2cat.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb2cat.pl rename to egs/voxceleb/adv.v1.1/local/make_voxceleb2cat.pl diff --git a/egs/voxceleb/adv.v1.1/local/score_dcf.py b/egs/voxceleb/adv.v1.1/local/score_dcf.py new file mode 100755 index 00000000..3524d222 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/local/score_dcf.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import argparse +import time +import logging + +import numpy as np + +from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.utils import SparseTrialScores, SparseTrialKey +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval + + +def score_dcf(key_file, score_file, output_path): + + logging.info("Load key: %s" % key_file) + key = SparseTrialKey.load_txt(key_file) + logging.info("Load scores: %s" % score_file) + scr = SparseTrialScores.load_txt(score_file) + logging.info("separating tar/non") + tar, non = scr.get_tar_non(key) + logging.info("computing EER/DCF") + priors = np.array([0.001, 0.005,
0.01, 0.05]) + min_dcf, act_dcf, eer, _, min_pmiss, min_pfa, act_pmiss, act_pfa = fast_eval( + tar, non, priors, return_probs=True + ) + + output_dir = os.path.dirname(output_path) + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + + ntar = len(tar) + nnon = len(non) + + output_file = output_path + "_results" + with open(output_file, "w") as f: + s = "EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f} ntar: {9:d} nnon: {10:d}\n".format( + eer * 100, + min_dcf[3], + act_dcf[3], + min_dcf[2], + act_dcf[2], + min_dcf[1], + act_dcf[1], + min_dcf[0], + act_dcf[0], + ntar, + nnon, + ) + f.write(s) + logging.info(s) + s = "min-pmiss={} min-pfa={} act-pmiss={} act-pfa={}".format( + min_pmiss, min_pfa, act_pmiss, act_pfa + ) + logging.info(s) + s = "min-Nmiss={} min-Nfa={} act-Nmiss={} act-Nfa={}".format( + min_pmiss * ntar, min_pfa * nnon, act_pmiss * ntar, act_pfa * nnon + ) + logging.info(s) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Computes EER and DCF", + ) + + parser.add_argument("--key-file", required=True) + parser.add_argument("--score-file", required=True) + parser.add_argument("--output-path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + score_dcf(**vars(args)) diff --git a/egs/voxceleb/v1/local/score_voxceleb1_o_clean.sh b/egs/voxceleb/adv.v1.1/local/score_voxceleb1_o_clean.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxceleb1_o_clean.sh rename to egs/voxceleb/adv.v1.1/local/score_voxceleb1_o_clean.sh diff --git a/egs/voxceleb/v1/local/score_voxceleb1_single_cond.sh b/egs/voxceleb/adv.v1.1/local/score_voxceleb1_single_cond.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxceleb1_single_cond.sh rename to egs/voxceleb/adv.v1.1/local/score_voxceleb1_single_cond.sh diff --git a/egs/voxceleb/adv.v1.1/run_002_compute_evad.sh b/egs/voxceleb/adv.v1.1/run_002_compute_evad.sh index eeae00ac..f6b8e62f 100755 --- a/egs/voxceleb/adv.v1.1/run_002_compute_evad.sh +++ b/egs/voxceleb/adv.v1.1/run_002_compute_evad.sh @@ -9,7 +9,6 @@ set -e nodes=fs01 storage_name=$(date +'%m_%d_%H_%M') vaddir=`pwd`/exp/vad_e -vad_config=conf/vad_16k.yaml stage=1 config_file=default_config.sh @@ -21,7 +20,7 @@ config_file=default_config.sh if [ $stage -le 1 ]; then # Prepare to distribute data over multiple machines if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then - dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage + dir_name=$USER/hyp-data/voxceleb/adv.v1.1/$storage_name/vad/storage if [ "$nodes" == "b0" ];then utils/create_split_dir.pl \ utils/create_split_dir.pl \ diff --git a/egs/voxceleb/adv.v1.1/run_004_prepare_victim_xvec_train_data.sh b/egs/voxceleb/adv.v1.1/run_004_prepare_victim_xvec_train_data.sh new file mode 100755 index 00000000..0e10ea68 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/run_004_prepare_victim_xvec_train_data.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
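For reference, the EER/DCF numbers this script prints come from the normalized Bayes detection cost, DCF(p_tar) = (p_tar * Pmiss + (1 - p_tar) * Pfa) / min(p_tar, 1 - p_tar), minimized over all decision thresholds for the min-DCF. A brute-force numpy sketch of the same quantity (hyperion's fast_eval_dcf_eer is the optimized version of this idea; the toy scores below are illustrative only):

```python
import numpy as np

def min_dcf(tar, non, p_tar):
    """Min normalized DCF over all thresholds, brute force (fine for a sketch).
    tar/non: arrays of target / non-target trial scores."""
    thr = np.sort(np.concatenate((tar, non)))
    p_miss = np.array([(tar < t).mean() for t in thr])   # targets rejected
    p_fa = np.array([(non >= t).mean() for t in thr])    # non-targets accepted
    dcf = p_tar * p_miss + (1 - p_tar) * p_fa
    return dcf.min() / min(p_tar, 1 - p_tar)  # normalize by the trivial system

tar = np.random.randn(1000) + 2.0   # toy target scores
non = np.random.randn(10000)        # toy non-target scores
for p in (0.001, 0.005, 0.01, 0.05):
    print(p, min_dcf(tar, non, p))
```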
$config_file + +if [ $stage -le 2 ]; then + # This script preprocess audio for x-vector training + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \ + --storage_name voxceleb-adv.v1.1-$(date +'%m_%d_%H_%M') --use-bin-vad true \ + data/${nnet_data} data/${nnet_data}_proc_audio_no_sil exp/${nnet_data}_proc_audio_no_sil + utils/fix_data_dir.sh data/${nnet_data}_proc_audio_no_sil + +fi + +if [ $stage -le 3 ]; then + # Now, we remove files with less than 4s + hyp_utils/remove_short_audios.sh --min-len 4 data/${nnet_data}_proc_audio_no_sil + + # We also want several utterances per speaker. Now we'll throw out speakers + # with fewer than 4 utterances. + hyp_utils/remove_spk_few_utts.sh --min-num-utts 4 data/${nnet_data}_proc_audio_no_sil +fi + +if [ $stage -le 4 ]; then + # Prepare train and validation lists for x-vectors + local/make_train_lists_sup_embed_with_augm.sh \ + data/${nnet_data}_proc_audio_no_sil \ + data/${nnet_data}_proc_audio_no_sil/lists_xvec +fi + diff --git a/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh b/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh new file mode 100755 index 00000000..aa779902 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +list_dir=data/${nnet_data}_proc_audio_no_sil + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-adv.v1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir \ + --num-gpus $ngpu + +fi + diff --git a/egs/voxceleb/adv.v1.1/run_012_prepare_transfer_xvec_train_data.sh b/egs/voxceleb/adv.v1.1/run_006_prepare_transfer_xvec_train_data.sh similarity index 100% rename from egs/voxceleb/adv.v1.1/run_012_prepare_transfer_xvec_train_data.sh rename to egs/voxceleb/adv.v1.1/run_006_prepare_transfer_xvec_train_data.sh diff --git a/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh b/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh new file mode 100755 index 00000000..420ac59d --- /dev/null +++ b/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# 
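run_004 above trims the training data before x-vector training: utterances shorter than 4 s are dropped first, then speakers left with fewer than 4 utterances. A rough Python equivalent of that filtering on Kaldi-style utt2dur/utt2spk files (the helper below is illustrative, not the actual hyp_utils implementation):

```python
from collections import Counter

def filter_data_dir(utt2dur, utt2spk, min_len=4.0, min_utts=4):
    """Roughly what remove_short_audios.sh + remove_spk_few_utts.sh do."""
    durs = {u: float(d) for u, d in (l.split() for l in open(utt2dur))}
    spk = {u: s for u, s in (l.split() for l in open(utt2spk))}
    # drop utterances shorter than min_len seconds
    keep = {u for u, d in durs.items() if d >= min_len}
    # then drop speakers left with fewer than min_utts utterances
    counts = Counter(spk[u] for u in keep if u in spk)
    keep = {u for u in keep if counts.get(spk.get(u), 0) >= min_utts}
    return sorted(keep)
```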
Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +if [ "$nnet" == "$transfer_nnet" ];then + echo "Victim and transfer model are the same" + echo "Skipping this step" + exit 0 +fi + +list_dir=data/${transfer_nnet_data}_proc_audio_no_sil +nnet_type=$transfer_nnet_type +nnet_dir=$transfer_nnet_dir +nnet_cfg=$transfer_nnet_cfg +nnet_args=$transfer_nnet_args + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-adv.v1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir \ + --num-gpus $ngpu + +fi + diff --git a/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh b/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh new file mode 100755 index 00000000..4f2c137b --- /dev/null +++ b/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
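The --data.train.dataset.* and --trainer.* flags passed to train_xvector_from_wav.py above are nested configuration keys: dots descend into sub-configs, and --cfg loads a YAML base config that command-line values override. A minimal jsonargparse-style sketch of the pattern (the exact parser setup inside hyperion may differ; the keys here are illustrative):

```python
from jsonargparse import ArgumentParser, ActionConfigFile

parser = ArgumentParser()
parser.add_argument("--cfg", action=ActionConfigFile)  # YAML base config
parser.add_argument("--data.train.dataset.recordings-file", type=str)
parser.add_argument("--trainer.exp-path", type=str)

cfg = parser.parse_args(
    ["--data.train.dataset.recordings-file", "data/wav.scp",
     "--trainer.exp-path", "exp/xvector"]
)
print(cfg.data.train.dataset.recordings_file)  # dots build nested namespaces
```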
datapath.sh + +if [ "$nnet" == "$transfer_nnet" ];then + echo "Victim and transfer model are the same" + echo "Skipping this step" + exit 0 +fi + +list_dir=data/${nnet_data}_proc_audio_no_sil +nnet_dir=$advft_nnet_dir +nnet_cfg=$advft_nnet_cfg +nnet_args=$advft_nnet_args + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-adv.v1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + adv_finetune_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir \ + --num-gpus $ngpu + +fi + + + +# #!/bin/bash +# # Copyright +# # 2019 Johns Hopkins University (Author: Jesus Villalba) +# # Apache 2.0. +# # +# . ./cmd.sh +# . ./path.sh +# set -e + +# stage=1 +# ngpu=4 +# config_file=default_config.sh +# resume=false +# interactive=false +# num_workers=8 + +# . parse_options.sh || exit 1; +# . $config_file +# . 
datapath.sh + +# batch_size=$(($advft_batch_size_1gpu*$ngpu)) +# grad_acc_steps=$(echo $batch_size $advft_eff_batch_size | awk '{ print int($2/$1+0.5)}') +# log_interval=$(echo 100*$grad_acc_steps | bc) +# list_dir=data/${nnet_data}_proc_audio_no_sil + +# args="" +# if [ "$resume" == "true" ];then +# args="--resume" +# fi + +# if [ "$interactive" == "true" ];then +# export cuda_cmd=run.pl +# fi + +# # Network Training +# if [ $stage -le 1 ]; then +# mkdir -p $advft_nnet_dir/log +# $cuda_cmd --gpu $ngpu $advft_nnet_dir/log/train.log \ +# hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ +# torch-adv-finetune-xvec-from-wav.py --feats $feat_config $aug_opt \ +# --audio-path $list_dir/wav.scp \ +# --time-durs-file $list_dir/utt2dur \ +# --train-list $list_dir/lists_xvec/train.scp \ +# --val-list $list_dir/lists_xvec/val.scp \ +# --class-file $list_dir/lists_xvec/class2int \ +# --min-chunk-length $min_chunk --max-chunk-length $max_chunk \ +# --iters-per-epoch $ipe \ +# --batch-size $batch_size \ +# --num-workers $num_workers \ +# --grad-acc-steps $grad_acc_steps $advft_opt_opt $advft_lrs_opt \ +# --epochs $advft_nnet_num_epochs \ +# --s $s --margin $advft_margin --margin-warmup-epochs $advft_margin_warmup \ +# --num-gpus $ngpu \ +# --train-mode ft-full \ +# --log-interval $log_interval \ +# --in-model-path $nnet \ +# --exp-path $advft_nnet_dir $advft_attack_opts $args + +# fi +# # + +# exit diff --git a/egs/voxceleb/adv.v1.1/run_009_extract_xvectors_victim_model.sh b/egs/voxceleb/adv.v1.1/run_009_extract_xvectors_victim_model.sh new file mode 100755 index 00000000..2df747e6 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/run_009_extract_xvectors_victim_model.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +use_gpu=false +xvec_chunk_length=12800 +ft=0 + +. parse_options.sh || exit 1; +. $config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu true --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --mem 4G" +else + xvec_cmd="$train_cmd --mem 12G" +fi + +if [ $ft -eq 1 ];then + nnet_name=$advft_nnet_name + nnet=$advft_nnet +fi + +xvector_dir=exp/xvectors/$nnet_name + +if [ $stage -le 1 ]; then + # Extracts x-vectors for evaluation + for name in voxceleb1_test + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 100 ? $num_spk:100)) + steps_xvec/extract_xvectors_from_wav.sh \ + --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ + --feat-config $feat_config \ + $nnet data/$name \ + $xvector_dir/$name + done +fi + diff --git a/egs/voxceleb/adv.v1.1/run_010_extract_xvectors_transfer_model.sh b/egs/voxceleb/adv.v1.1/run_010_extract_xvectors_transfer_model.sh new file mode 100755 index 00000000..7e2488b3 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/run_010_extract_xvectors_transfer_model.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +use_gpu=false +xvec_chunk_length=12800 +. parse_options.sh || exit 1; +. 
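The commented-out legacy script above sizes gradient accumulation so that batch_size_1gpu * ngpu * grad_acc_steps roughly matches the requested effective batch size, and scales the logging interval to match. Its awk one-liner is just rounded division; the same arithmetic in Python:

```python
def grad_acc_steps(batch_size_1gpu, ngpu, eff_batch_size):
    """Same rounding as the awk one-liner: int(eff/batch + 0.5)."""
    batch_size = batch_size_1gpu * ngpu
    return int(eff_batch_size / batch_size + 0.5)

assert grad_acc_steps(32, 4, 512) == 4   # 512 / (32 * 4) = 4
log_interval = 100 * grad_acc_steps(32, 4, 512)  # mirrors the bc expression
```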
$config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu true --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --mem 4G" +else + xvec_cmd="$train_cmd --mem 12G" +fi + +feat_config=$transfer_feat_config +nnet_name=$transfer_nnet_name +nnet=$transfer_nnet + +xvector_dir=exp/xvectors/$nnet_name + +if [ $stage -le 1 ]; then + # Extracts x-vectors for evaluation + for name in voxceleb1_test + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 100 ? $num_spk:100)) + steps_xvec/extract_xvectors_from_wav.sh \ + --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ + --feat-config $feat_config \ + $nnet data/$name \ + $xvector_dir/$name + done +fi + + diff --git a/egs/voxceleb/adv.v1.1/run_010_prepare_victim_xvec_train_data.sh b/egs/voxceleb/adv.v1.1/run_010_prepare_victim_xvec_train_data.sh deleted file mode 100755 index f89c9822..00000000 --- a/egs/voxceleb/adv.v1.1/run_010_prepare_victim_xvec_train_data.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -# Copyright -# 2020 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. $config_file - -if [ $stage -le 2 ]; then - # This script preprocess audio for x-vector training - steps_xvec/preprocess_audios_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \ - --storage_name voxceleb-adv.v2.1-$(date +'%m_%d_%H_%M') --use-bin-vad true \ - data/${nnet_data} data/${nnet_data}_proc_audio_no_sil exp/${nnet_data}_proc_audio_no_sil - utils/fix_data_dir.sh data/${nnet_data}_proc_audio_no_sil - -fi - -if [ $stage -le 3 ]; then - # Now, we remove files with less than 4s - hyp_utils/remove_short_audios.sh --min-len 4 data/${nnet_data}_proc_audio_no_sil - - # We also want several utterances per speaker. Now we'll throw out speakers - # with fewer than 4 utterances. 
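The two extraction scripts above (run_009 for the victim model, run_010 for the transfer model) produce the x-vectors that every later stage compares with cosine scoring: each trial score is the cosine between length-normalized enrollment and test embeddings. A minimal numpy sketch (dimensions are illustrative):

```python
import numpy as np

def cosine_scores(enroll, test):
    """Cosine scoring between x-vector matrices (n_enroll x d, n_test x d)."""
    e = enroll / np.linalg.norm(enroll, axis=1, keepdims=True)
    t = test / np.linalg.norm(test, axis=1, keepdims=True)
    return e @ t.T  # score matrix, one row per enrollment model

scores = cosine_scores(np.random.randn(5, 256), np.random.randn(8, 256))
```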
- hyp_utils/remove_spk_few_utts.sh --min-num-utts 4 data/${nnet_data}_proc_audio_no_sil - -fi - -if [ $stage -le 4 ]; then - # Prepare train and validation lists for x-vectors - local/make_train_lists_sup_embed_with_augm.sh \ - data/${nnet_data}_proc_audio_no_sil \ - data/${nnet_data}_proc_audio_no_sil/lists_xvec -fi - -exit diff --git a/egs/voxceleb/adv.v1.1/run_040_eval_be_victim_model.sh b/egs/voxceleb/adv.v1.1/run_011_eval_be_victim_model.sh similarity index 100% rename from egs/voxceleb/adv.v1.1/run_040_eval_be_victim_model.sh rename to egs/voxceleb/adv.v1.1/run_011_eval_be_victim_model.sh diff --git a/egs/voxceleb/adv.v1.1/run_041_eval_be_transfer_model.sh b/egs/voxceleb/adv.v1.1/run_012_eval_be_transfer_model.sh similarity index 100% rename from egs/voxceleb/adv.v1.1/run_041_eval_be_transfer_model.sh rename to egs/voxceleb/adv.v1.1/run_012_eval_be_transfer_model.sh diff --git a/egs/voxceleb/adv.v1.1/run_043_eval_whitebox_attacks.sh b/egs/voxceleb/adv.v1.1/run_013_eval_whitebox_attacks.sh similarity index 100% rename from egs/voxceleb/adv.v1.1/run_043_eval_whitebox_attacks.sh rename to egs/voxceleb/adv.v1.1/run_013_eval_whitebox_attacks.sh diff --git a/egs/voxceleb/adv.v1.1/run_044_eval_transfer_blackbox_attacks.sh b/egs/voxceleb/adv.v1.1/run_014_eval_transfer_blackbox_attacks.sh similarity index 100% rename from egs/voxceleb/adv.v1.1/run_044_eval_transfer_blackbox_attacks.sh rename to egs/voxceleb/adv.v1.1/run_014_eval_transfer_blackbox_attacks.sh diff --git a/egs/voxceleb/adv.v1.1/run_015_eval_whitebox_attacks_with_randsmooth_defense.sh b/egs/voxceleb/adv.v1.1/run_015_eval_whitebox_attacks_with_randsmooth_defense.sh new file mode 100755 index 00000000..fc9d6a7d --- /dev/null +++ b/egs/voxceleb/adv.v1.1/run_015_eval_whitebox_attacks_with_randsmooth_defense.sh @@ -0,0 +1,378 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +use_gpu=false +do_analysis=false +save_wav=false +use_trials_subset=false +sigmas="0.001 0.01" +max_test_length="" + +. parse_options.sh || exit 1; +. $config_file +. 
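run_015 below hard-codes thr005/thr001/thr0001 as the attack's decision thresholds; as its comment notes, these are the Bayes thresholds for calibrated log-likelihood-ratio scores at target priors p = 0.05, 0.01, 0.001, i.e. thr = ln((1 - p)/p). A quick check of the arithmetic:

```python
import math

# Bayes threshold for a calibrated LLR at target prior p: log((1 - p) / p)
for p in (0.05, 0.01, 0.001):
    print(p, round(math.log((1 - p) / p), 2))
# 0.05 -> 2.94, 0.01 -> 4.6, 0.001 -> 6.91 (the recipe rounds this to 6.90)
```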
datapath.sh + +if [ "$use_gpu" == "true" ];then + eval_args="--use-gpu true" + eval_cmd="$cuda_eval_cmd" +else + eval_cmd="$train_cmd" +fi + +if [ -n "$max_test_length" ];then + eval_args="${eval_args} --max-test-length $max_test_length" +fi + +if [ "$use_trials_subset" == "true" ];then + condition=o_clean_1000_1000 +else + condition=o_clean +fi +trial_list=data/voxceleb1_test/trials_$condition + +xvector_dir=exp/xvectors/$nnet_name +score_dir=exp/scores/$nnet_name + +score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores +cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 + +#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) +thr005=2.94 +thr001=4.60 +thr0001=6.90 +declare -a score_array +declare -a stats_array + +if [ $stage -le 1 ];then + + for sigma in $sigmas + do + score_array=() + stats_array=() + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + score_plda_dir=$score_dir/cosine_fgsm_e${eps}_randsmooth${sigma} + echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" + steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type fgsm --attack.eps $eps" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_fgsm_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + done +fi + + +if [ $stage -le 2 ];then + + for sigma in $sigmas + do + score_array=() + stats_array=() + for snr in 30 20 10 0 + do + score_plda_dir=$score_dir/cosine_fgsm_snr${snr}_randsmooth${sigma} + echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack snr=$snr" + steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type snr-fgsm --attack.snr $snr" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_fgsm_snrall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + done +fi + + +if [ $stage -le 3 ];then + for 
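Stages 1 and 2 above sweep the perturbation size of the FGSM attack, either directly via eps or via a target SNR. FGSM takes a single signed-gradient step on the input waveform; the SNR variant just derives eps from the clean-signal power. A hedged PyTorch sketch (loss_fn is a stand-in for the scoring loss, and the snr-fgsm reading is an assumption; the real attacks live in hyperion and may differ in detail):

```python
import torch

def fgsm(x, loss_fn, eps):
    """Single signed-gradient step on the waveform (FGSM)."""
    x = x.clone().detach().requires_grad_(True)
    loss_fn(x).backward()
    return (x + eps * x.grad.sign()).detach()

def snr_fgsm(x, loss_fn, snr_db):
    """FGSM with eps chosen so the sign perturbation has the target SNR
    (assumed reading of the recipe's snr-fgsm attack)."""
    p_x = (x ** 2).mean()                        # clean-signal power
    eps = torch.sqrt(p_x / 10 ** (snr_db / 10))  # sign noise has power eps^2
    return fgsm(x, loss_fn, eps)
```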
sigma in $sigmas + do + + score_array=() + stats_array=() + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/cosine_randfgsm_e${eps}_a${alpha}_randsmooth${sigma} + echo "Eval Voxceleb 1 with Cosine scoring with Rand-FGSM attack eps=$eps" + steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type rand-fgsm --attack.eps $eps --attack.alpha $alpha" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_randfgsm_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + done +fi + + +if [ $stage -le 4 ];then + for sigma in $sigmas + do + + score_array=() + stats_array=() + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/cosine_iterfgsm_e${eps}_a${alpha}_randsmooth${sigma} + echo "Eval Voxceleb 1 with Cosine scoring with Iterative FGSM attack eps=$eps" + steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type iter-fgsm --attack.eps $eps --attack.alpha $alpha" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav --smooth-sigma $sigma \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_iterfgsm_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + done +fi + + +if [ $stage -le 5 ];then + + for sigma in $sigmas + do + + for confidence in 0 #1 + do + for lr in 0.001 + do + for it in 10 + do + + score_plda_dir=$score_dir/cosine_cwl2_conf${confidence}_lr${lr}_noabort_it${it}_randsmooth${sigma} + echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence lr=$lr num-its=$it" + steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 100 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort 
--attack.max-iter $it" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_plda_dir + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ + $score_analysis_dir/voxceleb1 & + fi + done + done + done + done +fi + + +if [ $stage -le 6 ];then + for sigma in $sigmas + do + + for confidence in 0 #1 + do + for lr in 0.001 + do + for it in 10 + do + score_plda_dir=$score_dir/cosine_cwrms_conf${confidence}_lr${lr}_noabort_it${it}_randsmooth${sigma} + echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner RMS attack confidence=$confidence lr=$lr num_its=$it" + steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd -tc 15" $eval_args --nj 100 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.norm-time --attack.max-iter $it" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_plda_dir + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ + $score_analysis_dir/voxceleb1 & + fi + + done + done + done + done +fi + + +if [ $stage -le 7 ];then + for sigma in $sigmas + do + + for confidence in 0 #1 + do + for lr in 0.001 + do + for it in 10 + do + score_plda_dir=$score_dir/cosine_cwsnr_conf${confidence}_lr${lr}_noabort_it${it}_randsmooth${sigma} + echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner SNR attack confidence=$confidence lr=$lr num_its=$it" + steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd -tc 15" $eval_args --nj 100 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.use-snr --attack.max-iter $it" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + 
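Stages 5-7 of this script all wrap the same cw-l2 attack; the RMS and SNR variants only change how the perturbation size enters the objective (--attack.norm-time, --attack.use-snr). Carlini-Wagner trades off perturbation energy against a confidence margin on the score. A schematic PyTorch sketch (score_fn, the margin formulation, and the fixed trade-off constant c are simplifying assumptions, not hyperion's exact objective):

```python
import torch

def cw_l2(x, score_fn, threshold, confidence=0.0, lr=0.001, max_iter=10, c=1.0):
    """Schematic Carlini-Wagner L2: minimize ||delta||^2 + c * margin loss.
    score_fn(x) is the calibrated verification score; for a non-target trial
    the attack tries to push it above threshold + confidence."""
    delta = torch.zeros_like(x, requires_grad=True)
    opt = torch.optim.Adam([delta], lr=lr)
    for _ in range(max_iter):
        margin = torch.clamp(threshold + confidence - score_fn(x + delta), min=0)
        loss = (delta ** 2).sum() + c * margin
        opt.zero_grad()
        loss.backward()
        opt.step()
    return (x + delta).detach()
```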
echo $f + cat $f + echo "" + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_plda_dir + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ + $score_analysis_dir/voxceleb1 & + fi + + done + done + done + done +fi + + +exit + diff --git a/egs/voxceleb/adv.v1.1/run_046_eval_whitebox_attacks_with_randsmooth_wavegan_defense.sh b/egs/voxceleb/adv.v1.1/run_016_eval_whitebox_attacks_with_randsmooth_wavegan_defense.sh similarity index 100% rename from egs/voxceleb/adv.v1.1/run_046_eval_whitebox_attacks_with_randsmooth_wavegan_defense.sh rename to egs/voxceleb/adv.v1.1/run_016_eval_whitebox_attacks_with_randsmooth_wavegan_defense.sh diff --git a/egs/voxceleb/adv.v1.1/run_017_eval_art_whitebox_attacks.sh b/egs/voxceleb/adv.v1.1/run_017_eval_art_whitebox_attacks.sh new file mode 100755 index 00000000..3a92630f --- /dev/null +++ b/egs/voxceleb/adv.v1.1/run_017_eval_art_whitebox_attacks.sh @@ -0,0 +1,1152 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +use_gpu=false +do_analysis=false +save_wav=false +use_trials_subset=false + +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +if [ "$use_gpu" == "true" ];then + eval_args="--use-gpu true" + eval_cmd="$cuda_eval_cmd" +else + eval_cmd="$train_cmd" +fi + +if [ "$use_trials_subset" == "true" ];then + condition=o_clean_1000_1000 +else + condition=o_clean +fi +trial_list=data/voxceleb1_test/trials_$condition + +xvector_dir=exp/xvectors/$nnet_name +score_dir=exp/scores/$nnet_name + +score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores +cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 + +#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) +thr005=2.94 +thr001=4.60 +thr0001=6.90 + +declare -a score_array +declare -a stats_array + +if [ $stage -le 1 ];then + + score_array=() + stats_array=() + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + score_plda_dir=$score_dir/cosine_art_fgsm_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_fgsm_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi +fi + +if [ $stage -le 2 ];then + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/cosine_art_fgsm_minimal_e${eps} + echo "Eval Voxceleb 1 with Cosine 
scoring with FGSM minimal attack eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.eps-step $alpha --attack.minimal" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_fgsm_minimal_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + +if [ $stage -le 3 ];then + score_array=() + stats_array=() + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + score_plda_dir=$score_dir/cosine_art_fgml1_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGM-L1 attack eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 1" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_fgml1_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi +fi + +if [ $stage -le 4 ];then + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/cosine_art_fgml1_minimal_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGM-L1 minimal attack eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.eps-step $alpha --attack.minimal --attack.norm 1" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls 
$score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_fgml1_minimal_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + +if [ $stage -le 5 ];then + score_array=() + stats_array=() + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + score_plda_dir=$score_dir/cosine_art_fgml2_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGM-L2 attack eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 2" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_fgml2_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi +fi + +if [ $stage -le 6 ];then + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/cosine_art_fgml2_minimal_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGM-L2 minimal attack eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.eps-step $alpha --attack.minimal --attack.norm 2" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_fgml2_minimal_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + +if [ $stage -le 7 ];then + score_array=() + stats_array=() + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/cosine_art_iterfgsm_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with IterFGM attack 
eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type bim --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_iterfgsm_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi +fi + +if [ $stage -le 8 ];then + score_array=() + stats_array=() + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/cosine_art_pgdlinf_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with PGD Linf attack eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_pgdlinf_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi +fi + + +if [ $stage -le 9 ];then + score_array=() + stats_array=() + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/cosine_art_pgdl1_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with PGD L1 attack eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 1" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition 
$score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_pgdl1_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi +fi + +if [ $stage -le 10 ];then + score_array=() + stats_array=() + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/cosine_art_pgdl2_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with PGD L2 attack eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 2" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_pgdl2_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi +fi + +if [ $stage -le 11 ];then + + for confidence in 0 #1 + do + score_plda_dir=$score_dir/cosine_art_cwl2_conf${confidence} + echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_plda_dir + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ + $score_analysis_dir/voxceleb1 & + fi + + done + +fi + + +if [ $stage -le 12 ];then + + for confidence in 0 #1 + do + score_plda_dir=$score_dir/cosine_art_cwlinf_conf${confidence} + echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner Linf attack confidence=$confidence" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ + --feat-config $feat_config \ + --attack-opts 
"--attack.attack-type cw-linf --attack.confidence $confidence --attack.initial-c 1e-5" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_plda_dir + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ + $score_analysis_dir/voxceleb1 & + fi + + done + +fi + +if [ $stage -le 14 ];then + score_array=() + stats_array=() + for norm in inf 1 2 + do + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/cosine_art_autopgdl${norm}_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with Auto-PGD $norm attack eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh \ + --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type auto-pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm $norm" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_autopgdl${norm}_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + done +fi + +if [ $stage -le 15 ];then + score_array=() + stats_array=() + for norm in inf 1 2 + do + for eps in 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/cosine_art_autocgdl${norm}_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with Auto-CGD $norm attack eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh \ + --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type auto-cgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm $norm" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + 
score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_autocgdl${norm}_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + done +fi + +if [ $stage -le 16 ];then + score_array=() + stats_array=() + for eps in 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/cosine_art_deepfool_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with DeepFool attack eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh \ + --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type deepfool --attack.eps $eps --attack.max-iter 100" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_deepfool_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi +fi + +if [ $stage -le 17 ];then + + for confidence in 0 #1 + do + score_plda_dir=$score_dir/cosine_art_elasticnet_conf${confidence} + echo "Eval Voxceleb 1 with Cosine scoring with ElasticNet attack confidence=$confidence" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type elasticnet --attack.confidence $confidence --attack.max-iter 100 --attack.lr 0.01" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_plda_dir + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ + $score_analysis_dir/voxceleb1 & + fi + + done + +fi + + +if [ $stage -le 20 ];then + + for norm in inf 2 + do + score_plda_dir=$score_dir/cosine_art_hopskipjump_norm${norm} + echo "Eval Voxceleb 1 with Cosine scoring with Hopskipjump attack norm=$norm" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type hop-skip-jump --attack.norm $norm --attack.max-iter 50 --attack.max-eval 10000 --attack.init-eval 10 
--attack.init-size 100" \
+      --save-wav $save_wav --save-wav-path $score_plda_dir/wav \
+      --cal-file $cal_file --threshold $thr005 \
+      $trial_list \
+      data/voxceleb1_test/utt2model \
+      data/voxceleb1_test \
+      $xvector_dir/voxceleb1_test/xvector.scp \
+      $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats
+
+    $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \
+      local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir
+
+    for f in $(ls $score_plda_dir/*_results);
+    do
+      echo $f
+      cat $f
+      echo ""
+    done
+    if [ "${do_analysis}" == "true" ];then
+      score_analysis_dir=$score_plda_dir
+      local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \
+        $trial_list $score_clean \
+        $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \
+        $score_analysis_dir/voxceleb1 &
+    fi
+
+  done
+
+fi
+
+
+if [ $stage -le 23 ];then
+
+  for eta in 0.01
+  do
+    score_plda_dir=$score_dir/cosine_art_newtonfool_eta$eta
+    echo "Eval Voxceleb 1 with Cosine scoring with NewtonFool attack eta=$eta"
+    steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \
+      --feat-config $feat_config \
+      --attack-opts "--attack.attack-type newtonfool --attack.eta $eta" \
+      --save-wav $save_wav --save-wav-path $score_plda_dir/wav \
+      --cal-file $cal_file --threshold $thr005 \
+      $trial_list \
+      data/voxceleb1_test/utt2model \
+      data/voxceleb1_test \
+      $xvector_dir/voxceleb1_test/xvector.scp \
+      $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats
+
+    $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \
+      local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir
+
+    for f in $(ls $score_plda_dir/*_results);
+    do
+      echo $f
+      cat $f
+      echo ""
+    done
+    if [ "${do_analysis}" == "true" ];then
+      score_analysis_dir=$score_plda_dir
+      local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \
+        $trial_list $score_clean \
+        $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \
+        $score_analysis_dir/voxceleb1 &
+    fi
+
+  done
+
+fi
+
+if [ $stage -le 25 ];then
+
+  for lambda_tv in 0.3
+  do
+    score_plda_dir=$score_dir/cosine_art_shadow_lambda${lambda_tv}
+    echo "Eval Voxceleb 1 with Cosine scoring with Shadow attack lambda=$lambda_tv"
+    steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \
+      --feat-config $feat_config \
+      --attack-opts "--attack.attack-type shadow --attack.lambda-tv $lambda_tv" \
+      --save-wav $save_wav --save-wav-path $score_plda_dir/wav \
+      --cal-file $cal_file --threshold $thr005 \
+      $trial_list \
+      data/voxceleb1_test/utt2model \
+      data/voxceleb1_test \
+      $xvector_dir/voxceleb1_test/xvector.scp \
+      $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats
+
+    $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \
+      local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir
+
+    for f in $(ls $score_plda_dir/*_results);
+    do
+      echo $f
+      cat $f
+      echo ""
+    done
+    if [ "${do_analysis}" == "true" ];then
+      score_analysis_dir=$score_plda_dir
+      local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \
+        $trial_list $score_clean \
+        $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \
+        $score_analysis_dir/voxceleb1 &
+    fi
+
+  done
+
+fi
+
+if [ $stage -le 26 ];then
+  score_array=()
+  stats_array=()
+  for eps in 0.00001 0.0001 0.001 0.01 0.1
+  do
+    alpha=$(echo $eps | awk '{ print $0/5.}')
+    score_plda_dir=$score_dir/cosine_art_wass_e${eps}
+    echo "Eval Voxceleb 1 with Cosine scoring with Wasserstein
attack eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type wasserstein --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.reg 1" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_wass_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi +fi + +if [ $stage -le 27 ];then + + for confidence in 0 #1 + do + score_plda_dir=$score_dir/cosine_art_zoo_conf${confidence} + echo "Eval Voxceleb 1 with Cosine scoring with Zoo attack confidence=$confidence" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type zoo --attack.confidence $confidence" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_plda_dir + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ + $score_analysis_dir/voxceleb1 & + fi + + done + +fi + + +# The attacks below have issues when applying to audio + +# if [ $stage -le 13 ];then + +# for eps in 0.0001 +# do +# score_plda_dir=$score_dir/cosine_art_boundary_eps${eps} +# alpha=$(echo $eps | awk '{ print $0/5.}') +# echo "Eval Voxceleb 1 with Cosine scoring with boundary attack eps=$eps" +# steps_adv/eval_cosine_scoring_from_art_test_wav.sh \ +# --cmd "$eval_cmd" $eval_args --nj 400 \ +# --feat-config $feat_config \ +# --attack-opts "--attack.attack-type boundary --attack.eps $eps --attack.delta $eps --attack.max-iter 5000" \ +# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ +# --cal-file $cal_file --threshold $thr005 \ +# $trial_list \ +# data/voxceleb1_test/utt2model \ +# data/voxceleb1_test \ +# $xvector_dir/voxceleb1_test/xvector.scp \ +# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + +# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ +# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + +# for f in $(ls $score_plda_dir/*_results); +# do +# echo $f +# cat $f +# echo "" +# done +# if [ "${do_analysis}" == "true" ];then +# 
score_analysis_dir=$score_plda_dir +# local/attack_analysis.sh \ +# --cmd "$train_cmd --mem 10G" \ +# $trial_list $score_clean \ +# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ +# $score_analysis_dir/voxceleb1 & +# fi + +# done + +# fi + +# it needs access to hidden layers +# if [ $stage -le 18 ];then +# for eps in 0.00001 0.0001 0.001 0.01 0.1 +# do +# alpha=$(echo $eps | awk '{ print $0/5.}') +# score_plda_dir=$score_dir/cosine_art_fadv_e${eps} +# echo "Eval Voxceleb 1 with Cosine scoring with feature adversaries attack eps=$eps" +# steps_adv/eval_cosine_scoring_from_art_test_wav.sh \ +# --cmd "$eval_cmd" $eval_args --nj 80 \ +# --feat-config $feat_config \ +# --attack-opts "--attack.attack-type feature-adv --attack.delta $eps --attack.eps-step $alpha --attack.max-iter 100" \ +# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ +# --cal-file $cal_file --threshold $thr005 \ +# $trial_list \ +# data/voxceleb1_test/utt2model \ +# data/voxceleb1_test \ +# $xvector_dir/voxceleb1_test/xvector.scp \ +# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + +# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ +# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + +# for f in $(ls $score_plda_dir/*_results); +# do +# echo $f +# cat $f +# echo "" +# done + +# score_array+=($score_plda_dir/voxceleb1_scores) +# stats_array+=($score_plda_dir/voxceleb1_stats) + +# done +# if [ "${do_analysis}" == "true" ];then +# score_analysis_dir=$score_dir/cosine_art_fadv_eall +# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ +# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ +# $score_analysis_dir/voxceleb1 & +# fi +# fi + +# if [ $stage -le 19 ];then +# score_array=() +# stats_array=() +# for norm in inf 1 2 +# do +# for sigma in 0.0002 +# do +# score_plda_dir=$score_dir/cosine_art_geoda${norm}_s${sigma} +# echo "Eval Voxceleb 1 with Cosine scoring with GeoDA $norm sigma=$sigma" +# steps_adv/eval_cosine_scoring_from_art_test_wav.sh \ +# --cmd "$eval_cmd" $eval_args --nj 80 \ +# --feat-config $feat_config \ +# --attack-opts "--attack.attack-type geoda --attack.max-iter 4000 --attack.sigma-geoda $sigma --attack.norm $norm" \ +# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ +# --cal-file $cal_file --threshold $thr005 \ +# $trial_list \ +# data/voxceleb1_test/utt2model \ +# data/voxceleb1_test \ +# $xvector_dir/voxceleb1_test/xvector.scp \ +# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + +# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ +# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + +# for f in $(ls $score_plda_dir/*_results); +# do +# echo $f +# cat $f +# echo "" +# done + +# score_array+=($score_plda_dir/voxceleb1_scores) +# stats_array+=($score_plda_dir/voxceleb1_stats) + +# done +# if [ "${do_analysis}" == "true" ];then +# score_analysis_dir=$score_dir/cosine_art_geoda${norm}_sall +# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ +# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ +# $score_analysis_dir/voxceleb1 & +# fi +# done +# fi + +# +# if [ $stage -le 21 ];then + +# for norm in inf 1 2 +# do +# score_plda_dir=$score_dir/cosine_art_brendel_norm${norm} +# echo "Eval Voxceleb 1 with Cosine scoring with Brendel attack norm=$norm" +# steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ +# --feat-config $feat_config \ +#
--attack-opts "--attack.attack-type brendel --attack.norm $norm --attack.max-iter 1000 --attack.lr 1e-3 --attack.binary-search-steps 10 --attack.init-size 100" \ +# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ +# --cal-file $cal_file --threshold $thr005 \ +# $trial_list \ +# data/voxceleb1_test/utt2model \ +# data/voxceleb1_test \ +# $xvector_dir/voxceleb1_test/xvector.scp \ +# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + +# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ +# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + +# for f in $(ls $score_plda_dir/*_results); +# do +# echo $f +# cat $f +# echo "" +# done +# if [ "${do_analysis}" == "true" ];then +# score_analysis_dir=$score_plda_dir +# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ +# $trial_list $score_clean \ +# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ +# $score_analysis_dir/voxceleb1 & +# fi + +# done + +# fi + +## it needs to train some importance vector +# if [ $stage -le 22 ];then + +# for norm in 2 +# do +# score_plda_dir=$score_dir/cosine_art_lowprofool_norm${norm} +# echo "Eval Voxceleb 1 with Cosine scoring with LowProFool attack norm=$norm" +# steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ +# --feat-config $feat_config \ +# --attack-opts "--attack.attack-type low-pro-fool --attack.norm $norm --attack.max-iter 100" \ +# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ +# --cal-file $cal_file --threshold $thr005 \ +# $trial_list \ +# data/voxceleb1_test/utt2model \ +# data/voxceleb1_test \ +# $xvector_dir/voxceleb1_test/xvector.scp \ +# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + +# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ +# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + +# for f in $(ls $score_plda_dir/*_results); +# do +# echo $f +# cat $f +# echo "" +# done +# if [ "${do_analysis}" == "true" ];then +# score_analysis_dir=$score_plda_dir +# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ +# $trial_list $score_clean \ +# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ +# $score_analysis_dir/voxceleb1 & +# fi + +# done + +# fi + +## Too SLOW +# if [ $stage -le 24 ];then + +# for theta in 0.1 +# do +# score_plda_dir=$score_dir/cosine_art_jsma_theta$theta +# echo "Eval Voxceleb 1 with Cosine scoring with JSMA attack theta=$theta" +# steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ +# --feat-config $feat_config \ +# --attack-opts "--attack.attack-type jsma --attack.theta $theta" \ +# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ +# --cal-file $cal_file --threshold $thr005 \ +# $trial_list \ +# data/voxceleb1_test/utt2model \ +# data/voxceleb1_test \ +# $xvector_dir/voxceleb1_test/xvector.scp \ +# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + +# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ +# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + +# for f in $(ls $score_plda_dir/*_results); +# do +# echo $f +# cat $f +# echo "" +# done +# if [ "${do_analysis}" == "true" ];then +# score_analysis_dir=$score_plda_dir +# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ +# $trial_list $score_clean \ +# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ +# $score_analysis_dir/voxceleb1 & 
diff --git a/egs/voxceleb/adv.v1.1/run_018_eval_art_transfer_blackbox_attacks.sh b/egs/voxceleb/adv.v1.1/run_018_eval_art_transfer_blackbox_attacks.sh new file mode 100755 index 00000000..bc6390f2 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/run_018_eval_art_transfer_blackbox_attacks.sh @@ -0,0 +1,633 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +use_gpu=false +do_analysis=false +save_wav=false +use_trials_subset=false + +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +transfer_feat_config=$feat_config + +if [ "$use_gpu" == "true" ];then + eval_args="--use-gpu true" + eval_cmd="$cuda_eval_cmd" +else + eval_cmd="$train_cmd" +fi + +if [ "$use_trials_subset" == "true" ];then + condition=o_clean_1000_1000 +else + condition=o_clean +fi +trial_list=data/voxceleb1_test/trials_$condition + +xvector_dir=exp/xvectors/$nnet_name +score_dir=exp/scores/$nnet_name + +score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores +cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 + +transfer_xvector_dir=exp/xvectors/$transfer_nnet_name +transfer_score_dir=exp/scores/$transfer_nnet_name +transfer_cal_file=$transfer_score_dir/cosine_cal_v1/cal_tel.h5 + +#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) +thr005=2.94 +thr001=4.60 +thr0001=6.90 +declare -a score_array +declare -a stats_array + +if [ $stage -le 1 ];then + + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + +if [ $stage -le 2 ];then + + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_minimal_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGSM minimal attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps
$eps --attack.minimal" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_minimal_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + + +if [ $stage -le 3 ];then + + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGM L1 attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 1" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + +if [ $stage -le 4 ];then + + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_minimal_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGM minimal L1 attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal --attack.norm 1" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + 
$transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_minimal_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + + +if [ $stage -le 5 ];then + + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGM L2 attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 2" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + +if [ $stage -le 6 ];then + + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_minimal_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGM minimal L2 attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal --attack.norm 2" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls
$score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_minimal_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + +if [ $stage -le 7 ];then + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_iterfgsm_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with iter FGSM attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type bim --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_iterfgsm_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + + +if [ $stage -le 8 ];then + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdlinf_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with PGD Linf attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ 
"${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdlinf_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + +if [ $stage -le 9 ];then + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl1_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with PGD L1 attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 1" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl1_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + +if [ $stage -le 10 ];then + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl2_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with PGD L2 attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 2" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl2_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + 
$trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + +if [ $stage -le 11 ];then + + for confidence in 0 #1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_cwl2_conf${confidence} + echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_plda_dir + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ + $score_analysis_dir/voxceleb1 & + fi + + done + +fi + + +if [ $stage -le 12 ];then + + for confidence in 0 #1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_cwlinf_conf${confidence} + echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner LInf attack confidence=$confidence" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 40 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type cw-linf --attack.confidence $confidence --attack.eps 0.3" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_plda_dir + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ + $score_analysis_dir/voxceleb1 & + fi + + done + +fi + +wait + diff --git a/egs/voxceleb/adv.v1.1/run_030_extract_xvectors_victim_model.sh b/egs/voxceleb/adv.v1.1/run_030_extract_xvectors_victim_model.sh deleted file mode 100755 index ff068c1b..00000000 --- a/egs/voxceleb/adv.v1.1/run_030_extract_xvectors_victim_model.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. 
./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -xvec_chunk_length=12800 -ft=0 - -. parse_options.sh || exit 1; -. $config_file - -if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu true --chunk-length $xvec_chunk_length" - xvec_cmd="$cuda_eval_cmd --mem 4G" -else - xvec_cmd="$train_cmd --mem 12G" -fi - -if [ $ft -eq 1 ];then - nnet_name=$advft_nnet_name - nnet=$advft_nnet -fi - -xvector_dir=exp/xvectors/$nnet_name - -if [ $stage -le 1 ]; then - # Extracts x-vectors for evaluation - for name in voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 100 ? $num_spk:100)) - steps_xvec/extract_xvectors_from_wav.sh --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ - --feat-config $feat_config \ - $nnet data/$name \ - $xvector_dir/$name - done -fi - -exit diff --git a/egs/voxceleb/adv.v1.1/run_031_extract_xvectors_transfer_model.sh b/egs/voxceleb/adv.v1.1/run_031_extract_xvectors_transfer_model.sh deleted file mode 100755 index df29fc12..00000000 --- a/egs/voxceleb/adv.v1.1/run_031_extract_xvectors_transfer_model.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -xvec_chunk_length=12800 -. parse_options.sh || exit 1; -. $config_file - -if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu true --chunk-length $xvec_chunk_length" - xvec_cmd="$cuda_eval_cmd --mem 4G" -else - xvec_cmd="$train_cmd --mem 12G" -fi - -feat_config=$transfer_feat_config -nnet_name=$transfer_nnet_name -nnet=$transfer_nnet - -xvector_dir=exp/xvectors/$nnet_name - -if [ $stage -le 1 ]; then - # Extracts x-vectors for evaluation - for name in voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 100 ? $num_spk:100)) - steps_xvec/extract_xvectors_from_wav.sh --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ - --feat-config $feat_config \ - $nnet data/$name \ - $xvector_dir/$name - done -fi - -exit diff --git a/egs/voxceleb/adv.v1.1/run_045_eval_whitebox_attacks_with_randsmooth_defense.sh b/egs/voxceleb/adv.v1.1/run_045_eval_whitebox_attacks_with_randsmooth_defense.sh deleted file mode 100755 index 3077ecf6..00000000 --- a/egs/voxceleb/adv.v1.1/run_045_eval_whitebox_attacks_with_randsmooth_defense.sh +++ /dev/null @@ -1,556 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -do_analysis=false -save_wav=false -use_trials_subset=false -sigmas="0.001 0.01" -max_test_length="" - -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -if [ -n "$max_test_length" ];then - eval_args="${eval_args} --max-test-length $max_test_length" -fi - -if [ "$use_trials_subset" == "true" ];then - condition=o_clean_1000_1000 -else - condition=o_clean -fi -trial_list=data/voxceleb1_test/trials_$condition - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name - -score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 - -#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -thr005=2.94 -thr001=4.60 -thr0001=6.90 -declare -a score_array -declare -a stats_array - -if [ $stage -le 1 ];then - - for sigma in $sigmas - do - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/cosine_fgsm_e${eps}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgsm --attack.eps $eps" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_fgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - done -fi - - -if [ $stage -le 2 ];then - - for sigma in $sigmas - do - score_array=() - stats_array=() - for snr in 30 20 10 0 - do - score_plda_dir=$score_dir/cosine_fgsm_snr${snr}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack snr=$snr" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type snr-fgsm --attack.snr $snr" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_fgsm_snrall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - done -fi - - -if [ $stage -le 3 ];then - for 
sigma in $sigmas - do - - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_randfgsm_e${eps}_a${alpha}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with Rand-FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type rand-fgsm --attack.eps $eps --attack.alpha $alpha" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_randfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - done -fi - - -if [ $stage -le 4 ];then - for sigma in $sigmas - do - - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_iterfgsm_e${eps}_a${alpha}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with Iterative FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type iter-fgsm --attack.eps $eps --attack.alpha $alpha" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav --smooth-sigma $sigma \ - --cal-file $cal_file --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_iterfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - done -fi - - -if [ $stage -le 5 ];then - - for sigma in $sigmas - do - - for confidence in 0 #1 - do - for lr in 0.001 - do - for it in 10 - do - - score_plda_dir=$score_dir/cosine_cwl2_conf${confidence}_lr${lr}_noabort_it${it}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence lr=$lr num-its=$it" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 100 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort 
--attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - done - done - done - done -fi - - -if [ $stage -le 6 ];then - for sigma in $sigmas - do - - for confidence in 0 #1 - do - for lr in 0.001 - do - for it in 10 - do - score_plda_dir=$score_dir/cosine_cwrms_conf${confidence}_lr${lr}_noabort_it${it}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner RMS attack confidence=$confidence lr=$lr num_its=$it" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd -tc 15" $eval_args --nj 100 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.norm-time --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - done - done - done -fi - - -if [ $stage -le 7 ];then - for sigma in $sigmas - do - - for confidence in 0 #1 - do - for lr in 0.001 - do - for it in 10 - do - score_plda_dir=$score_dir/cosine_cwsnr_conf${confidence}_lr${lr}_noabort_it${it}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner SNR attack confidence=$confidence lr=$lr num_its=$it" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd -tc 15" $eval_args --nj 100 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.use-snr --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - 
echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - done - done - done -fi - - -exit - - -# #!/bin/bash -# # Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# # -# # Apache 2.0. -# # -# . ./cmd.sh -# . ./path.sh -# set -e - -# stage=1 -# config_file=default_config.sh -# use_gpu=false -# do_analysis=false -# save_wav=false - -# . parse_options.sh || exit 1; -# . $config_file -# . datapath.sh - -# if [ "$use_gpu" == "true" ];then -# eval_args="--use-gpu true" -# eval_cmd="$cuda_eval_cmd" -# else -# eval_cmd="$train_cmd" -# fi - -# xvector_dir=exp/xvectors/$nnet_name -# score_dir=exp/scores/$nnet_name - -# score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -# cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 - -# #thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -# thr005=2.94 -# thr001=4.60 -# thr0001=6.90 -# declare -a score_array -# declare -a stats_array - -# if [ $stage -le 1 ];then - -# for sigma in 0.001 0.01 -# do -# score_array=() -# stats_array=() -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# score_plda_dir=$score_dir/cosine_fgsm_e${eps}_randsmooth${sigma} -# echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type fgsm --eps $eps \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/cosine_fgsm_eall_randsmooth$sigma -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi -# done - -# fi - - - - -# if [ $stage -le 3 ];then -# for sigma in 0.001 0.01 -# do -# score_array=() -# stats_array=() -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/cosine_randfgsm_e${eps}_a${alpha}_randsmooth$sigma -# echo "Eval Voxceleb 1 with Cosine scoring with Rand-FGSM attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type rand-fgsm --eps $eps --alpha $alpha --smooth-sigma $sigma\ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# 
score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done - -# done - -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/cosine_randfgsm_eall_randsmooth$sigma -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi -# done -# fi - - -# if [ $stage -le 4 ];then -# for sigma in 0.001 0.01 -# do -# score_array=() -# stats_array=() -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/cosine_iterfgsm_e${eps}_a${alpha}_randsmooth$sigma -# echo "Eval Voxceleb 1 with Cosine scoring with Iterative FGSM attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type iter-fgsm --eps $eps --alpha $alpha \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done - -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/cosine_iterfgsm_eall_randsmooth$sigma -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi -# done -# fi - -# wait diff --git a/egs/voxceleb/adv.v1.1/run_053_eval_art_whitebox_attacks.sh b/egs/voxceleb/adv.v1.1/run_053_eval_art_whitebox_attacks.sh deleted file mode 100755 index 92fbcc92..00000000 --- a/egs/voxceleb/adv.v1.1/run_053_eval_art_whitebox_attacks.sh +++ /dev/null @@ -1,1078 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -do_analysis=false -save_wav=false -use_trials_subset=false - -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -if [ "$use_trials_subset" == "true" ];then - condition=o_clean_1000_1000 -else - condition=o_clean -fi -trial_list=data/voxceleb1_test/trials_$condition - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name - -score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 - -#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -thr005=2.94 -thr001=4.60 -thr0001=6.90 - -declare -a score_array -declare -a stats_array - -if [ $stage -le 1 ];then - - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/cosine_art_fgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 2 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_fgsm_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM minimal attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.eps-step $alpha --attack.minimal" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgsm_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - -if [ $stage -le 3 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - 
score_plda_dir=$score_dir/cosine_art_fgml1_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM-L1 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgml1_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 4 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_fgml1_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM-L1 minimal attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.eps-step $alpha --attack.minimal --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgml1_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 5 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/cosine_art_fgml2_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM-L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh 
data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgml2_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 6 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_fgml2_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM-L2 minimal attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.eps-step $alpha --attack.minimal --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgml2_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 7 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_iterfgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with IterFGM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type bim --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_iterfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 8 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print 
$0/5.}') - score_plda_dir=$score_dir/cosine_art_pgdlinf_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD Linf attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_pgdlinf_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - - -if [ $stage -le 9 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_pgdl1_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD L1 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_pgdl1_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 10 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_pgdl2_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - 
- $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \
- local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir
-
- for f in $(ls $score_plda_dir/*_results);
- do
- echo $f
- cat $f
- echo ""
- done
-
- score_array+=($score_plda_dir/voxceleb1_scores)
- stats_array+=($score_plda_dir/voxceleb1_stats)
-
- done
- if [ "${do_analysis}" == "true" ];then
- score_analysis_dir=$score_dir/cosine_art_pgdl2_eall
- local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \
- $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \
- $score_analysis_dir/voxceleb1 &
- fi
-fi
-
-if [ $stage -le 11 ];then
-
- for confidence in 0 #1
- do
- score_plda_dir=$score_dir/cosine_art_cwl2_conf${confidence}
- echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence"
- steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \
- --feat-config $feat_config \
- --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence" \
- --save-wav $save_wav --save-wav-path $score_plda_dir/wav \
- --cal-file $cal_file --threshold $thr005 \
- $trial_list \
- data/voxceleb1_test/utt2model \
- data/voxceleb1_test \
- $xvector_dir/voxceleb1_test/xvector.scp \
- $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats
-
- $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \
- local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir
-
- for f in $(ls $score_plda_dir/*_results);
- do
- echo $f
- cat $f
- echo ""
- done
- if [ "${do_analysis}" == "true" ];then
- score_analysis_dir=$score_plda_dir
- local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \
- $trial_list $score_clean \
- $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \
- $score_analysis_dir/voxceleb1 &
- fi
-
- done
-
-fi
-
-
-if [ $stage -le 12 ];then
-
- for confidence in 0 #1
- do
- score_plda_dir=$score_dir/cosine_art_cwlinf_conf${confidence}
- echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner Linf attack confidence=$confidence"
- steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \
- --feat-config $feat_config \
- --attack-opts "--attack.attack-type cw-linf --attack.confidence $confidence --attack.eps 0.3" \
- --save-wav $save_wav --save-wav-path $score_plda_dir/wav \
- --cal-file $cal_file --threshold $thr005 \
- $trial_list \
- data/voxceleb1_test/utt2model \
- data/voxceleb1_test \
- $xvector_dir/voxceleb1_test/xvector.scp \
- $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats
-
- $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \
- local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir
-
- for f in $(ls $score_plda_dir/*_results);
- do
- echo $f
- cat $f
- echo ""
- done
- if [ "${do_analysis}" == "true" ];then
- score_analysis_dir=$score_plda_dir
- local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \
- $trial_list $score_clean \
- $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \
- $score_analysis_dir/voxceleb1 &
- fi
-
- done
-
-fi
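The `--attack.*` options in the stages above configure gradient-based evasion attacks in the style of the Adversarial Robustness Toolbox (ART), which the `*_art_*` scripts appear to wrap. As a rough map from options to computation, here is a minimal PyTorch sketch of the `fgm`/`bim`/`pgd` family for one verification trial; `embed_fn`, `enroll_emb`, and `threshold` are hypothetical stand-ins (not the hyperion or ART API) for the victim x-vector extractor, the enrollment embedding, and the calibrated decision threshold (`thr005` above). Note how `--attack.eps-step` (set to `eps/5` by the `awk` snippet in the loops), `--attack.max-iter 10`, and `--attack.minimal` map onto the loop:

```python
import torch
import torch.nn.functional as F

def fgm_attack(wav, embed_fn, enroll_emb, threshold,
               eps=0.001, eps_step=0.001, max_iter=1,
               is_target_trial=True, minimal=False):
    """Sketch of fgm (max_iter=1) / bim / pgd (max_iter>1) on one trial.
    Only the default L-inf norm is shown; --attack.norm 1/2 would replace
    the sign() step by an L1/L2-normalized gradient step."""
    # Target trials are pushed below the threshold (a miss); non-target
    # trials are pushed above it (a false alarm).
    sign = -1.0 if is_target_trial else 1.0
    adv = wav.clone()
    for _ in range(max_iter):
        adv = adv.detach().requires_grad_(True)
        score = F.cosine_similarity(embed_fn(adv), enroll_emb, dim=-1)
        if minimal and (score.item() < threshold) == is_target_trial:
            break  # --attack.minimal: stop at the first successful perturbation
        (sign * score).backward()
        with torch.no_grad():
            adv = adv + eps_step * adv.grad.sign()    # gradient-sign step
            adv = wav + (adv - wav).clamp(-eps, eps)  # project into the eps-ball
            adv = adv.clamp(-1.0, 1.0)                # keep a valid waveform
    return adv.detach()
```

Sweeping `eps` over 1e-5 to 1e-1, as each stage does, traces the trade-off between perturbation size and attack success that `local/attack_analysis.sh` later summarizes against the clean scores.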
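Stages 11 and 12 swap the gradient-sign family for Carlini-Wagner optimization. Below is a compressed sketch of the `cw-l2` objective, reusing the hypothetical stand-ins from the previous sketch; the recipe itself relies on ART's implementation, which additionally tunes the trade-off constant rather than fixing it:

```python
import torch
import torch.nn.functional as F

def cw_l2_attack(wav, embed_fn, enroll_emb, threshold,
                 confidence=0.0, trade_off=1.0, lr=1e-3, steps=100):
    """Compressed sketch of cw-l2 on a target trial: minimize
    ||delta||_2^2 + c * max(score(wav + delta) - threshold + confidence, 0).
    The margin term vanishes once the score sits at least `confidence`
    below the threshold, so a larger --attack.confidence buys a more
    robust, but larger (more audible), perturbation."""
    delta = torch.zeros_like(wav, requires_grad=True)
    opt = torch.optim.Adam([delta], lr=lr)
    for _ in range(steps):
        adv = (wav + delta).clamp(-1.0, 1.0)
        score = F.cosine_similarity(embed_fn(adv), enroll_emb, dim=-1)
        margin = (score - threshold + confidence).clamp(min=0.0)
        loss = delta.pow(2).sum() + trade_off * margin
        opt.zero_grad()
        loss.backward()
        opt.step()
    return (wav + delta).detach().clamp(-1.0, 1.0)
```

For `cw-linf` (stage 12) the L2 penalty is replaced by an L-inf budget, capped there at `--attack.eps 0.3`.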
diff --git a/egs/voxceleb/adv.v1.1/run_054_eval_art_transfer_blackbox_attacks.sh b/egs/voxceleb/adv.v1.1/run_054_eval_art_transfer_blackbox_attacks.sh
deleted file mode 100755
index bdcdeae4..00000000
--- a/egs/voxceleb/adv.v1.1/run_054_eval_art_transfer_blackbox_attacks.sh
+++ /dev/null
@@ -1,1260 +0,0 @@
-#!/bin/bash
-# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
-#
-# Apache 2.0.
-#
-. ./cmd.sh
-. ./path.sh
-set -e
-
-stage=1
-config_file=default_config.sh
-use_gpu=false
-do_analysis=false
-save_wav=false
-use_trials_subset=false
-
-. parse_options.sh || exit 1;
-. $config_file
-. datapath.sh
-
-transfer_feat_config=$feat_config
-
-if [ "$use_gpu" == "true" ];then
- eval_args="--use-gpu true"
- eval_cmd="$cuda_eval_cmd"
-else
- eval_cmd="$train_cmd"
-fi
-
-if [ "$use_trials_subset" == "true" ];then
- condition=o_clean_1000_1000
-else
- condition=o_clean
-fi
-trial_list=data/voxceleb1_test/trials_$condition
-
-xvector_dir=exp/xvectors/$nnet_name
-score_dir=exp/scores/$nnet_name
-
-score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores
-cal_file=$score_dir/cosine_cal_v1/cal_tel.h5
-
-transfer_xvector_dir=exp/xvectors/$transfer_nnet_name
-transfer_score_dir=exp/scores/$transfer_nnet_name
-transfer_cal_file=$transfer_score_dir/cosine_cal_v1/cal_tel.h5
-
-#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90)
-thr005=2.94
-thr001=4.60
-thr0001=6.90
-declare -a score_array
-declare -a stats_array
-
-if [ $stage -le 1 ];then
-
- score_array=()
- stats_array=()
-
- for eps in 0.00001 0.0001 0.001 0.01 0.1
- do
- score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_e${eps}
- echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps"
- steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \
- --feat-config $feat_config \
- --transfer-feat-config $transfer_feat_config \
- --attack-opts "--attack.attack-type fgm --attack.eps $eps" \
- --save-wav $save_wav --save-wav-path $score_plda_dir/wav \
- --cal-file $cal_file --transfer-cal-file $transfer_cal_file \
- --threshold $thr005 \
- $trial_list \
- data/voxceleb1_test/utt2model \
- data/voxceleb1_test \
- $xvector_dir/voxceleb1_test/xvector.scp \
- $nnet \
- $transfer_xvector_dir/voxceleb1_test/xvector.scp \
- $transfer_nnet \
- $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats
-
- $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \
- local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir
-
- for f in $(ls $score_plda_dir/*_results);
- do
- echo $f
- cat $f
- echo ""
- done
-
- score_array+=($score_plda_dir/voxceleb1_scores)
- stats_array+=($score_plda_dir/voxceleb1_stats)
-
- done
- if [ "${do_analysis}" == "true" ];then
- score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_eall
- local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \
- $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \
- $score_analysis_dir/voxceleb1 &
- fi
-
-fi
-
-
-if [ $stage -le 2 ];then
-
- score_array=()
- stats_array=()
-
- for eps in 0.00001 0.0001 0.001 0.01 0.1
- do
- score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_minimal_e${eps}
- echo "Eval Voxceleb 1 with Cosine scoring with FGSM minimal attack eps=$eps"
steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - - -if [ $stage -le 3 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM L1 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 4 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM minimal L1 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav 
\ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - - -if [ $stage -le 5 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 6 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring FGM minimal L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - 
$score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 7 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_iterfgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with iter FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type bim --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_iterfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - - -if [ $stage -le 8 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdlinf_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD Linf attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test 
$condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdlinf_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 9 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl1_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD L1 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl1_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 10 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl2_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - 
stats_array+=($score_plda_dir/voxceleb1_stats)
-
- done
- if [ "${do_analysis}" == "true" ];then
- score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl2_eall
- local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \
- $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \
- $score_analysis_dir/voxceleb1 &
- fi
-
-fi
-
-
-if [ $stage -le 11 ];then
-
- for confidence in 0 #1
- do
- score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_cwl2_conf${confidence}
- echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence"
- steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \
- --feat-config $feat_config \
- --transfer-feat-config $transfer_feat_config \
- --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence" \
- --save-wav $save_wav --save-wav-path $score_plda_dir/wav \
- --cal-file $cal_file --transfer-cal-file $transfer_cal_file \
- --threshold $thr005 \
- $trial_list \
- data/voxceleb1_test/utt2model \
- data/voxceleb1_test \
- $xvector_dir/voxceleb1_test/xvector.scp \
- $nnet \
- $transfer_xvector_dir/voxceleb1_test/xvector.scp \
- $transfer_nnet \
- $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats
-
- $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \
- local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir
-
- for f in $(ls $score_plda_dir/*_results);
- do
- echo $f
- cat $f
- echo ""
- done
- if [ "${do_analysis}" == "true" ];then
- score_analysis_dir=$score_plda_dir
- local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \
- $trial_list $score_clean \
- $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \
- $score_analysis_dir/voxceleb1 &
- fi
-
- done
-
-fi
-
-
-if [ $stage -le 12 ];then
-
- for confidence in 0 #1
- do
- score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_cwlinf_conf${confidence}
- echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner Linf attack confidence=$confidence"
- steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 40 \
- --feat-config $feat_config \
- --transfer-feat-config $transfer_feat_config \
- --attack-opts "--attack.attack-type cw-linf --attack.confidence $confidence --attack.eps 0.3" \
- --save-wav $save_wav --save-wav-path $score_plda_dir/wav \
- --cal-file $cal_file --transfer-cal-file $transfer_cal_file \
- --threshold $thr005 \
- $trial_list \
- data/voxceleb1_test/utt2model \
- data/voxceleb1_test \
- $xvector_dir/voxceleb1_test/xvector.scp \
- $nnet \
- $transfer_xvector_dir/voxceleb1_test/xvector.scp \
- $transfer_nnet \
- $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats
-
- $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \
- local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir
-
- for f in $(ls $score_plda_dir/*_results);
- do
- echo $f
- cat $f
- echo ""
- done
- if [ "${do_analysis}" == "true" ];then
- score_analysis_dir=$score_plda_dir
- local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \
- $trial_list $score_clean \
- $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \
- $score_analysis_dir/voxceleb1 &
- fi
-
- done
-
-fi
-
-wait
-
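What distinguishes run_054 from the white-box evaluation above is the threat model: perturbations are crafted with gradients of the surrogate `$transfer_nnet`, but trials are scored with the victim `$nnet`, each with its own features and calibration. In terms of the earlier sketches (same hypothetical stand-ins, with `fgm_attack` as defined after the white-box script), the per-trial protocol is roughly:

```python
import torch
import torch.nn.functional as F

def transfer_blackbox_score(wav, surrogate_fn, victim_fn,
                            enroll_surr, enroll_vict, threshold,
                            **attack_opts):
    """Sketch of the transfer black-box protocol: craft the adversarial
    waveform on the surrogate model, then score it with the victim model,
    which the adversary never differentiates through. surrogate_fn and
    victim_fn are hypothetical waveform -> x-vector callables."""
    adv = fgm_attack(wav, surrogate_fn, enroll_surr, threshold, **attack_opts)
    with torch.no_grad():  # black-box: inference only on the victim
        return F.cosine_similarity(victim_fn(adv), enroll_vict, dim=-1)
```

The attack succeeds only insofar as the perturbation transfers between the two embedding spaces, which is what comparing these scores against the white-box results measures.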
$(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdlinf_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi - -# fi - - -# if [ $stage -le 9 ];then -# score_array=() -# stats_array=() - -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl1_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with PGD L1 attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat -# --attack-type pgd --attack-opt "--attack-eps $eps --attack-eps-step $alpha --attack-max-iter 10 --attack-norm 1" \ - -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl1_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi - -# fi - - -# if [ $stage -le 10 ];then -# score_array=() -# stats_array=() - -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl2_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with PGD L2 attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat -# --attack-type pgd --attack-opt "--attack-eps $eps --attack-eps-step $alpha --attack-max-iter 10 --attack-norm 2" \ - -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir 
- -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl2_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi - -# fi - - -# if [ $stage -le 11 ];then - -# for confidence in 0 #1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_cwl2_conf${confidence} -# echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence" -# steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat \ -# --attack-type cw-l2 --attack-opt "--attack-confidence $confidence" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_plda_dir -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ -# $score_analysis_dir/voxceleb1 & -# fi - -# done - -# fi - - -# if [ $stage -le 12 ];then - -# for confidence in 0 #1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_cwlinf_conf${confidence} -# echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner LInf attack confidence=$confidence" -# steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 40 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat \ -# --attack-type cw-linf --attack-opt "--attack-confidence $confidence --attack-eps 0.3" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_plda_dir -# local/attack_analysis.sh --cmd 
"$train_cmd --mem 10G" \ -# $trial_list $score_clean \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ -# $score_analysis_dir/voxceleb1 & -# fi - -# done - -# fi - -# wait - diff --git a/egs/voxceleb/adv.v1/README.md b/egs/voxceleb/adv.v1/README.md deleted file mode 100644 index cace8c2c..00000000 --- a/egs/voxceleb/adv.v1/README.md +++ /dev/null @@ -1,164 +0,0 @@ -# VoxCeleb Adversarial Attacks Version 1 - -Last update 2021/04/22 - -Recipe to evaluate Adversarial Attacks to x-Vector Speaker Verification Systems - -## Threat Model - -Speaker verification pipeline where: - - Enrollment side is not under attack, x-vectors for enrollment utterances are - pre-computed and storded on disk - - Test side is under Adversarial Attacks. - The attack adds an inperceptible perturbation to the - test waveform to make the system to: - - Classify target trials as non-targets - - Classify non-target trials as targets - -As attacks happen in waveform domain, test x-vectors cannot be precomputed and -need to be recomputed for each trial. -Also, the speaker verification pipeline needs to be fully differentiable from wave to score, -so the attack algorithm can optimize the perturbation noise. - -However, to train the x-vector network, this recipe computes acoustic features and speech augmentations off-line. -Look version adv.v1.1, for a newer recipe which computes features -and augmentations on the fly. - -Two broad types of attacks: - - White-box: the attacker has access to the x-vector model under attack - - Transfer based Black-box: the attacker doesn't have access to the x-vector model under attack (black-box model), - but has access to another x-vector model (white-box). Perturvation is obtained from the white-box model - and used to attack the black-box model. - -Multiple attacks algorithms: FGSM, Iter-FGSM, PGD, Carlini-Wagner. - -## Citing - - If you use this recipe, please cite: -``` -@inproceedings{Villalba2020, -address = {Shanghai, China}, -author = {Villalba, Jes{\'{u}}s and Zhang, Yuekai and Dehak, Najim}, -booktitle = {Interspeech 2020}, -month = {sep}, -title = {{x-Vectors Meet Adversarial Attacks : Benchmarking Adversarial Robustness in Speaker Verification}}, -year = {2020} -} -``` - -## Training Data - - - x-Vector network is trained on Voxceleb2 dev + test with augmentations - - MUSAN noise - - RIR reverberation - -## Test Data - - - Test data is VoxCeleb 1 Original Clean trial list. - - We don't use the larger Entire and Hard list because of the high computing cost - of these experiments. - -## Usage - - - Run the run_0*.sh scripts in sequence - - By default it will use ResNet34 as victim model and Residual E-TDNN as transfer model - - You can change that modifying the configuration script. - - For example, to use LResNet34 as transfer model use `config_victim_resnet34_transfer_lresnet.v1.sh` - when calling each of the steps as -```bash -run_0*.sh --config-file global_conf/config_victim_resnet34_transfer_lresnet.v1.sh -``` - -## Recipe Steps: - - - `run_001_prepare_data.sh` - - Data preparation script to generate Kaldi style data directories for - - VoxCeleb2 train+test - - VoxCeleb1 Original eval sets - - - `run_002_compute_evad.sh` - - Computes Energy VAD for all datasets - - - `run_003_compute_fbank.sh` - - Computes log-filter-banks acoustic features for all datasets - - - `run_004_prepare_augment.sh` - - Prepares Kaldi style data directories for augmented training data with MUSAN noise and RIR reverberation. 
-
-## Citing
-
-  If you use this recipe, please cite:
-```
-@inproceedings{Villalba2020,
-address = {Shanghai, China},
-author = {Villalba, Jes{\'{u}}s and Zhang, Yuekai and Dehak, Najim},
-booktitle = {Interspeech 2020},
-month = {sep},
-title = {{x-Vectors Meet Adversarial Attacks: Benchmarking Adversarial Robustness in Speaker Verification}},
-year = {2020}
-}
-```
-
-## Training Data
-
-  - x-Vector network is trained on VoxCeleb2 dev + test with augmentations
-    - MUSAN noise
-    - RIR reverberation
-
-## Test Data
-
-  - Test data is the VoxCeleb 1 Original Clean trial list.
-  - We don't use the larger Entire and Hard lists because of the high computing cost
-    of these experiments.
-
-## Usage
-
-  - Run the run_0*.sh scripts in sequence.
-  - By default it will use ResNet34 as victim model and Residual E-TDNN as transfer model.
-  - You can change that by modifying the configuration script.
-  - For example, to use LResNet34 as transfer model, use `config_victim_resnet34_transfer_lresnet.v1.sh`
-    when calling each of the steps as
-```bash
-run_0*.sh --config-file global_conf/config_victim_resnet34_transfer_lresnet.v1.sh
-```
-
-## Recipe Steps:
-
- - `run_001_prepare_data.sh`
-   - Data preparation script to generate Kaldi style data directories for
-     - VoxCeleb2 train+test
-     - VoxCeleb1 Original eval sets
-
- - `run_002_compute_evad.sh`
-   - Computes Energy VAD for all datasets
-
- - `run_003_compute_fbank.sh`
-   - Computes log-filter-bank acoustic features for all datasets
-
- - `run_004_prepare_augment.sh`
-   - Prepares Kaldi style data directories for augmented training data with MUSAN noise and RIR reverberation.
-
- - `run_005_compute_fbank_augment.sh`
-   - Computes log-filter-banks for the augmented datasets
-
- - `run_010_prepare_victim_xvec_train_data.sh`
-   - Prepares features to train the victim x-vector model
-   - Applies short-time mean normalization and removes silence frames
-   - Removes utterances shorter than 4 secs and speakers with fewer than 8 utterances
-   - Creates training and validation lists for x-vector training
-
- - `run_011_train_victim_xvector.sh`
-   - Trains the victim x-vector network
-
- - `run_012_prepare_transfer_xvec_train_data.sh`
-   - Prepares features to train the transfer white-box x-vector model
-   - If the training data for the victim and transfer models is the same, it does nothing
-
- - `run_013_train_transfer_xvector.sh`
-   - Trains the transfer white-box x-vector network
-
- - `run_030_extract_xvectors_victim_model.sh`
-   - Extracts x-vectors for the VoxCeleb1 test set using the victim model
-
- - `run_031_extract_xvectors_transfer_model.sh`
-   - Extracts x-vectors for the VoxCeleb1 test set using the transfer model
-
- - `run_040_eval_be_victim_model.sh`
-   - Evals cosine scoring back-end without attack on victim model x-vectors
-   - Trains calibration for the victim model scores
-   - Results are left in `exp/scores/$nnet_name/cosine/voxceleb1_o_clean_results`
-
- - `run_041_eval_be_transfer_model.sh`
-   - Evals cosine scoring back-end without attack on transfer model x-vectors
-   - Trains calibration for the transfer model scores
-   - Results are left in `exp/scores/$transfer_nnet_name/cosine/voxceleb1_o_clean_results`
-
- - `run_042_eval_victim_from_wav.sh`
-   - Evals cosine scoring back-end without attack on victim model x-vectors
-     from the test wave, computing features and x-vectors on the fly.
-   - This script is just to check that we get the same result as in step 40.
-   - You don't need to run it.
-   - Results are left in `exp/scores/$nnet_name/cosine_from_wav/voxceleb1_o_clean_results`
-
- - `run_043_eval_whitebox_attacks.sh`
-   - Evals white-box attacks implemented in the Hyperion toolkit: FGSM, Iter-FGSM, PGD, Carlini-Wagner
-   - Results are left in `exp/scores/$nnet_name/cosine_${attack_related_label}/voxceleb1_o_clean_results`
-   - When using option `--do-analysis true`, it calculates curves: SNR vs EER, SNR vs actual DCF, Linf vs EER, Linf vs actual DCF
-   - Curves are left in `exp/scores/$nnet_name/cosine_${attack_related_label}_eall/`
-   - When using `--save-wav true`, it writes adversarial wavs of successful attacks to disk
-   - Wavs are saved to `exp/scores/$nnet_name/cosine_${attack_related_label}/wav`
-
- - `run_044_eval_transfer_blackbox_attacks.sh`
-   - Evals transfer black-box attacks implemented in the Hyperion toolkit: FGSM, Iter-FGSM, PGD, Carlini-Wagner
-   - Results are left in `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_${attack_related_label}/voxceleb1_o_clean_results`
-   - When using option `--do-analysis true`, it calculates curves: SNR vs EER, SNR vs actual DCF, Linf vs EER, Linf vs actual DCF
-   - Curves are left in `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_${attack_related_label}_eall/`
-   - When using `--save-wav true`, it writes adversarial wavs of successful attacks to disk
-   - Wavs are saved to `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_${attack_related_label}/wav`
-
- - `run_045_eval_whitebox_attacks_with_randsmooth_defense.sh`
-   - Evals white-box attacks with Gaussian randomized smoothing defense.
-   - Results are left in `exp/scores/$nnet_name/cosine_${attack_related_label}_randsmooth${smooth_sigma}/voxceleb1_o_clean_results`
-
- - `run_053_eval_art_whitebox_attacks.sh`
-   - Evals white-box attacks implemented in IBM's Adversarial Robustness Toolbox (ART): FGSM, Iter-FGSM, PGD, Carlini-Wagner
-   - Results are left in `exp/scores/$nnet_name/cosine_art_${attack_related_label}/voxceleb1_o_clean_results`
-   - When using option `--do-analysis true`, it calculates curves: SNR vs EER, SNR vs actual DCF, Linf vs EER, Linf vs actual DCF
-   - Curves are left in `exp/scores/$nnet_name/cosine_art_${attack_related_label}_eall/`
-   - When using `--save-wav true`, it writes adversarial wavs of successful attacks to disk
-   - Wavs are saved to `exp/scores/$nnet_name/cosine_art_${attack_related_label}/wav`
-
- - `run_054_eval_art_transfer_blackbox_attacks.sh`
-   - Evals transfer black-box attacks implemented in IBM's Adversarial Robustness Toolbox (ART): FGSM, Iter-FGSM, PGD, Carlini-Wagner
-   - Results are left in `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_art_${attack_related_label}/voxceleb1_o_clean_results`
-   - When using option `--do-analysis true`, it calculates curves: SNR vs EER, SNR vs actual DCF, Linf vs EER, Linf vs actual DCF
-   - Curves are left in `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_art_${attack_related_label}_eall/`
-   - When using `--save-wav true`, it writes adversarial wavs of successful attacks to disk
-   - Wavs are saved to `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_art_${attack_related_label}/wav`
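A note on the thresholds hard-coded by the attack evaluation scripts further below (`thr005=2.94`, `thr001=4.60`, `thr0001=6.90` for `p=(0.05, 0.01, 0.001)`): assuming calibrated log-likelihood-ratio scores and equal error costs (our reading; the scripts only state the pairs), these follow from the standard Bayes decision threshold:

```python
# Bayes decision threshold for calibrated LLR scores with effective
# target prior p and equal costs (an assumption; the recipe only
# lists the (p, thr) pairs).
import math

for p in (0.05, 0.01, 0.001):
    thr = math.log((1.0 - p) / p)
    print(f"p={p}: thr={thr:.2f}")
# p=0.05:  thr=2.94
# p=0.01:  thr=4.60
# p=0.001: thr=6.91  (the scripts round this one to 6.90)
```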
- -if [ "$(hostname -d)" == "cm.gemini" ];then - #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" - export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" - export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" - export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" -else - export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\"" - export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\"" - export cuda_eval_cmd="$train_cmd" -fi - - - diff --git a/egs/voxceleb/adv.v1/conf b/egs/voxceleb/adv.v1/conf deleted file mode 120000 index 7dfe9dce..00000000 --- a/egs/voxceleb/adv.v1/conf +++ /dev/null @@ -1 +0,0 @@ -../../sre19-cmn2/v1/conf \ No newline at end of file diff --git a/egs/voxceleb/adv.v1/datapath.sh b/egs/voxceleb/adv.v1/datapath.sh deleted file mode 100644 index 6d48a66d..00000000 --- a/egs/voxceleb/adv.v1/datapath.sh +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Paths to the databases used in the experiment - -#paths to databases - -if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then - voxceleb1_root=/export/corpora5/VoxCeleb1_v1 - voxceleb2_root=/export/corpora5/VoxCeleb2 - musan_root=/export/corpora5/JHU/musan -elif [ "$(hostname --domain)" == "cm.gemini" ];then - voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 - voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 - musan_root=/expscratch/dgromero/corpora-open/musan -else - echo "Put your database paths here" - exit 1 -fi - - diff --git a/egs/voxceleb/adv.v1/default_config.sh b/egs/voxceleb/adv.v1/default_config.sh deleted file mode 120000 index c91ded65..00000000 --- a/egs/voxceleb/adv.v1/default_config.sh +++ /dev/null @@ -1 +0,0 @@ -global_conf/config_victim_resnet34_transfer_resetdnn.v1.sh \ No newline at end of file diff --git a/egs/voxceleb/adv.v1/global_conf/config_victim_lresnet34_transfer_resetdnn.v1.sh b/egs/voxceleb/adv.v1/global_conf/config_victim_lresnet34_transfer_resetdnn.v1.sh deleted file mode 100644 index 172da763..00000000 --- a/egs/voxceleb/adv.v1/global_conf/config_victim_lresnet34_transfer_resetdnn.v1.sh +++ /dev/null @@ -1,66 +0,0 @@ -# Victim model Light ResNet34 x-vector -# For the black-box attacks we use Residual E-TDNN to generate the attack and transfer them to the ResNet34 -# Both models uses the same features: 80 fbanks -# Both models uses the same training data. 
-
-# victim x-vector training
-nnet_data=voxceleb2cat_train_combined
-
-batch_size_1gpu=32
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.05
-
-nnet_type=lresnet34
-dropout=0
-embed_dim=256
-
-s=30
-margin_warmup=20
-margin=0.3
-
-nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool"
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-nnet_name=${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1
-nnet_num_epochs=70
-num_augs=5
-nnet_dir=exp/xvector_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0070.pth
-
-
-# transfer model training
-transfer_nnet_data=voxceleb2cat_train_combined #this can be voxceleb2cat or voxceleb2cat_combined
-
-transfer_batch_size_1gpu=128
-transfer_eff_batch_size=512 # effective batch size
-transfer_min_chunk=400
-transfer_max_chunk=400
-transfer_ipe=1
-transfer_lr=0.05
-
-transfer_nnet_type=resetdnn
-transfer_num_layers=5
-transfer_layer_dim=512
-transfer_expand_dim=1536
-transfer_dilation="1 2 3 4 1"
-transfer_kernel_sizes="5 3 3 3 1"
-transfer_dropout=0.1
-transfer_embed_dim=256
-
-transfer_s=30
-transfer_margin_warmup=20
-transfer_margin=0.3
-
-transfer_nnet_opt="--tdnn-type $transfer_nnet_type --in-feats 80 --num-enc-blocks $transfer_num_layers --enc-hid-units $transfer_layer_dim --enc-expand-units $transfer_expand_dim --kernel-size $transfer_kernel_sizes --dilation $transfer_dilation"
-transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp"
-transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-transfer_nnet_name=${transfer_nnet_type}_nl${transfer_num_layers}ld${transfer_layer_dim}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1
-transfer_nnet_num_epochs=70
-
-transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name
-transfer_nnet=$transfer_nnet_dir/model_ep0070.pth
-
-
diff --git a/egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_lresnet.v1.sh b/egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_lresnet.v1.sh
deleted file mode 100644
index 97f4283e..00000000
--- a/egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_lresnet.v1.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-# Victim model ResNet34 x-vector
-# For the black-box attacks we use Light ResNet34 to generate the attacks and transfer them to the ResNet34
-# Both models use the same features: 80 fbanks
-# Both models use the same training data.
-
-# victim x-vector training
-nnet_data=voxceleb2cat_train_combined
-
-batch_size_1gpu=32
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.05
-
-nnet_type=resnet34
-dropout=0
-embed_dim=256
-
-s=30
-margin_warmup=20
-margin=0.3
-
-nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool"
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-nnet_name=${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1
-nnet_num_epochs=70
-num_augs=5
-nnet_dir=exp/xvector_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0070.pth
-
-
-# transfer model training
-transfer_nnet_data=voxceleb2cat_train_combined #this can be voxceleb2cat or voxceleb2cat_combined
-
-transfer_batch_size_1gpu=128
-transfer_eff_batch_size=512 # effective batch size
-transfer_min_chunk=400
-transfer_max_chunk=400
-transfer_ipe=1
-transfer_lr=0.05
-
-transfer_nnet_type=lresnet34
-transfer_dropout=0
-transfer_embed_dim=256
-
-transfer_s=30
-transfer_margin_warmup=20
-transfer_margin=0.3
-
-transfer_nnet_opt="--resnet-type $transfer_nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool"
-transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp"
-transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-transfer_nnet_name=${transfer_nnet_type}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1
-transfer_nnet_num_epochs=70
-
-transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name
-transfer_nnet=$transfer_nnet_dir/model_ep0070.pth
-
-
diff --git a/egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_resetdnn.v1.sh b/egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_resetdnn.v1.sh
deleted file mode 100644
index 407c0cfd..00000000
--- a/egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_resetdnn.v1.sh
+++ /dev/null
@@ -1,66 +0,0 @@
-# Victim model ResNet34 x-vector
-# For the black-box attacks we use Residual E-TDNN to generate the attacks and transfer them to the ResNet34
-# Both models use the same features: 80 fbanks
-# Both models use the same training data.
- -# victim x-vector training -nnet_data=voxceleb2cat_train_combined - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.05 - -nnet_type=resnet34 -dropout=0 -embed_dim=256 - -s=30 -margin_warmup=20 -margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 -num_augs=5 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth - - -# transfer model training -transfer_nnet_data=voxceleb2cat_train_combined #this can be voxceleb2cat or voxceleb2cat_combined - -transfer_batch_size_1gpu=128 -transfer_eff_batch_size=512 # effective batch size -transfer_min_chunk=400 -transfer_max_chunk=400 -transfer_ipe=1 -transfer_lr=0.05 - -transfer_nnet_type=resetdnn -transfer_num_layers=5 -transfer_layer_dim=512 -transfer_expand_dim=1536 -transfer_dilation="1 2 3 4 1" -transfer_kernel_sizes="5 3 3 3 1" -transfer_dropout=0.1 -transfer_embed_dim=256 - -transfer_s=30 -transfer_margin_warmup=20 -transfer_margin=0.3 - -transfer_nnet_opt="--tdnn-type $transfer_nnet_type --in-feats 80 --num-enc-blocks $transfer_num_layers --enc-hid-units $transfer_layer_dim --enc-expand-units $transfer_expand_dim --kernel-size $transfer_kernel_sizes --dilation $transfer_dilation" -transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -transfer_nnet_name=${transfer_nnet_type}_nl${transfer_num_layers}ld${transfer_layer_dim}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1 -transfer_nnet_num_epochs=70 - -transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name -transfer_nnet=$transfer_nnet_dir/model_ep0070.pth - - diff --git a/egs/voxceleb/adv.v1/local b/egs/voxceleb/adv.v1/local deleted file mode 120000 index ce1cbf90..00000000 --- a/egs/voxceleb/adv.v1/local +++ /dev/null @@ -1 +0,0 @@ -../v1/local \ No newline at end of file diff --git a/egs/voxceleb/adv.v1/path.sh b/egs/voxceleb/adv.v1/path.sh deleted file mode 100755 index 42bfa7be..00000000 --- a/egs/voxceleb/adv.v1/path.sh +++ /dev/null @@ -1,6 +0,0 @@ - -export HYP_ROOT=$(readlink -f `pwd -P`/../../..) -export TOOLS_ROOT=$HYP_ROOT/tools - -. $TOOLS_ROOT/path.sh -HYP_ART_ENV=$HYP_ENV diff --git a/egs/voxceleb/adv.v1/run_001_prepare_data.sh b/egs/voxceleb/adv.v1/run_001_prepare_data.sh deleted file mode 100755 index 8af0f353..00000000 --- a/egs/voxceleb/adv.v1/run_001_prepare_data.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. 
datapath.sh
-
-
-if [ $stage -le 1 ];then
-
-    # Prepare the VoxCeleb2 dataset for training.
-    local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train
-fi
-
-if [ $stage -le 2 ];then
-    # prepare voxceleb1 for test
-    local/make_voxceleb1_o.pl $voxceleb1_root data
-fi
diff --git a/egs/voxceleb/adv.v1/run_002_compute_evad.sh b/egs/voxceleb/adv.v1/run_002_compute_evad.sh
deleted file mode 100755
index 98b0db7d..00000000
--- a/egs/voxceleb/adv.v1/run_002_compute_evad.sh
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/bin/bash
-# Copyright
-#       2018   Johns Hopkins University (Author: Jesus Villalba)
-# Apache 2.0.
-#
-. ./cmd.sh
-. ./path.sh
-set -e
-nodes=fs01
-storage_name=$(date +'%m_%d_%H_%M')
-vaddir=`pwd`/exp/vad_e
-vad_config=conf/vad_16k.yaml
-
-stage=1
-config_file=default_config.sh
-
-. parse_options.sh || exit 1;
-. $config_file
-
-
-if [ $stage -le 1 ]; then
-    # Prepare to distribute data over multiple machines
-    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then
-	dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage
-	if [ "$nodes" == "b0" ];then
-	    utils/create_split_dir.pl \
-		/export/b{04,05,06,07}/$dir_name $vaddir/storage
-	elif [ "$nodes" == "b1" ];then
-	    utils/create_split_dir.pl \
-		/export/b{14,15,16,17}/$dir_name $vaddir/storage
-	elif [ "$nodes" == "c0" ];then
-	    utils/create_split_dir.pl \
-		/export/c{06,07,08,09}/$dir_name $vaddir/storage
-	elif [ "$nodes" == "fs01" ];then
-	    utils/create_split_dir.pl \
-		/export/fs01/$dir_name $vaddir/storage
-	else
-	    echo "we don't distribute data between multiple machines"
-	fi
-    fi
-fi
-
-#Train datasets
-if [ $stage -le 2 ];then
-    for name in voxceleb2cat_train voxceleb1_test
-    do
-	num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}')
-	nj=$(($num_spk < 40 ? $num_spk:40))
-	hyp_utils/feats/make_evad.sh --write-utt2num-frames true \
-	    --vad-config $vad_config --nj $nj --cmd "$train_cmd" \
-	    data/${name} exp/make_vad/$name $vaddir
-	utils/fix_data_dir.sh data/${name}
-    done
-fi
-
-
diff --git a/egs/voxceleb/adv.v1/run_003_compute_fbank.sh b/egs/voxceleb/adv.v1/run_003_compute_fbank.sh
deleted file mode 100755
index 7bd8b6a3..00000000
--- a/egs/voxceleb/adv.v1/run_003_compute_fbank.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/bin/bash
-# Copyright
-#       2018   Johns Hopkins University (Author: Jesus Villalba)
-# Apache 2.0.
-#
-. ./cmd.sh
-. ./path.sh
-set -e
-nodes=fs01
-storage_name=$(date +'%m_%d_%H_%M')
-fbankdir=`pwd`/exp/fbank
-vaddir=`pwd`/exp/fbank
-vaddir_gt=`pwd`/exp/vad_gt
-
-stage=1
-config_file=default_config.sh
-feat_vers="numpy"
-
-. parse_options.sh || exit 1;
-
-if [ "$feat_vers" == "kaldi" ];then
-    make_fbank=steps/make_fbank.sh
-    fbank_cfg=conf/fbank80_16k.conf
-else
-    fbank_cfg=conf/fbank80_16k.yaml
-    if [ "$feat_vers" == "numpy" ];then
-	make_fbank=steps_pyfe/make_fbank.sh
-    else
-	make_fbank=steps_pyfe/make_torch_fbank.sh
-    fi
-fi
-
-# Make filterbanks
-if [ $stage -le 1 ]; then
-    # Prepare to distribute data over multiple machines
-    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $fbankdir/storage ]; then
-	dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/fbank/storage
-	if [ "$nodes" == "b0" ];then
-	    utils/create_split_dir.pl \
-		/export/b{04,05,06,07}/$dir_name $fbankdir/storage
-	elif [ "$nodes" == "b1" ];then
-	    utils/create_split_dir.pl \
-		/export/b{14,15,16,17}/$dir_name $fbankdir/storage
-	elif [ "$nodes" == "c0" ];then
-	    utils/create_split_dir.pl \
-		/export/c{06,07,08,09}/$dir_name $fbankdir/storage
-	elif [ "$nodes" == "fs01" ];then
-	    utils/create_split_dir.pl \
-		/export/fs01/$dir_name $fbankdir/storage
-	else
-	    echo "we don't distribute data between multiple machines"
-	fi
-    fi
-fi
-
-#Train datasets
-if [ $stage -le 2 ];then
-    for name in voxceleb2cat_train voxceleb1_test
-    do
-	num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}')
-	nj=$(($num_spk < 40 ? $num_spk:40))
-	$make_fbank --write-utt2num-frames true --fbank-config $fbank_cfg --nj $nj --cmd "$train_cmd" \
-	    data/${name} exp/make_fbank/$name $fbankdir
-	utils/fix_data_dir.sh data/${name}
-    done
-fi
-
-
diff --git a/egs/voxceleb/adv.v1/run_004_prepare_augment.sh b/egs/voxceleb/adv.v1/run_004_prepare_augment.sh
deleted file mode 100755
index 7d78ae92..00000000
--- a/egs/voxceleb/adv.v1/run_004_prepare_augment.sh
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/bin/bash
-# Copyright
-#       2018   Johns Hopkins University (Author: Jesus Villalba)
-# Apache 2.0.
-#
-. ./cmd.sh
-. ./path.sh
-set -e
-
-stage=1
-config_file=default_config.sh
-. parse_options.sh || exit 1;
-. $config_file
-. datapath.sh
-
-# In this script, we augment the VoxCeleb data with reverberation,
-# noise, music, and babble, and combine it with the clean data.
-# The combined list will be used to train the x-vector DNN.
-
-frame_shift=0.01
-
-if [ $stage -le 1 ]; then
-
-    if [ ! -d "RIRS_NOISES" ]; then
-	if [ -d ../../sre19-cmn2/v1/RIRS_NOISES ];then
-	    ln -s ../../sre19-cmn2/v1/RIRS_NOISES
-	else
-	    # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
-	    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
-	    unzip rirs_noises.zip
-	fi
-    fi
-
-    # Prepare the MUSAN corpus, which consists of music, speech, and noise
-    # suitable for augmentation.
-    local/make_musan.sh $musan_root 16 data
-
-    # Get the duration of the MUSAN recordings. This will be used by the
-    # script augment_data_dir.py.
-    for name in speech noise music; do
-	utils/data/get_utt2dur.sh data/musan_${name}
-	mv data/musan_${name}/utt2dur data/musan_${name}/reco2dur
-    done
-
-fi
-
-
-if [ $stage -le 2 ]; then
-
-    for name in voxceleb2cat_train
-    do
-	export TMPDIR=data/tmp
-	mkdir -p $TMPDIR
-
-	awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/$name/utt2num_frames > data/$name/reco2dur
-
-	# Make a reverberated version of the list. Note that we don't add any
- - # Make a version with reverberated speech - rvb_opts=() - rvb_opts+=(--rir-set-parameters "0.2, RIRS_NOISES/real_rirs_isotropic_noises/rir_list") - rvb_opts+=(--rir-set-parameters "0.4, RIRS_NOISES/simulated_rirs/smallroom/rir_list") - rvb_opts+=(--rir-set-parameters "0.4, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") - - python steps/data/reverberate_data_dir.py \ - "${rvb_opts[@]}" \ - --speech-rvb-probability 1 \ - --pointsource-noise-addition-probability 0 \ - --isotropic-noise-addition-probability 0 \ - --num-replications 1 \ - --source-sampling-rate 16000 \ - data/${name} data/${name}_reverb - cp data/${name}/vad.scp data/${name}_reverb/ - utils/copy_data_dir.sh --utt-suffix "-reverb" data/${name}_reverb data/${name}_reverb.new - rm -rf data/${name}_reverb - mv data/${name}_reverb.new data/${name}_reverb - - - # Augment with musan_noise - python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0:13:8" --fg-noise-dir "data/musan_noise" data/${name} data/${name}_noise - # Augment with musan_music - python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/${name} data/${name}_music - # Augment with musan_speech - python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13:10" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/${name} data/${name}_babble - - - awk '{ $1=$1"-reverb"; print $0}' data/${name}/reco2dur > data/${name}_reverb/reco2dur - - # Augment with musan_noise - python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0:13:8" --fg-noise-dir "data/musan_noise" data/${name}_reverb data/${name}_reverb_noise - # Augment with musan_music - python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/${name}_reverb data/${name}_reverb_music - # Augment with musan_speech - python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13:10" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/${name}_reverb data/${name}_reverb_babble - - - # Combine noise only - utils/combine_data.sh data/${name}_noise_all \ - data/${name}_noise data/${name}_music data/${name}_babble - - # Combine reverbs - utils/combine_data.sh data/${name}_reverb_all data/${name}_reverb \ - data/${name}_reverb_noise data/${name}_reverb_music data/${name}_reverb_babble - - # Combine reverb, noise, music, and babble into one directory. - utils/combine_data.sh data/${name}_aug data/${name}_reverb_all data/${name}_noise_all - unset TMPDIR - done - -fi - - -if [ $stage -le 3 ];then - # Take a random subset of the augmentations - utils/subset_data_dir.sh data/voxceleb2cat_train_aug \ - $(wc -l data/voxceleb2cat_train/utt2spk | awk '{ print int('$num_augs'*$1)}') \ - data/voxceleb2cat_train_augx${num_augs} - utils/fix_data_dir.sh data/voxceleb2cat_train_augx${num_augs} -fi - - -exit diff --git a/egs/voxceleb/adv.v1/run_005_compute_fbank_augment.sh b/egs/voxceleb/adv.v1/run_005_compute_fbank_augment.sh deleted file mode 100755 index 10d13e03..00000000 --- a/egs/voxceleb/adv.v1/run_005_compute_fbank_augment.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e -fbankdir=`pwd`/exp/fbank - -stage=1 -config_file=default_config.sh -feat_vers="numpy" - -. parse_options.sh || exit 1; -. 
$config_file
-
-if [ "$feat_vers" == "kaldi" ];then
-    make_fbank=steps/make_fbank.sh
-    fbank_cfg=conf/fbank80_16k.conf
-else
-    fbank_cfg=conf/fbank80_16k.yaml
-    if [ "$feat_vers" == "numpy" ];then
-	make_fbank=steps_pyfe/make_fbank.sh
-    else
-	make_fbank=steps_pyfe/make_torch_fbank.sh
-    fi
-fi
-
-export TMPDIR=data/tmp
-mkdir -p $TMPDIR
-
-if [ $stage -le 1 ];then
-
-    # Make filterbanks for the augmented data. Note that we do not compute a new
-    # vad.scp file here. Instead, we use the vad.scp from the clean version of
-    # the list.
-    for name in voxceleb2cat_train_augx${num_augs}
-    do
-	$make_fbank --write-utt2num-frames true \
-	    --fbank-config $fbank_cfg --nj 120 --cmd "$train_cmd" \
-	    data/$name exp/make_fbank/$name $fbankdir
-	utils/fix_data_dir.sh data/$name
-    done
-
-fi
-
-
-if [ $stage -le 2 ];then
-
-    # Combine the clean and augmented lists.
-    utils/combine_data.sh --extra-files "utt2num_frames" data/voxceleb2cat_train_combined data/voxceleb2cat_train_augx${num_augs} data/voxceleb2cat_train
-
-fi
-
-exit
-
diff --git a/egs/voxceleb/adv.v1/run_010_prepare_victim_xvec_train_data.sh b/egs/voxceleb/adv.v1/run_010_prepare_victim_xvec_train_data.sh
deleted file mode 100755
index 25a59571..00000000
--- a/egs/voxceleb/adv.v1/run_010_prepare_victim_xvec_train_data.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/bash
-# Copyright
-#       2018   Johns Hopkins University (Author: Jesus Villalba)
-# Apache 2.0.
-#
-. ./cmd.sh
-. ./path.sh
-set -e
-
-stage=1
-config_file=default_config.sh
-
-. parse_options.sh || exit 1;
-. $config_file
-
-# Now we prepare the features to generate examples for xvector training.
-if [ $stage -le 2 ]; then
-    # This script applies CMVN and removes nonspeech frames. Note that this is somewhat
-    # wasteful, as it roughly doubles the amount of training data on disk. After
-    # creating training examples, this can be removed.
-    steps_xvec/prepare_feats_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \
-	--storage_name voxceleb-adv.v1-$(date +'%m_%d_%H_%M') \
-	data/${nnet_data} data/${nnet_data}_no_sil exp/${nnet_data}_no_sil
-    utils/fix_data_dir.sh data/${nnet_data}_no_sil
-
-fi
-
-
-if [ $stage -le 3 ]; then
-    # Now, we need to remove features that are too short after removing silence
-    # frames. We want at least 4s (400 frames) per utterance.
-    hyp_utils/remove_short_utts.sh --min-len 400 data/${nnet_data}_no_sil
-
-    # We also want several utterances per speaker. Now we'll throw out speakers
-    # with fewer than 8 utterances.
-    hyp_utils/remove_spk_few_utts.sh --min-num-utts 8 data/${nnet_data}_no_sil
-
-fi
-
-if [ $stage -le 4 ]; then
-    # Prepare train and validation lists for x-vectors
-    local/make_train_lists_sup_embed_with_augm.sh data/${nnet_data}_no_sil data/${nnet_data}_no_sil/lists_xvec
-fi
-
-exit
diff --git a/egs/voxceleb/adv.v1/run_011_train_victim_xvector.sh b/egs/voxceleb/adv.v1/run_011_train_victim_xvector.sh
deleted file mode 100755
index 141afa62..00000000
--- a/egs/voxceleb/adv.v1/run_011_train_victim_xvector.sh
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/bin/bash
-# Copyright
-#       2019   Johns Hopkins University (Author: Jesus Villalba)
-# Apache 2.0.
-#
-. ./cmd.sh
-. ./path.sh
-set -e
-
-stage=1
-ngpu=4
-config_file=default_config.sh
-resume=false
-interactive=false
-num_workers=8
-
-. parse_options.sh || exit 1;
-. $config_file
-. datapath.sh
-
-batch_size=$(($batch_size_1gpu*$ngpu))
-grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ print int($2/$1+0.5)}')
-log_interval=$(echo 100*$grad_acc_steps | bc)
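# Worked example with the default config above: ngpu=4 and
# batch_size_1gpu=32 give batch_size=128; with eff_batch_size=512,
# grad_acc_steps=int(512/128+0.5)=4, i.e. gradients are accumulated
# over 4 mini-batches to emulate the effective batch size, and
# log_interval=100*4=400. (Illustrative numbers only; the actual
# values come from the sourced config file.)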
-list_dir=data/${nnet_data}_no_sil
-
-args=""
-if [ "$resume" == "true" ];then
-    args="--resume"
-fi
-
-if [ "$interactive" == "true" ];then
-    export cuda_cmd=run.pl
-fi
-
-# Network Training
-if [ $stage -le 1 ]; then
-
-    if [[ ${nnet_type} =~ resnet ]] || [[ ${nnet_type} =~ resnext ]]; then
-	train_exec=torch-train-resnet-xvec.py
-    elif [[ ${nnet_type} =~ efficientnet ]]; then
-	train_exec=torch-train-efficientnet-xvec.py
-    elif [[ ${nnet_type} =~ tdnn ]]; then
-	train_exec=torch-train-tdnn-xvec.py
-    elif [[ ${nnet_type} =~ transformer ]]; then
-	train_exec=torch-train-transformer-xvec-v1.py
-    else
-	echo "$nnet_type not supported"
-	exit 1
-    fi
-
-    mkdir -p $nnet_dir/log
-    $cuda_cmd --gpu $ngpu $nnet_dir/log/train.log \
-	hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
-	$train_exec \
-	--data-rspec scp:$list_dir/feats.scp \
-	--train-list $list_dir/lists_xvec/train.scp \
-	--val-list $list_dir/lists_xvec/val.scp \
-	--class-file $list_dir/lists_xvec/class2int \
-	--num-frames-file $list_dir/utt2num_frames \
-	--min-chunk-length $min_chunk --max-chunk-length $max_chunk \
-	--iters-per-epoch $ipe \
-	--batch-size $batch_size \
-	--num-workers $num_workers \
-	--grad-acc-steps $grad_acc_steps \
-	--embed-dim $embed_dim $nnet_opt $opt_opt $lrs_opt \
-	--epochs $nnet_num_epochs \
-	--s $s --margin $margin --margin-warmup-epochs $margin_warmup \
-	--dropout-rate $dropout \
-	--num-gpus $ngpu \
-	--log-interval $log_interval \
-	--exp-path $nnet_dir $args
-
-fi
-
-
-exit
diff --git a/egs/voxceleb/adv.v1/run_012_prepare_transfer_xvec_train_data.sh b/egs/voxceleb/adv.v1/run_012_prepare_transfer_xvec_train_data.sh
deleted file mode 100755
index b622e992..00000000
--- a/egs/voxceleb/adv.v1/run_012_prepare_transfer_xvec_train_data.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-# Copyright
-#       2018   Johns Hopkins University (Author: Jesus Villalba)
-# Apache 2.0.
-#
-. ./cmd.sh
-. ./path.sh
-set -e
-
-stage=1
-config_file=default_config.sh
-
-. parse_options.sh || exit 1;
-. $config_file
-
-if [ "$transfer_nnet_data" == "$nnet_data" ];then
-    echo "Training data for victim and transfer model are the same"
-    echo "Skipping this step"
-    exit 0
-fi
-
-# Now we prepare the features to generate examples for xvector training.
-if [ $stage -le 2 ]; then
-    # This script applies CMVN and removes nonspeech frames. Note that this is somewhat
-    # wasteful, as it roughly doubles the amount of training data on disk. After
-    # creating training examples, this can be removed.
-    steps_xvec/prepare_feats_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \
-	--storage_name voxceleb-$(date +'%m_%d_%H_%M') \
-	data/${transfer_nnet_data} data/${transfer_nnet_data}_no_sil \
-	exp/${transfer_nnet_data}_no_sil
-    utils/fix_data_dir.sh data/${transfer_nnet_data}_no_sil
-
-fi
-
-
-if [ $stage -le 3 ]; then
-    # Now, we need to remove features that are too short after removing silence
-    # frames. We want at least 4s (400 frames) per utterance.
-    hyp_utils/remove_short_utts.sh --min-len 400 data/${transfer_nnet_data}_no_sil
-
-    # We also want several utterances per speaker. Now we'll throw out speakers
-    # with fewer than 8 utterances.
- hyp_utils/remove_spk_few_utts.sh --min-num-utts 8 data/${transfer_nnet_data}_no_sil - -fi - -if [ $stage -le 4 ]; then - # Prepare train and validation lists for x-vectors - local/make_train_lists_sup_embed_with_augm.sh data/${transfer_nnet_data}_no_sil \ - data/${transfer_nnet_data}_no_sil/lists_xvec -fi - -exit diff --git a/egs/voxceleb/adv.v1/run_013_train_transfer_xvector.sh b/egs/voxceleb/adv.v1/run_013_train_transfer_xvector.sh deleted file mode 100755 index ad2c0177..00000000 --- a/egs/voxceleb/adv.v1/run_013_train_transfer_xvector.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/bin/bash -# Copyright -# 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -ngpu=4 -config_file=default_config.sh -resume=false -interactive=false -num_workers=8 - -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -if [ "$nnet" == "$transfer_nnet" ];then - echo "Victim and transfer model are the same" - echo "Skipping this step" - exit 0 -fi - -nnet_data=$transfer_nnet_data -batch_size_1gpu=$transfer_batch_size_1gpu -eff_batch_size=$transfer_eff_batch_size -min_chunk=$transfer_min_chunk -max_chunk=$transfer_max_chunk -ipe=$transfer_ipe - -nnet_type=$transfer_nnet_type -dropout=$transfer_dropout -embed_dim=$transfer_embed_dim - -s=$transfer_s -margin_warmup=$transfer_margin_warmup -margin=$transfer_margin - -nnet_dir=$transfer_nnet_dir -nnet_opt=$transfer_nnet_opt -opt_opt=$transfer_opt_opt -lrs_opt=$transfer_lrs_opt - -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${nnet_data}_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi - -if [ "$interactive" == "true" ];then - export cuda_cmd=run.pl -fi - -# Network Training -if [ $stage -le 1 ]; then - - if [[ ${nnet_type} =~ resnet ]] || [[ ${nnet_type} =~ resnext ]]; then - train_exec=torch-train-resnet-xvec.py - elif [[ ${nnet_type} =~ efficientnet ]]; then - train_exec=torch-train-efficientnet-xvec.py - elif [[ ${nnet_type} =~ tdnn ]]; then - train_exec=torch-train-tdnn-xvec.py - elif [[ ${nnet_type} =~ transformer ]]; then - train_exec=torch-train-transformer-xvec-v1.py - else - echo "$nnet_type not supported" - exit 1 - fi - - mkdir -p $nnet_dir/log - $cuda_cmd --gpu $ngpu $nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - $train_exec \ - --data-rspec scp:$list_dir/feats.scp \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --num-frames-file $list_dir/utt2num_frames \ - --min-chunk-length $min_chunk --max-chunk-length $max_chunk \ - --iters-per-epoch $ipe \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --embed-dim $embed_dim $nnet_opt $opt_opt $lrs_opt \ - --epochs $nnet_num_epochs \ - --s $s --margin $margin --margin-warmup-epochs $margin_warmup \ - --dropout-rate $dropout \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $nnet_dir $args - -fi - - -exit diff --git a/egs/voxceleb/adv.v1/run_030_extract_xvectors_victim_model.sh b/egs/voxceleb/adv.v1/run_030_extract_xvectors_victim_model.sh deleted file mode 100755 index 02eb78de..00000000 --- a/egs/voxceleb/adv.v1/run_030_extract_xvectors_victim_model.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 
2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false - -. parse_options.sh || exit 1; -. $config_file - -if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu true --chunk-length 12800" - xvec_cmd="$cuda_eval_cmd" -else - xvec_cmd="$train_cmd" -fi - -xvector_dir=exp/xvectors/$nnet_name - -if [ $stage -le 1 ]; then - # Extracts x-vectors for evaluation - for name in voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 100 ? $num_spk:100)) - steps_xvec/extract_xvectors.sh --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ - $nnet data/$name \ - $xvector_dir/$name - done -fi - -exit diff --git a/egs/voxceleb/adv.v1/run_031_extract_xvectors_transfer_model.sh b/egs/voxceleb/adv.v1/run_031_extract_xvectors_transfer_model.sh deleted file mode 100755 index 5daf2ec8..00000000 --- a/egs/voxceleb/adv.v1/run_031_extract_xvectors_transfer_model.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false - -. parse_options.sh || exit 1; -. $config_file - -if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu true --chunk-length 12800" - xvec_cmd="$cuda_eval_cmd" -else - xvec_cmd="$train_cmd" -fi - -nnet_name=$transfer_nnet_name -nnet=$transfer_nnet - -xvector_dir=exp/xvectors/$nnet_name - -if [ $stage -le 1 ]; then - # Extracts x-vectors for evaluation - for name in voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 100 ? $num_spk:100)) - steps_xvec/extract_xvectors.sh --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ - $nnet data/$name \ - $xvector_dir/$name - done -fi - -exit diff --git a/egs/voxceleb/adv.v1/run_040_eval_be_victim_model.sh b/egs/voxceleb/adv.v1/run_040_eval_be_victim_model.sh deleted file mode 100755 index ac8c8a24..00000000 --- a/egs/voxceleb/adv.v1/run_040_eval_be_victim_model.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name -score_plda_dir=$score_dir/cosine - -if [ $stage -le 1 ];then - - echo "Eval Voxceleb 1 with Cosine scoring" - steps_be/eval_be_cos.sh --cmd "$train_cmd" \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $score_plda_dir/voxceleb1_scores - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - -fi - - -if [ $stage -le 2 ];then - local/calibrate_voxceleb1_o_clean.sh --cmd "$train_cmd" $score_plda_dir - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test ${score_plda_dir}_cal_v1 - - for f in $(ls ${score_plda_dir}_cal_v1/*_results); - do - echo $f - cat $f - echo "" - done - - -fi diff --git a/egs/voxceleb/adv.v1/run_041_eval_be_transfer_model.sh b/egs/voxceleb/adv.v1/run_041_eval_be_transfer_model.sh deleted file mode 100755 index b9451768..00000000 --- a/egs/voxceleb/adv.v1/run_041_eval_be_transfer_model.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - - -nnet_name=$transfer_nnet_name -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name -score_plda_dir=$score_dir/cosine - -if [ $stage -le 1 ];then - - echo "Eval Voxceleb 1 with Cosine scoring" - steps_be/eval_be_cos.sh --cmd "$train_cmd" \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $score_plda_dir/voxceleb1_scores - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - -fi - -if [ $stage -le 2 ];then - local/calibrate_voxceleb1_o_clean.sh --cmd "$train_cmd" $score_plda_dir - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test ${score_plda_dir}_cal_v1 - - for f in $(ls ${score_plda_dir}_cal_v1/*_results); - do - echo $f - cat $f - echo "" - done - - -fi diff --git a/egs/voxceleb/adv.v1/run_042_eval_victim_from_wav.sh b/egs/voxceleb/adv.v1/run_042_eval_victim_from_wav.sh deleted file mode 100755 index b8ee5ada..00000000 --- a/egs/voxceleb/adv.v1/run_042_eval_victim_from_wav.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false - -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name -cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 -score_plda_dir=$score_dir/cosine_from_wav - -if [ $stage -le 1 ];then - - echo "Eval Voxceleb 1 with Cosine scoring" - steps_xvec/eval_cosine_scoring_from_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 200 \ - --feat-config conf/fbank80_stmn_16k.yaml \ - --cal-file $cal_file \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - -fi - - diff --git a/egs/voxceleb/adv.v1/run_043_eval_whitebox_attacks.sh b/egs/voxceleb/adv.v1/run_043_eval_whitebox_attacks.sh deleted file mode 100755 index 55500abd..00000000 --- a/egs/voxceleb/adv.v1/run_043_eval_whitebox_attacks.sh +++ /dev/null @@ -1,346 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -do_analysis=false -save_wav=false -feat_config=conf/fbank80_stmn_16k.yaml - -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name - -score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 - -#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -thr005=2.94 -thr001=4.60 -thr0001=6.90 -declare -a score_array -declare -a stats_array - -if [ $stage -le 1 ];then - - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/cosine_fgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgsm --attack.eps $eps" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_fgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 2 ];then - score_array=() - stats_array=() - for snr in 30 20 10 0 - do - score_plda_dir=$score_dir/cosine_fgsm_snr${snr} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM 
attack snr=$snr" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type snr-fgsm --attack.snr $snr" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_fgsm_snrall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 3 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_randfgsm_e${eps}_a${alpha} - echo "Eval Voxceleb 1 with Cosine scoring with Rand-FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type rand-fgsm --attack.eps $eps --attack.alpha $alpha" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_randfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 4 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_iterfgsm_e${eps}_a${alpha} - echo "Eval Voxceleb 1 with Cosine scoring with Iterative FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type iter-fgsm --attack.eps $eps --attack.alpha $alpha" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd 
--mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_iterfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 5 ];then - - for confidence in 0 #1 - do - for lr in 0.001 - do - for it in 10 - do - - score_plda_dir=$score_dir/cosine_cwl2_conf${confidence}_lr${lr}_noabort_it$it - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence lr=$lr num-its=$it" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 100 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - done - done - done - -fi - - -if [ $stage -le 6 ];then - - for confidence in 0 #1 - do - for lr in 0.001 - do - for it in 10 - do - score_plda_dir=$score_dir/cosine_cwrms_conf${confidence}_lr${lr}_noabort_it$it - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner RMS attack confidence=$confidence lr=$lr num_its=$it" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd -tc 15" $eval_args --nj 100 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.norm-time --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - done - done - -fi - - -if [ $stage -le 7 ];then - - for confidence in 0 #1 - do - for lr in 0.001 - do - for it in 10 - do - 
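-# CW-SNR: same CW-L2 core as stages 5 and 6, but --attack.use-snr (presumably) measures the perturbation budget as an SNR instead of the time-normalized L2 of the RMS variant. - 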
score_plda_dir=$score_dir/cosine_cwsnr_conf${confidence}_lr${lr}_noabort_it$it - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner SNR attack confidence=$confidence lr=$lr num_its=$it" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd -tc 15" $eval_args --nj 100 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.use-snr --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - done - done -fi - - -exit - diff --git a/egs/voxceleb/adv.v1/run_044_eval_transfer_blackbox_attacks.sh b/egs/voxceleb/adv.v1/run_044_eval_transfer_blackbox_attacks.sh deleted file mode 100755 index 937b4b6b..00000000 --- a/egs/voxceleb/adv.v1/run_044_eval_transfer_blackbox_attacks.sh +++ /dev/null @@ -1,481 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -do_analysis=false -save_wav=false -feat_config=conf/fbank80_stmn_16k.yaml - -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -transfer_feat_config=$feat_config - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name - -score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 - -transfer_xvector_dir=exp/xvectors/$transfer_nnet_name -transfer_score_dir=exp/scores/$transfer_nnet_name -transfer_cal_file=$transfer_score_dir/cosine_cal_v1/cal_tel.h5 - -#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -thr005=2.94 -thr001=4.60 -thr0001=6.90 -declare -a score_array -declare -a stats_array - -if [ $stage -le 1 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_fgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgsm --attack.eps $eps" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_fgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 2 ];then - - score_array=() - stats_array=() - - for snr in 30 20 10 0 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_fgsm_snr${snr} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack snr=$snr" - steps_adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type snr-fgsm --attack.snr $snr" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ 
"${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_fgsm_snrall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 3 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_randfgsm_e${eps}_a${alpha} - echo "Eval Voxceleb 1 with Cosine scoring with Rand-FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type rand-fgsm --attack.eps $eps --attack.alpha $alpha" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_randfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 4 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_iterfgsm_e${eps}_a${alpha} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type iter-fgsm --attack.eps $eps --attack.alpha $alpha" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_iterfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - 
data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 5 ];then - - for confidence in 0 1 - do - for lr in 0.001 - do - for it in 10 - do - - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_cwl2_conf${confidence}_lr${lr}_noabort_it$it - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence lr=$lr num-its=$it" - steps_adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 100 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - done - done - -fi - - -if [ $stage -le 6 ];then - - for confidence in 0 1 - do - for lr in 0.001 - do - for it in 10 - do - - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_cwrms_conf${confidence}_lr${lr}_noabort_it$it - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner RMS attack confidence=$confidence lr=$lr num-its=$it" - steps_adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 200 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.norm-time --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - done - done - -fi - - -if [ $stage -le 7 ];then - - for confidence in 0 1 - do - for lr in 0.001 - do - for it in 10 - do - - 
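-# Transfer black-box setting: the perturbation is optimized on the white-box transfer model and scored against the victim model; each system applies its own calibration (cal_file vs. transfer_cal_file). - 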
score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_cwsnr_conf${confidence}_lr${lr}_noabort_it$it
-        echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner SNR attack confidence=$confidence lr=$lr num-its=$it"
-        steps_adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 100 \
-          --feat-config $feat_config \
-          --transfer-feat-config $transfer_feat_config \
-          --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.use-snr --attack.max-iter $it" \
-          --save-wav $save_wav --save-wav-path $score_plda_dir/wav \
-          --cal-file $cal_file --transfer-cal-file $transfer_cal_file \
-          --threshold $thr005 \
-          data/voxceleb1_test/trials_o_clean \
-          data/voxceleb1_test/utt2model \
-          data/voxceleb1_test \
-          $xvector_dir/voxceleb1_test/xvector.scp \
-          $nnet \
-          $transfer_xvector_dir/voxceleb1_test/xvector.scp \
-          $transfer_nnet \
-          $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats
-
-        $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \
-          local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir
-
-        for f in $(ls $score_plda_dir/*_results);
-        do
-          echo $f
-          cat $f
-          echo ""
-        done
-        if [ "${do_analysis}" == "true" ];then
-          score_analysis_dir=$score_plda_dir
-          local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \
-            data/voxceleb1_test/trials_o_clean $score_clean \
-            $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \
-            $score_analysis_dir/voxceleb1 &
-        fi
-
-      done
-    done
-  done
-
-fi
-
-
-# if [ $stage -le -8 ];then
-
-# for confidence in 0 1
-# do
-#    score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_cwl0_conf${confidence}
-#    echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L0 attack confidence=$confidence"
-#    steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 1000 \
-#      --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \
-#      --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat \
-#      --attack-type cw-l0 --confidence $confidence --c-factor 10 \
-#      --save-wav $save_wav --save-wav-path $score_plda_dir/wav \
-#      --cal-file $cal_file --transfer-cal-file $transfer_cal_file \
-#      --threshold $thr005 \
-#      data/voxceleb1_test/trials_o_clean \
-#      data/voxceleb1_test/utt2model \
-#      data/voxceleb1_test \
-#      $xvector_dir/voxceleb1_test/xvector.scp \
-#      $nnet \
-#      $transfer_xvector_dir/voxceleb1_test/xvector.scp \
-#      $transfer_nnet \
-#      $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats
-
-#    $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \
-#      local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir
-
-#    for f in $(ls $score_plda_dir/*_results);
-#    do
-#      echo $f
-#      cat $f
-#      echo ""
-#    done
-#    if [ "${do_analysis}" == "true" ];then
-#      score_analysis_dir=$score_plda_dir
-#      local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \
-#        data/voxceleb1_test/trials_o_clean $score_clean \
-#        $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \
-#        $score_analysis_dir/voxceleb1 &
-#    fi
-# done
-
-# fi
-
-
-# if [ $stage -le 9 ];then
-
-# for confidence in 0 1
-# do
-#    alpha=$(echo $eps | awk '{ print $0/5.}')
-#    score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_cwlinf_conf${confidence}
-#    echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner LInf attack confidence=$confidence"
-#    steps_adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 40 \
-#      --feat-config 
conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat \ -# --attack-type cw-linf --confidence $confidence --c-factor 2 \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# data/voxceleb1_test/trials_o_clean \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_plda_dir -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# data/voxceleb1_test/trials_o_clean $score_clean \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ -# $score_analysis_dir/voxceleb1 & -# fi - -# done - -# fi - -wait - diff --git a/egs/voxceleb/adv.v1/run_045_eval_whitebox_attacks_with_randsmooth_defense.sh b/egs/voxceleb/adv.v1/run_045_eval_whitebox_attacks_with_randsmooth_defense.sh deleted file mode 100755 index ad2e4cdf..00000000 --- a/egs/voxceleb/adv.v1/run_045_eval_whitebox_attacks_with_randsmooth_defense.sh +++ /dev/null @@ -1,544 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -do_analysis=false -save_wav=false -feat_config=conf/fbank80_stmn_16k.yaml -sigmas="0.001 0.01" -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name - -score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 - -#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -thr005=2.94 -thr001=4.60 -thr0001=6.90 -declare -a score_array -declare -a stats_array - -if [ $stage -le 1 ];then - - for sigma in $sigmas - do - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/cosine_fgsm_e${eps}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgsm --attack.eps $eps" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_fgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - done -fi - - -if [ $stage -le 2 ];then - - for sigma in $sigmas - do - score_array=() - stats_array=() - for snr in 30 20 10 0 - do - score_plda_dir=$score_dir/cosine_fgsm_snr${snr}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack snr=$snr" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type snr-fgsm --attack.snr $snr" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_fgsm_snrall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - done -fi - - -if [ $stage -le 3 ];then - for sigma in $sigmas - do - - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - 
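-# Rand-FGSM evaluated against the randomized-smoothing defense (--smooth-sigma); alpha is fixed to eps/5 as in the undefended white-box recipe. - 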
score_plda_dir=$score_dir/cosine_randfgsm_e${eps}_a${alpha}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with Rand-FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type rand-fgsm --attack.eps $eps --attack.alpha $alpha" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_randfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - done -fi - - -if [ $stage -le 4 ];then - for sigma in $sigmas - do - - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_iterfgsm_e${eps}_a${alpha}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with Iterative FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type iter-fgsm --attack.eps $eps --attack.alpha $alpha" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav --smooth-sigma $sigma \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_iterfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - done -fi - - -if [ $stage -le 5 ];then - - for sigma in $sigmas - do - - for confidence in 0 #1 - do - for lr in 0.001 - do - for it in 10 - do - - score_plda_dir=$score_dir/cosine_cwl2_conf${confidence}_lr${lr}_noabort_it${it}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence lr=$lr num-its=$it" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 100 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - 
--cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - done - done - done - done -fi - - -if [ $stage -le 6 ];then - for sigma in $sigmas - do - - for confidence in 0 #1 - do - for lr in 0.001 - do - for it in 10 - do - score_plda_dir=$score_dir/cosine_cwrms_conf${confidence}_lr${lr}_noabort_it${it}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner RMS attack confidence=$confidence lr=$lr num_its=$it" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd -tc 15" $eval_args --nj 100 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.norm-time --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - done - done - done -fi - - -if [ $stage -le 7 ];then - for sigma in $sigmas - do - - for confidence in 0 #1 - do - for lr in 0.001 - do - for it in 10 - do - score_plda_dir=$score_dir/cosine_cwsnr_conf${confidence}_lr${lr}_noabort_it${it}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner SNR attack confidence=$confidence lr=$lr num_its=$it" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd -tc 15" $eval_args --nj 100 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.use-snr --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - 
echo ""
-          done
-          if [ "${do_analysis}" == "true" ];then
-            score_analysis_dir=$score_plda_dir
-            local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \
-              data/voxceleb1_test/trials_o_clean $score_clean \
-              $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \
-              $score_analysis_dir/voxceleb1 &
-          fi
-
-        done
-      done
-    done
-  done
-fi
-
-
-exit
-
diff --git a/egs/voxceleb/adv.v1/run_053_eval_art_whitebox_attacks.sh b/egs/voxceleb/adv.v1/run_053_eval_art_whitebox_attacks.sh
deleted file mode 100755
index 3d01fbfa..00000000
--- a/egs/voxceleb/adv.v1/run_053_eval_art_whitebox_attacks.sh
+++ /dev/null
@@ -1,536 +0,0 @@
-#!/bin/bash
-# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
-#
-# Apache 2.0.
-#
-. ./cmd.sh
-. ./path.sh
-set -e
-
-stage=1
-config_file=default_config.sh
-use_gpu=false
-do_analysis=false
-save_wav=false
-feat_config=conf/fbank80_stmn_16k.yaml
-
-. parse_options.sh || exit 1;
-. $config_file
-. 
datapath.sh - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name - -score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 - -#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -thr005=2.94 -thr001=4.60 -thr0001=6.90 - -declare -a score_array -declare -a stats_array - -if [ $stage -le 1 ];then - - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/cosine_art_fgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 2 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_fgsm_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM minimal attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.eps-step $alpha --attack.minimal" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgsm_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - -if [ $stage -le 3 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/cosine_art_fgml1_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM-L1 attack eps=$eps" - 
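-# ART fast gradient method with an L1-norm budget (--attack.norm 1), sweeping eps from 1e-5 to 0.1. - 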
steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgml1_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 4 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_fgml1_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM-L1 minimal attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.eps-step $alpha --attack.minimal --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgml1_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 5 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/cosine_art_fgml2_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM-L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls 
$score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgml2_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 6 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_fgml2_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM-L2 minimal attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.eps-step $alpha --attack.minimal --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgml2_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 7 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_iterfgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with IterFGM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type bim --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_iterfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 8 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo 
$eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_pgdlinf_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD Linf attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_pgdlinf_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - - -if [ $stage -le 9 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_pgdl1_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD L1 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_pgdl1_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 10 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_pgdl2_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - 
$xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_pgdl2_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 11 ];then - - for confidence in 0 #1 - do - score_plda_dir=$score_dir/cosine_art_cwl2_conf${confidence} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - -fi - - -if [ $stage -le 12 ];then - - for confidence in 0 #1 - do - score_plda_dir=$score_dir/cosine_art_cwlinf_conf${confidence} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner Linf attack confidence=$confidence" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-linf --attack.confidence $confidence --attack.eps 0.3" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - -fi - - diff --git a/egs/voxceleb/adv.v1/run_054_eval_art_transfer_blackbox_attacks.sh b/egs/voxceleb/adv.v1/run_054_eval_art_transfer_blackbox_attacks.sh deleted file mode 100755 index 254cef78..00000000 --- 
a/egs/voxceleb/adv.v1/run_054_eval_art_transfer_blackbox_attacks.sh +++ /dev/null @@ -1,626 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -do_analysis=false -save_wav=false -feat_config=conf/fbank80_stmn_16k.yaml - -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -transfer_feat_config=$feat_config - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name - -score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 - -transfer_xvector_dir=exp/xvectors/$transfer_nnet_name -transfer_score_dir=exp/scores/$transfer_nnet_name -transfer_cal_file=$transfer_score_dir/cosine_cal_v1/cal_tel.h5 - -#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -thr005=2.94 -thr001=4.60 -thr0001=6.90 -declare -a score_array -declare -a stats_array - -if [ $stage -le 1 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 2 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - 
$transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - - -if [ $stage -le 3 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM L1 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 4 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM minimal L1 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh 
data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - - -if [ $stage -le 5 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 6 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring FGM minimal L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == 
"true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 7 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_iterfgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with iter FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type bim --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_iterfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - - -if [ $stage -le 8 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdlinf_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD Linf attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdlinf_eall - local/attack_analysis.sh --cmd 
"$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 9 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl1_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD L1 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl1_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 10 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl2_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl2_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - 
$score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 11 ];then - - for confidence in 0 #1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_cwl2_conf${confidence} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - -fi - - -if [ $stage -le 12 ];then - - for confidence in 0 #1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_cwlinf_conf${confidence} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner LInf attack confidence=$confidence" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 40 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type cw-linf --attack.confidence $confidence --attack.eps 0.3" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - -fi - -wait - diff --git a/egs/voxceleb/adv.v1/steps_adv b/egs/voxceleb/adv.v1/steps_adv deleted file mode 120000 index fa9be351..00000000 --- a/egs/voxceleb/adv.v1/steps_adv +++ /dev/null @@ -1 +0,0 @@ -hyp_utils/adv \ No newline at end of file diff --git a/egs/voxceleb/adv.v1/steps_fe b/egs/voxceleb/adv.v1/steps_fe deleted file mode 120000 index 73ccc1eb..00000000 --- a/egs/voxceleb/adv.v1/steps_fe +++ /dev/null @@ -1 
+0,0 @@ -hyp_utils/kaldi/vad \ No newline at end of file diff --git a/egs/voxceleb/adv.v2/cmd.sh b/egs/voxceleb/adv.v2/cmd.sh index 56b7eeeb..8f2d9b19 100755 --- a/egs/voxceleb/adv.v2/cmd.sh +++ b/egs/voxceleb/adv.v2/cmd.sh @@ -13,7 +13,8 @@ if [ "$(hostname -d)" == "cm.gemini" ];then #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" - export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 20G" export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" else diff --git a/egs/voxceleb/adv.v2/conf/lresnet34_atnet.yaml b/egs/voxceleb/adv.v2/conf/lresnet34_atnet.yaml index 79f33282..03a4b141 100644 --- a/egs/voxceleb/adv.v2/conf/lresnet34_atnet.yaml +++ b/egs/voxceleb/adv.v2/conf/lresnet34_atnet.yaml @@ -1,32 +1,64 @@ +data: + train: + dataset: + class_names: + - class_id + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 feats: fbank80_stmn_16k.yaml -min_chunk_length: 4 -max_chunk_length: 4 -iters_per_epoch: 6 -resnet_type: lresnet34 -in_feats: 80 -in_channels: 1 -in_kernel_size: 3 -in_stride: 1 -no_maxpool: true -dropout_rate: 0.0 -embed_dim: 10 -margin: 0.2 -margin_warmup_epochs: 6.0 -s: 30 -epochs: 20 -optim: - amsgrad: true - beta1: 0.9 - beta2: 0.95 - lr: 0.01 - opt_type: adam - weight_decay: 1.0e-05 -lrsched: - decay_rate: 0.5 - decay_steps: 8000 - eps: 1.0e-08 - hold_steps: 16000 - lrsch_type: exp_lr - min_lr: 1.0e-05 - update_lr_on_opt_step: true - warmup_steps: 1000 +model: + resnet_type: lresnet34 + in_feats: 80 + in_channels: 1 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + dropout_rate: 0.0 + embed_dim: 10 + margin: 0.2 + margin_warmup_epochs: 6.0 + s: 30 +trainer: + epochs: 20 + eff_batch_size: 512 + optim: + amsgrad: true + beta1: 0.9 + beta2: 0.95 + lr: 0.01 + opt_type: adam + weight_decay: 1.0e-05 + lrsched: + decay_rate: 0.5 + decay_steps: 8000 + eps: 1.0e-08 + hold_steps: 16000 + lrsch_type: exp_lr + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 1000 + diff --git a/egs/voxceleb/adv.v2/conf/lresnet34_spknet.yaml b/egs/voxceleb/adv.v2/conf/lresnet34_spknet.yaml index 0a78edb5..a12487ee 100644 --- a/egs/voxceleb/adv.v2/conf/lresnet34_spknet.yaml +++ b/egs/voxceleb/adv.v2/conf/lresnet34_spknet.yaml @@ -1,34 +1,68 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: 
class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 feats: fbank80_stmn_16k.yaml -train_aug_cfg: conf/reverb_noise_aug.yaml -val_aug_cfg: conf/reverb_noise_aug.yaml -min_chunk_length: 4 -max_chunk_length: 4 -iters_per_epoch: 6 -resnet_type: lresnet34 -in_feats: 80 -in_channels: 1 -in_kernel_size: 3 -in_stride: 1 -no_maxpool: true -dropout_rate: 0.0 -embed_dim: 256 -margin: 0.3 -margin_warmup_epochs: 20.0 -s: 30 -epochs: 70 -optim: - amsgrad: true - beta1: 0.9 - beta2: 0.95 - lr: 0.05 - opt_type: adam - weight_decay: 1.0e-05 -lrsched: - decay_rate: 0.5 - decay_steps: 8000 - eps: 1.0e-08 - hold_steps: 40000 - lrsch_type: exp_lr - min_lr: 1.0e-05 - update_lr_on_opt_step: true - warmup_steps: 1000 +model: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.05 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: lrsched_exp_default.yaml + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 1000 + use_amp: true + log_interval: 1000 + epochs: 70 + eff_batch_size: 512 diff --git a/egs/voxceleb/adv.v2/conf/res2net50_atnet.yaml b/egs/voxceleb/adv.v2/conf/res2net50_atnet.yaml index 4754206d..a617622c 100644 --- a/egs/voxceleb/adv.v2/conf/res2net50_atnet.yaml +++ b/egs/voxceleb/adv.v2/conf/res2net50_atnet.yaml @@ -1,34 +1,65 @@ +data: + train: + dataset: + class_names: + - class_id + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 feats: fbank80_stmn_16k.yaml -min_chunk_length: 4 -max_chunk_length: 4 -iters_per_epoch: 6 -resnet_type: res2net50 -in_feats: 80 -in_channels: 1 -in_kernel_size: 3 -in_stride: 1 -no_maxpool: true -res2net_width_factor: 1.625 -res2net_scale: 4 -dropout_rate: 0.0 -embed_dim: 10 -margin: 0.2 -margin_warmup_epochs: 6.0 -s: 30 -epochs: 20 -optim: - amsgrad: true - beta1: 0.9 - beta2: 0.95 - lr: 0.01 - opt_type: adam - weight_decay: 1.0e-05 -lrsched: - decay_rate: 0.5 - decay_steps: 8000 - eps: 1.0e-08 - hold_steps: 16000 - lrsch_type: exp_lr - min_lr: 1.0e-05 - update_lr_on_opt_step: true - warmup_steps: 1000 +model: + resnet_type: res2net50 + in_feats: 80 + in_channels: 1 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + dropout_rate: 0.0 + embed_dim: 10 + margin: 0.2 + margin_warmup_epochs: 6.0 + s: 30 +trainer: + epochs: 20 + eff_batch_size: 256 + optim: + amsgrad: true + beta1: 0.9 + beta2: 0.95 + lr: 0.01 + opt_type: adam + weight_decay: 1.0e-05 + lrsched: + decay_rate: 0.5 + decay_steps: 8000 + eps: 1.0e-08 + hold_steps: 16000 + lrsch_type: exp_lr + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 1000 diff --git 
a/egs/voxceleb/adv.v2/global_conf/config_spknet_fbank80_stmn_lresnet34_attacknet_same.v1.sh b/egs/voxceleb/adv.v2/global_conf/config_spknet_fbank80_stmn_lresnet34_attacknet_same.v1.sh index 5ae7f68e..ed10ff0a 100644 --- a/egs/voxceleb/adv.v2/global_conf/config_spknet_fbank80_stmn_lresnet34_attacknet_same.v1.sh +++ b/egs/voxceleb/adv.v2/global_conf/config_spknet_fbank80_stmn_lresnet34_attacknet_same.v1.sh @@ -6,8 +6,8 @@ spknet_command=resnet spknet_data=voxceleb2cat_train spknet_config=conf/lresnet34_spknet.yaml -spknet_batch_size_1gpu=128 -spknet_eff_batch_size=512 # effective batch size +#spknet_batch_size_1gpu=128 +#spknet_eff_batch_size=512 # effective batch size spknet_name=lresnet34 spknet_dir=exp/xvector_nnets/$spknet_name spknet=$spknet_dir/model_ep0070.pth @@ -26,8 +26,8 @@ spkv_attacks_common_opts="--save-failed" #save failed attacks also # Attack model LResNet34 configuration sign_nnet_command=resnet sign_nnet_config=conf/lresnet34_atnet.yaml -sign_nnet_batch_size_1gpu=128 -sign_nnet_eff_batch_size=512 # effective batch size +#sign_nnet_batch_size_1gpu=128 +#sign_nnet_eff_batch_size=512 # effective batch size sign_nnet_name=lresnet34 # SNRs in -100, 100 diff --git a/egs/voxceleb/adv.v2/local/calibrate_voxceleb1_o_clean.sh b/egs/voxceleb/adv.v2/local/calibrate_voxceleb1_o_clean.sh index 736c3fb0..01c06036 100755 --- a/egs/voxceleb/adv.v2/local/calibrate_voxceleb1_o_clean.sh +++ b/egs/voxceleb/adv.v2/local/calibrate_voxceleb1_o_clean.sh @@ -30,7 +30,7 @@ train_scores=$score_dir/voxceleb1_scores train_key=data/voxceleb1_test/trials_o_clean $cmd $cal_score_dir/train_cal_tel.log \ - steps_be/train-calibration-v1.py --score-file $train_scores \ + steps_backend/train-calibration-v1.py --score-file $train_scores \ --key-file $train_key --model-file $model_file --prior $prior --lambda-reg $l2_reg ndxs=(voxceleb1_test/trials_o_clean) @@ -43,7 +43,7 @@ do scores_out=$cal_score_dir/${scores[$i]}_scores ndx=data/${ndxs[$i]} $cmd $cal_score_dir/eval_cal_${scores[$i]}.log \ - steps_be/eval-calibration-v1.py --in-score-file $scores_in \ + steps_backend/eval-calibration-v1.py --in-score-file $scores_in \ --ndx-file $ndx --model-file $model_file --out-score-file $scores_out & done diff --git a/egs/voxceleb/adv.v2/local/make_some_figs.py b/egs/voxceleb/adv.v2/local/make_some_figs.py index 0b2b672f..84c167a4 100755 --- a/egs/voxceleb/adv.v2/local/make_some_figs.py +++ b/egs/voxceleb/adv.v2/local/make_some_figs.py @@ -11,7 +11,7 @@ import pandas as pd from hyperion.hyp_defs import float_cpu, config_logger -from hyperion.metrics.verification_evaluator import ( +from hyperion.np.metrics.verification_evaluator import ( VerificationAdvAttackEvaluator as Eval, ) diff --git a/egs/voxceleb/adv.v2/local/score_dcf.py b/egs/voxceleb/adv.v2/local/score_dcf.py index 50babe69..1718ad4d 100755 --- a/egs/voxceleb/adv.v2/local/score_dcf.py +++ b/egs/voxceleb/adv.v2/local/score_dcf.py @@ -14,7 +14,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import SparseTrialScores, SparseTrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh b/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh index c5d03ee2..a1acb1f6 100755 --- a/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh +++ b/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh @@ -10,45 +10,45 @@ set -e stage=1 ngpu=4 
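# Background for the flag changes below (a sketch reconstructed from the
# deleted lines, not an authoritative spec): the old wrapper derived
# gradient accumulation from the per-GPU and effective batch sizes, roughly
#   batch_size=$(($spknet_batch_size_1gpu*$ngpu))
#   grad_acc_steps=$(echo $batch_size $spknet_eff_batch_size | awk '{ print int($2/$1+0.5)}')
# i.e. grad_acc_steps = round(eff_batch_size / (batch_size_1gpu * ngpu)).
# The updated recipe moves these knobs into the YAML config
# (trainer.eff_batch_size, trainer.log_interval), so the wrapper now only
# forwards --cfg plus the dataset files through --data.* options.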
config_file=default_config.sh -resume=false interactive=false -num_workers=8 +num_workers="" +use_tb=false +use_wandb=false . parse_options.sh || exit 1; . $config_file . datapath.sh -batch_size=$(($spknet_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $spknet_eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${spknet_data}_proc_audio_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi +nnet_type=$spknet_command +nnet_data=$spknet_data +nnet_dir=$spknet_dir +nnet_cfg=$spknet_config +list_dir=data/${nnet_data}_proc_audio_no_sil if [ "$interactive" == "true" ];then export cuda_cmd=run.pl fi + # Network Training if [ $stage -le 1 ]; then - - mkdir -p $spknet_dir/log - $cuda_cmd --gpu $ngpu $spknet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-train-xvec-from-wav.py $spknet_command --cfg $spknet_config \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $spknet_dir $args - + + mkdir -p $nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir \ + --num-gpus $ngpu fi + diff --git a/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh b/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh index 53e8e5a6..b453260f 100755 --- a/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh @@ -10,9 +10,7 @@ set -e stage=1 ngpu=1 config_file=default_config.sh -resume=false interactive=false -num_workers=4 xvec_use_gpu=false xvec_chunk_length=12800 @@ -27,15 +25,9 @@ else xvec_cmd="$train_cmd" fi -batch_size=$(($sign_nnet_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $sign_nnet_eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/$attack_type_split_tag args="" -if [ "$resume" == "true" ];then - args="--resume" -fi if [ "$interactive" == "true" ];then export cuda_cmd=run.pl @@ -53,18 +45,17 @@ if [ $stage -le 1 ]; then mkdir -p $sign_nnet_dir/log $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-train-xvec-from-wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --audio-path $list_dir/trainval_wav.scp \ - --time-durs-file $list_dir/trainval_utt2dur \ - --train-list $list_dir/train_utt2attack \ - --val-list $list_dir/val_utt2attack \ - --class-file $list_dir/class_file \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ + train_xvector_from_wav.py 
$sign_nnet_command --cfg $sign_nnet_config \ + --data.train.dataset.recordings-file $list_dir/trainval_wav.scp \ + --data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ + --data.train.dataset.segments-file $list_dir/train_utt2attack \ + --data.train.dataset.class-file $list_dir/class_file \ + --data.val.dataset.recordings-file $list_dir/trainval_wav.scp \ + --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ + --data.val.dataset.segments-file $list_dir/val_utt2attack \ + --trainer.exp-path $sign_nnet_dir $args \ --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $sign_nnet_dir $args + fi if [ $stage -le 2 ]; then @@ -82,7 +73,7 @@ fi proj_dir=$sign_dir/test/tsne_${attack_type_split_tag} if [ $stage -le 3 ];then echo "Make TSNE plots on all test attacks" - echo "Result will be left in $proj_idr" + echo "Result will be left in $proj_dir" for p in 30 100 250 do for e in 12 64 @@ -112,7 +103,7 @@ if [ $stage -le 4 ]; then fi if [ $stage -le 5 ];then - echo "Compute cofusion matrices" + echo "Compute confusion matrices" echo "Result is left in $logits_dir/test/eval_acc.log" $train_cmd $logits_dir/test/eval_acc.log \ hyp_utils/conda_env.sh steps_backend/eval-classif-perf.py \ diff --git a/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh b/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh index 79bf810a..de811505 100755 --- a/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh @@ -10,9 +10,7 @@ set -e stage=1 ngpu=1 config_file=default_config.sh -resume=false interactive=false -num_workers=8 xvec_use_gpu=false xvec_chunk_length=12800 @@ -27,15 +25,9 @@ else xvec_cmd="$train_cmd" fi -batch_size=$(($sign_nnet_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $sign_nnet_eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/$snr_split_tag args="" -if [ "$resume" == "true" ];then - args="--resume" -fi if [ "$interactive" == "true" ];then export cuda_cmd=run.pl @@ -53,19 +45,16 @@ if [ $stage -le 1 ]; then mkdir -p $sign_nnet_dir/log $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-train-xvec-from-wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --audio-path $list_dir/trainval_wav.scp \ - --time-durs-file $list_dir/trainval_utt2dur \ - --train-list $list_dir/train_utt2attack \ - --val-list $list_dir/val_utt2attack \ - --class-file $list_dir/class_file \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $sign_nnet_dir $args - + train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ + --data.train.dataset.recordings-file $list_dir/trainval_wav.scp \ + --data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ + --data.train.dataset.segments-file $list_dir/train_utt2attack \ + --data.train.dataset.class-file $list_dir/class_file \ + --data.val.dataset.recordings-file $list_dir/trainval_wav.scp \ + --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ + --data.val.dataset.segments-file $list_dir/val_utt2attack \ + --trainer.exp-path $sign_nnet_dir $args \ + --num-gpus $ngpu fi if [ $stage -le 2 ]; then @@ -83,7 +72,7 @@ fi proj_dir=$sign_dir/test/tsne if [ $stage -le 3 ];then echo "Make TSNE plots on all test attacks" - echo "Result will be left in $proj_idr" + echo "Result will be left in $proj_dir" for p in 30 100 250 do 
for e in 12 64 @@ -101,7 +90,6 @@ if [ $stage -le 3 ];then wait fi - if [ $stage -le 4 ]; then echo "Eval signature network logits on test attacks" mkdir -p $list_dir/test @@ -114,7 +102,7 @@ if [ $stage -le 4 ]; then fi if [ $stage -le 5 ];then - echo "Compute cofusion matrices" + echo "Compute confusion matrices" echo "Result is left in $logits_dir/test/eval_acc.log" $train_cmd $logits_dir/test/eval_acc.log \ hyp_utils/conda_env.sh steps_backend/eval-classif-perf.py \ diff --git a/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh b/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh index 3a4e9147..aa17a1ae 100755 --- a/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh @@ -10,7 +10,6 @@ set -e stage=1 ngpu=1 config_file=default_config.sh -resume=false interactive=false num_workers=4 xvec_use_gpu=false @@ -27,16 +26,10 @@ else xvec_cmd="$train_cmd" fi -batch_size=$(($sign_nnet_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $sign_nnet_eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/$threat_model_split_tag list_attack_type_dir=data/$attack_type_split_tag args="" -if [ "$resume" == "true" ];then - args="--resume" -fi if [ "$interactive" == "true" ];then export cuda_cmd=run.pl @@ -53,19 +46,17 @@ if [ $stage -le 1 ]; then echo "Train signature network on all attacks" mkdir -p $sign_nnet_dir/log $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-train-xvec-from-wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --audio-path $list_dir/trainval_wav.scp \ - --time-durs-file $list_dir/trainval_utt2dur \ - --train-list $list_dir/train_utt2attack \ - --val-list $list_dir/val_utt2attack \ - --class-file $list_dir/class_file \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $sign_nnet_dir $args + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ + --data.train.dataset.recordings-file $list_dir/trainval_wav.scp \ + --data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ + --data.train.dataset.segments-file $list_dir/train_utt2attack \ + --data.train.dataset.class-file $list_dir/class_file \ + --data.val.dataset.recordings-file $list_dir/trainval_wav.scp \ + --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ + --data.val.dataset.segments-file $list_dir/val_utt2attack \ + --trainer.exp-path $sign_nnet_dir $args \ + --num-gpus $ngpu fi if [ $stage -le 2 ]; then @@ -83,7 +74,7 @@ fi proj_dir=$sign_dir/test/tsne_${attack_type_split_tag} if [ $stage -le 3 ];then echo "Make TSNE plots on all test attacks with colors indicating attack type" - echo "Result will be left in $proj_idr" + echo "Result will be left in $proj_dir" for p in 30 100 250 do for e in 12 64 @@ -137,7 +128,7 @@ if [ $stage -le 5 ]; then fi if [ $stage -le 6 ];then - echo "Compute cofusion matrices" + echo "Compute confusion matrices" echo "Result is left in $logits_dir/test/eval_acc.log" $train_cmd $logits_dir/test/eval_acc.log \ hyp_utils/conda_env.sh steps_backend/eval-classif-perf.py \ diff --git a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh index 31cd6139..3b93fabd 100755 --- 
a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh +++ b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh @@ -10,9 +10,7 @@ set -e stage=1 ngpu=1 config_file=default_config.sh -resume=false interactive=false -num_workers=4 xvec_use_gpu=false xvec_chunk_length=12800 @@ -27,18 +25,12 @@ else xvec_cmd="$train_cmd" fi -batch_size=$(($sign_nnet_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $sign_nnet_eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) #list with only the known attacks list_someknown_dir=data/$sk_attack_type_split_tag # list with all the attacks list_all_dir=data/$attack_type_split_tag args="" -if [ "$resume" == "true" ];then - args="--resume" -fi if [ "$interactive" == "true" ];then export cuda_cmd=run.pl @@ -55,19 +47,18 @@ if [ $stage -le 1 ]; then echo "Train attack signature network on known attacks only" mkdir -p $sign_nnet_dir/log $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-train-xvec-from-wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --audio-path $list_someknown_dir/trainval_wav.scp \ - --time-durs-file $list_someknown_dir/trainval_utt2dur \ - --train-list $list_someknown_dir/train_utt2attack \ - --val-list $list_someknown_dir/val_utt2attack \ - --class-file $list_someknown_dir/class_file \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $sign_nnet_dir $args + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ + --data.train.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ + --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ + --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ + --data.train.dataset.class-file $list_someknown_dir/class_file \ + --data.val.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ + --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ + --trainer.exp-path $sign_nnet_dir $args \ + --num-gpus $ngpu + fi if [ $stage -le 2 ]; then @@ -302,7 +293,7 @@ if [ $stage -le 13 ]; then awk '!/benign/' $list_someknown_dir/train/utt2spk > $list_someknown_dir/train_nobenign/utt2spk steps_backend/train_be_v1.sh --cmd "$train_cmd" \ --plda-type splda \ - --y-dim 6 \ + --y-dim 5 \ $sign_dir/train/xvector.scp \ $list_someknown_dir/train_nobenign \ $be_dir diff --git a/egs/voxceleb/adv.v2/run_032_snr_verif.sh b/egs/voxceleb/adv.v2/run_032_snr_verif.sh index 8e4f0d41..12d42c99 100755 --- a/egs/voxceleb/adv.v2/run_032_snr_verif.sh +++ b/egs/voxceleb/adv.v2/run_032_snr_verif.sh @@ -10,9 +10,7 @@ set -e stage=1 ngpu=1 config_file=default_config.sh -resume=false interactive=false -num_workers=4 xvec_use_gpu=false xvec_chunk_length=12800 @@ -27,9 +25,6 @@ else xvec_cmd="$train_cmd" fi -batch_size=$(($sign_nnet_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $sign_nnet_eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) #list with only the known attacks list_someknown_dir=data/$sk_snr_split_tag # list with all the attacks @@ -55,19 +50,17 @@ if [ $stage -le 1 ]; then echo "Train attack signature network on known attacks only" mkdir -p $sign_nnet_dir/log $cuda_cmd --gpu $ngpu 
$sign_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-train-xvec-from-wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --audio-path $list_someknown_dir/trainval_wav.scp \ - --time-durs-file $list_someknown_dir/trainval_utt2dur \ - --train-list $list_someknown_dir/train_utt2attack \ - --val-list $list_someknown_dir/val_utt2attack \ - --class-file $list_someknown_dir/class_file \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $sign_nnet_dir $args + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ + --data.train.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ + --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ + --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ + --data.train.dataset.class-file $list_someknown_dir/class_file \ + --data.val.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ + --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ + --trainer.exp-path $sign_nnet_dir $args \ + --num-gpus $ngpu fi if [ $stage -le 2 ]; then diff --git a/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh b/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh index 1e87d749..cbfaaa81 100755 --- a/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh +++ b/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh @@ -10,9 +10,7 @@ set -e stage=1 ngpu=1 config_file=default_config.sh -resume=false interactive=false -num_workers=4 xvec_use_gpu=false xvec_chunk_length=12800 @@ -27,9 +25,6 @@ else xvec_cmd="$train_cmd" fi -batch_size=$(($sign_nnet_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $sign_nnet_eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) #list with only the known attacks list_someknown_dir=data/$sk_threat_model_split_tag # list with all the attacks @@ -56,19 +51,18 @@ if [ $stage -le 1 ]; then echo "Train attack signature network on known attacks only" mkdir -p $sign_nnet_dir/log $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-train-xvec-from-wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --audio-path $list_someknown_dir/trainval_wav.scp \ - --time-durs-file $list_someknown_dir/trainval_utt2dur \ - --train-list $list_someknown_dir/train_utt2attack \ - --val-list $list_someknown_dir/val_utt2attack \ - --class-file $list_someknown_dir/class_file \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $sign_nnet_dir $args + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ + --data.train.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ + --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ + --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ + --data.train.dataset.class-file $list_someknown_dir/class_file \ + --data.val.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ + --data.val.dataset.segments-file 
$list_someknown_dir/val_utt2attack \ + --trainer.exp-path $sign_nnet_dir $args \ + --num-gpus $ngpu + fi if [ $stage -le 2 ]; then diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-Nvs1-v1.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-Nvs1-v1.py index 4b017114..ea570f60 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-Nvs1-v1.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-Nvs1-v1.py @@ -18,7 +18,7 @@ from hyperion.utils import TrialNdx, TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py index 0b3c9125..48094d0f 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py @@ -15,11 +15,11 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring -from hyperion.pdfs import PLDA +from hyperion.utils.math_funcs import cosine_scoring +from hyperion.np.pdfs import PLDA from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList, LNorm +from hyperion.np.transforms import TransformList, LNorm def eval_plda( diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py index 0438e373..49720cb5 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py @@ -19,10 +19,10 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-novelty.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-novelty.py index 3ebac1f6..29b0a2c8 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-novelty.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-novelty.py @@ -19,7 +19,7 @@ from hyperion.utils import TrialNdx, TrialScores, Utt2Info from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList from hyperion.io import RandomAccessDataReaderFactory as DRF diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-calibration-v1.py b/egs/voxceleb/adv.v2/steps_backend/eval-calibration-v1.py new file mode 100755 index 00000000..fdd5516f --- /dev/null +++ b/egs/voxceleb/adv.v2/steps_backend/eval-calibration-v1.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + + Evals calibration +""" + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np + +from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.utils.trial_scores import TrialScores +from hyperion.utils.trial_key import TrialKey 
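# How the calibration applied by this script works (a sketch; A and b are
# the parameters the companion training script prints): the model is a
# binary logistic regression, so applying it is the affine map
#   s_cal = A * s + b
# which maps raw scores to llr-like calibrated scores that can be compared
# against fixed thresholds such as the thr005=2.94 values used in the
# attack-evaluation recipes above.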
+from hyperion.utils.trial_ndx import TrialNdx +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR + + +def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): + + logging.info("load ndx: %s" % ndx_file) + try: + ndx = TrialNdx.load_txt(ndx_file) + except: + ndx = TrialKey.load_txt(ndx_file) + + logging.info("load scores: %s" % in_score_file) + scr = TrialScores.load_txt(in_score_file) + scr = scr.align_with_ndx(ndx) + + logging.info("load model: %s" % model_file) + lr = LR.load(model_file) + logging.info("apply calibration") + s_cal = lr.predict(scr.scores.ravel()) + scr.scores = np.reshape(s_cal, scr.scores.shape) + + logging.info("save scores: %s" % out_score_file) + scr.save_txt(out_score_file) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Evals linear calibration") + + parser.add_argument("--in-score-file", dest="in_score_file", required=True) + parser.add_argument("--out-score-file", dest="out_score_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_calibration(**namespace_to_dict(args)) diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unknown-attacks-noimp.py b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unknown-attacks-noimp.py index 630bc244..0aeb2367 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unknown-attacks-noimp.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unknown-attacks-noimp.py @@ -19,13 +19,13 @@ from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.metrics.acc import compute_accuracy -from hyperion.metrics.confusion_matrix import ( +from hyperion.np.metrics.acc import compute_accuracy +from hyperion.np.metrics.confusion_matrix import ( compute_confusion_matrix, print_confusion_matrix, ) -from hyperion.transforms import PCA, LNorm -from hyperion.pdfs import SPLDA +from hyperion.np.transforms import PCA, LNorm +from hyperion.np.pdfs import SPLDA from numpy.linalg import matrix_rank # colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unkown-attacks.py b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unkown-attacks.py index 5ad87f72..796422f8 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unkown-attacks.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unkown-attacks.py @@ -19,13 +19,13 @@ from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.metrics.acc import compute_accuracy -from hyperion.metrics.confusion_matrix import ( +from hyperion.np.metrics.acc import compute_accuracy +from hyperion.np.metrics.confusion_matrix import ( compute_confusion_matrix, print_confusion_matrix, ) -from hyperion.transforms import PCA, LNorm -from hyperion.pdfs import SPLDA +from hyperion.np.transforms import PCA, LNorm +from hyperion.np.pdfs import SPLDA from numpy.linalg import matrix_rank # colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] diff --git 
a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-unknown-attacks.py b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-unknown-attacks.py index e8dd6e00..cf20735f 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-unknown-attacks.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-unknown-attacks.py @@ -19,8 +19,8 @@ from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.metrics.acc import compute_accuracy -from hyperion.metrics.confusion_matrix import ( +from hyperion.np.metrics.acc import compute_accuracy +from hyperion.np.metrics.confusion_matrix import ( compute_xlabel_confusion_matrix, print_confusion_matrix, ) diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf.py b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf.py index 6b259a2f..d3d828a5 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf.py @@ -20,8 +20,8 @@ from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.metrics.acc import compute_accuracy -from hyperion.metrics.confusion_matrix import ( +from hyperion.np.metrics.acc import compute_accuracy +from hyperion.np.metrics.confusion_matrix import ( compute_confusion_matrix, print_confusion_matrix, ) @@ -52,7 +52,7 @@ def eval_classif_perf(score_file, key_file, class_file, output_path=None, **kwar acc = compute_accuracy(y_true, y_pred) logging.info("Classification accuracy %.2f %%" % (acc * 100)) - labels = np.arange(len(classes), dtype=np.int) + labels = np.arange(len(classes), dtype=int) C = compute_confusion_matrix(y_true, y_pred, labels=labels, normalize=False) logging.info("Unnormalized Confusion Matrix:") print_confusion_matrix(C, labels_true=classes) @@ -69,8 +69,6 @@ def eval_classif_perf(score_file, key_file, class_file, output_path=None, **kwar parser.add_argument("--score-file", required=True) parser.add_argument("--key-file", required=True) parser.add_argument("--class-file", required=True) - - # parser.add_argument('--output-path', dest='output_path', required=True) parser.add_argument( "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int ) diff --git a/egs/voxceleb/adv.v2/steps_backend/train-be-v1.py b/egs/voxceleb/adv.v2/steps_backend/train-be-v1.py index b681b0ac..e2c8e928 100755 --- a/egs/voxceleb/adv.v2/steps_backend/train-be-v1.py +++ b/egs/voxceleb/adv.v2/steps_backend/train-be-v1.py @@ -13,7 +13,7 @@ from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList, LDA, LNorm, PCA +from hyperion.np.transforms import TransformList, LDA, LNorm, PCA from hyperion.helpers import PLDAFactory as F from numpy.linalg import matrix_rank diff --git a/egs/voxceleb/adv.v2/steps_backend/train-calibration-v1.py b/egs/voxceleb/adv.v2/steps_backend/train-calibration-v1.py new file mode 100755 index 00000000..489ceed9 --- /dev/null +++ b/egs/voxceleb/adv.v2/steps_backend/train-calibration-v1.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + + Trains calibration for SRE18 tel condition +""" + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time 
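# Metric computed below (assuming the standard detection-cost definitions):
# for target prior p,
#   dcf = p * p_miss + (1 - p) * p_fa
# compute_min_dcf searches for the score threshold minimizing this cost on
# the key, while compute_act_dcf applies a fixed Bayes threshold to the
# calibrated scores; the gap between the two measures calibration quality.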
+import logging + +import numpy as np + +from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.utils.trial_scores import TrialScores +from hyperion.utils.trial_key import TrialKey +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR + + +def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): + + logging.info("load key: %s" % key_file) + key = TrialKey.load_txt(key_file) + logging.info("load scores: %s" % score_file) + scr = TrialScores.load_txt(score_file) + tar, non = scr.get_tar_non(key) + ntar = len(tar) + nnon = len(non) + + min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (min_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) + + logging.info("train calibration") + x = np.concatenate((tar, non)) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) + lr = LR( + prior=prior, + lambda_reg=lambda_reg, + bias_scaling=1, + solver="liblinear", + verbose=verbose, + ) + lr.fit(x, y) + logging.info("A: %s" % lr.A) + logging.info("b: %s" % lr.b) + logging.info("save calibration at %s" % model_file) + lr.save(model_file) + + logging.info("calibrate scores") + tar_cal = lr.predict(tar) + non_cal = lr.predict(non) + act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (act_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Trains llr calibration") + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument("--prior", dest="prior", type=float, default=0.01) + parser.add_argument("--lambda-reg", dest="lambda_reg", type=float, default=1e-5) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + logging.debug(args) + + train_calibration(**namespace_to_dict(args)) diff --git a/egs/voxceleb/adv.v2/steps_visual/proj-attack-lda.py b/egs/voxceleb/adv.v2/steps_visual/proj-attack-lda.py index 03fa3325..b7725386 100755 --- a/egs/voxceleb/adv.v2/steps_visual/proj-attack-lda.py +++ b/egs/voxceleb/adv.v2/steps_visual/proj-attack-lda.py @@ -25,7 +25,7 @@ from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.transforms import LDA +from hyperion.np.transforms import LDA colors = ["b", "g", "r", "c", "m", "y", "k"] markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] diff --git a/egs/voxceleb/adv.v2/steps_visual/proj-attack-tsne.py b/egs/voxceleb/adv.v2/steps_visual/proj-attack-tsne.py index a76a6633..b02447e8 100755 --- a/egs/voxceleb/adv.v2/steps_visual/proj-attack-tsne.py +++ b/egs/voxceleb/adv.v2/steps_visual/proj-attack-tsne.py @@ -25,7 +25,7 @@ from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.transforms import PCA, SklTSNE, LNorm +from hyperion.np.transforms import PCA, SklTSNE, LNorm colors = ["b", "g", "r", "c", "m", "y", "k"]
markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] diff --git a/egs/voxceleb/ssl.v1/README.md b/egs/voxceleb/ssl.v1/README.md new file mode 100644 index 00000000..73b1d039 --- /dev/null +++ b/egs/voxceleb/ssl.v1/README.md @@ -0,0 +1,207 @@ +# VoxCeleb SSL V1 + +Recipe for the Unsupervised VoxCeleb Speaker Verification Task: + - Trains an embedding extractor using DINO + - Clusters the VoxCeleb2 embeddings to get pseudo-speaker labels + - Fine-tunes the embedding model with Large Margin Softmax loss on the pseudo-speaker labels + - Repeats the embedding clustering to get new pseudo-speaker labels + - Fine-tunes the embedding model with Large Margin Softmax loss on the new pseudo-speaker labels + +## Citing + +If you use our DINO implementation, please cite these works: + +``` +@ARTICLE{9852303, + author={Cho, Jaejin and Villalba, Jesús and Moro-Velazquez, Laureano and Dehak, Najim}, + journal={IEEE Journal of Selected Topics in Signal Processing}, + title={Non-Contrastive Self-Supervised Learning for Utterance-Level Information Extraction From Speech}, + year={2022}, + volume={16}, + number={6}, + pages={1284-1295}, + keywords={Alzheimer's disease;Transfer learning;Speech processing;Feature extraction;Self-supervised learning;Training;Emotion recognition;Self-supervised learning;transfer learning;speaker verification;emotion recognition;Alzheimer's disease;distillation;non-contrastive}, + doi={10.1109/JSTSP.2022.3197315}} + +@inproceedings{cho22c_interspeech, + author={Jaejin Cho and Raghavendra Pappagari and Piotr Żelasko and Laureano Moro Velazquez and Jesus Villalba and Najim Dehak}, + title={{Non-contrastive self-supervised learning of utterance-level speech representations}}, + year=2022, + booktitle={Proc. Interspeech 2022}, + pages={4028--4032}, + doi={10.21437/Interspeech.2022-11141} +} +``` + +## Training Data + + - x-Vector network is trained on VoxCeleb2 dev + test with augmentations + - MUSAN noise + - RIR reverberation + +## Test data + + - Test data is VoxCeleb 1 + - We evaluate the 3 conditions (with cleaned lists): + - VoxCeleb-O (Original): Original VoxCeleb test set with 40 speakers + - VoxCeleb-E (Entire): List using all utterances of VoxCeleb1 + - VoxCeleb-H (Hard): List of hard trials between all utterances of VoxCeleb1, same gender and nationality trials. + +## Usage + + - Run the run_0*.sh scripts in sequence + - By default, it uses the config global_conf/config_fbank80_stmn_fwseresnet34.v1.2.1.sh + - To use other configs: +```bash +run_xxx_xxxx.sh --config-file global_conf/other_config.sh +``` + + +## Recipe Steps: + + - `run_001_prepare_data.sh` + - Data preparation script to generate Kaldi style data directories for + - VoxCeleb2 train+test + - VoxCeleb1 O/E/H eval sets + + - `run_002_compute_evad.sh` + - Computes Energy VAD for all datasets + + - `run_003_prepare_noises_rirs.sh` + - Prepares MUSAN noises and music to be used by the SpeechAugment class. + - Creates Babble noise from MUSAN speech to be used by the SpeechAugment class. + - Prepares RIRs by compacting them into HDF5 files, to be used by the SpeechAugment class. + + - `run_004_prepare_xvec_train_data.sh` + - Transforms all the audios used to train the x-vector into a common format, e.g., .flac. + - Removes silence from the audios + - Removes utterances shorter than 4 secs and speakers with fewer than 8 utterances.
+ - Creates training and validation lists for x-vector training + + - `run_005_train_dino.sh` + - Trains DINO embeddings + + - `run_006_extract_dino_embeds_cluster_eval.sh` + - Extracts DINO embeddings for Vox2 and Vox1 + - Evaluates SV metrics on Vox1-O/E/H using Cosine Scoring + - Clusters Vox2 embeddings into pseudo-speakers + - Trains PLDA on Vox2 pseudo-speakers + - Evaluates SV metrics on Vox1-O/E/H using PLDA + + - `run_007_train_xvector.sh` + - Fine-tunes the DINO model in x-vector style using the pseudo-labels from the previous step + - First, it fine-tunes the x-vector projection and output layer with the rest of the network frozen + - Second, it fine-tunes the full network + + - `run_008_extract_ft1_xvec_embeds_cluster_eval.sh` + - Extracts x-vector embeddings for Vox2 and Vox1 + - Evaluates SV metrics on Vox1-O/E/H using Cosine Scoring + - Clusters Vox2 embeddings into pseudo-speakers + - Trains PLDA on Vox2 pseudo-speakers + - Evaluates SV metrics on Vox1-O/E/H using PLDA + + - `run_009_finetune_xvector_s2.sh` + - Fine-tunes the x-vector model using the pseudo-labels from the previous step + - First, it fine-tunes the x-vector projection and output layer with the rest of the network frozen + - Second, it fine-tunes the full network + + - `run_010_extract_ft2_xvec_embeds_cluster_eval.sh` + - Extracts x-vector embeddings for Vox2 and Vox1 + - Evaluates SV metrics on Vox1-O/E/H using Cosine Scoring + - Clusters Vox2 embeddings into pseudo-speakers + - Trains PLDA on Vox2 pseudo-speakers + - Evaluates SV metrics on Vox1-O/E/H using PLDA + + +## Results + +### VoxCeleb 1 Original-Clean trial list + +| Config | Model Type | DINO Clustering | X-Vector Clustering | Stage | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | --------------- | ------------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_lresnet34.v1.2.sh | LResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 3.96 | 0.276 | 0.423 | +| | | | | | PLDA | 3.18 | 0.182 | 0.273 | +| | | | | FT-1 | Cosine | 1.97 | 0.139 | 0.214 | +| | | | | FT-2 | Cosine | 1.80 | 0.133 | 0.200 | +| config_fbank80_stmn_lresnet34.v1.2.1.sh | LResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 1.75 | 0.124 | 0.197 | +| config_fbank80_stmn_ecapatdnn512x3.v1.2.sh | ECAPA-TDNN 512x3 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 4.14 | 0.274 | 0.405 | +| | | | | | PLDA | 4.16 | 0.225 | 0.361 | +| | | | | FT-1 | Cosine | 2.68 | 0.173 | 0.258 | +| | | | | FT-2 | Cosine | 2.57 | 0.151 | 0.244 | +| config_fbank80_stmn_ecapatdnn512x3.v1.2.1.sh | ECAPA-TDNN 512x3 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 2.71 | 0.169 | 0.243 | +| config_fbank80_stmn_fwseresnet34.v1.2.sh | FW-SE ResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 4.57 | 0.344 | 0.553 | +| | | | | | PLDA | 2.92 | 0.232 | 0.410 | +| | | | | FT-1 | Cosine | 2.11 | 0.135 | 0.223 | +| | | | | FT-1 | PLDA | 1.75 | 0.137 | 0.236 | +| | | | | FT-2 | Cosine | 1.65 | 0.116 | 0.168 | +| | | | | FT-2 | PLDA | 1.67 | 0.137 | 0.193 | +| config_fbank80_stmn_fwseresnet34.v1.2.1.sh | FW-SE ResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 1.49 | 0.101 | 0.161 | +| | | | | FT-2 | PLDA | 1.53 | 0.109 | 0.168 | +| config_fbank80_stmn_fwseresnet34.v1.2.2.sh | FW-SE ResNet34 / 0.1 x Cos Reg.
| Cos+AHC+PLDA+AHC | Cos+AHC | DINO | Cosine | 3.96 | 0.232 | 0.358 | +| | | | | | PLDA | 4.04 | 0.185 | 0.291 | +| | | | | FT-1 | Cosine | 2.03 | 0.125 | 0.203 | +| | | | | FT-1 | PLDA | 2.44 | 0.149 | 0.231 | +| | | | | FT-2 | Cosine | 1.88 | 0.115 | 0.198 | +| | | | | FT-2 | PLDA | 2.57 | 0.147 | 0.234 | + + + +### VoxCeleb 1 Entire-Clean trial list + +| Config | Model Type | DINO Clustering | X-Vector Clustering | Stage | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | --------------- | ------------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_lresnet34.v1.2.sh | LResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 4.94 | 0.304 | 0.483 | +| | | | | | PLDA | 3.72 | 0.184 | 0.300 | +| | | | | FT-1 | Cosine | 2.35 | 0.136 | 0.217 | +| | | | | FT-2 | Cosine | 2.02 | 0.118 | 0.195 | +| config_fbank80_stmn_lresnet34.v1.2.1.sh | LResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 1.98 | 0.116 | 0.185 | +| config_fbank80_stmn_ecapatdnn512x3.v1.2.sh | ECAPA-TDNN 512x3 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 4.61 | 0.293 | 0.455 | +| | | | | | PLDA | 3.91 | 0.223 | 0.356 | +| | | | | FT-1 | Cosine | 3.04 | 0.168 | 0.263 | +| | | | | FT-2 | Cosine | 2.83 | 0.155 | 0.248 | +| config_fbank80_stmn_ecapatdnn512x3.v1.2.1.sh | ECAPA-TDNN 512x3 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 3.06 | 0.164 | 0.256 | +| config_fbank80_stmn_fwseresnet34.v1.2.sh | FW-SE ResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 5.50 | 0.426 | 0.664 | +| | | | | | PLDA | 3.33 | 0.245 | 0.425 | +| | | | | FT-1 | Cosine | 2.42 | 0.147 | 0.243 | +| | | | | FT-1 | PLDA | 2.03 | 0.144 | 0.255 | +| | | | | FT-2 | Cosine | 1.86 | 0.112 | 0.186 | +| | | | | FT-2 | PLDA | 1.77 | 0.121 | 0.208 | +| config_fbank80_stmn_fwseresnet34.v1.2.1.sh | FW-SE ResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 1.83 | 0.106 | 0.170 | +| | | | | FT-2 | PLDA | 1.68 | 0.109 | 0.188 | +| config_fbank80_stmn_fwseresnet34.v1.2.2.sh | FW-SE ResNet34 / 0.1 x Cos Reg.
| Cos+AHC+PLDA+AHC | Cos+AHC | DINO | Cosine | 4.31 | 0.250 | 0.387 | +| | | | | | PLDA | 4.32 | 0.166 | 0.263 | +| | | | | FT-1 | Cosine | 2.61 | 0.138 | 0.210 | +| | | | | FT-1 | PLDA | 2.72 | 0.137 | 0.216 | +| | | | | FT-2 | Cosine | 2.41 | 0.121 | 0.193 | +| | | | | FT-2 | PLDA | 2.82 | 0.140 | 0.219 | + + + +### VoxCeleb 1 Hard-Clean trial list + +| Config | Model Type | DINO Clustering | X-Vector Clustering | Stage | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | --------------- | ------------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_lresnet34.v1.2.sh | LResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 8.33 | 0.462 | 0.664 | +| | | | | | PLDA | 5.91 | 0.304 | 0.481 | +| | | | | FT-1 | Cosine | 3.89 | 0.215 | 0.340 | +| | | | | FT-2 | Cosine | 3.44 | 0.192 | 0.303 | +| config_fbank80_stmn_lresnet34.v1.2.1.sh | LResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 3.33 | 0.185 | 0.290 | +| config_fbank80_stmn_ecapatdnn512x3.v1.2.sh | ECAPA-TDNN 512x3 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 8.38 | 0.458 | 0.635 | +| | | | | | PLDA | 6.48 | 0.360 | 0.532 | +| | | | | FT-1 | Cosine | 4.93 | 0.259 | 0.383 | +| | | | | FT-2 | Cosine | 4.73 | 0.251 | 0.375 | +| config_fbank80_stmn_ecapatdnn512x3.v1.2.1.sh | ECAPA-TDNN 512x3 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 4.90 | 0.251 | 0.378 | +| config_fbank80_stmn_fwseresnet34.v1.2.sh | FW-SE ResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 10.9 | 0.644 | 0.822 | +| | | | | | PLDA | 6.86 | 0.481 | 0.745 | +| | | | | FT-1 | Cosine | 4.35 | 0.250 | 0.393 | +| | | | | FT-1 | PLDA | 4.21 | 0.281 | 0.452 | +| | | | | FT-2 | Cosine | 3.37 | 0.194 | 0.309 | +| | | | | FT-2 | PLDA | 3.51 | 0.219 | 0.351 | +| config_fbank80_stmn_fwseresnet34.v1.2.1.sh | FW-SE ResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 3.11 | 0.172 | 0.270 | +| | | | | FT-2 | PLDA | 3.15 | 0.186 | 0.294 | +| config_fbank80_stmn_fwseresnet34.v1.2.2.sh | FW-SE ResNet34 / 0.1 x Cos Reg. | Cos+AHC+PLDA+AHC | Cos+AHC | DINO | Cosine | 7.41 | 0.377 | 0.526 | +| | | | | | PLDA | 5.95 | 0.269 | 0.438 | +| | | | | FT-1 | Cosine | 4.38 | 0.222 | 0.337 | +| | | | | FT-1 | PLDA | 4.68 | 0.237 | 0.375 | +| | | | | FT-2 | Cosine | 4.07 | 0.197 | 0.301 | +| | | | | FT-2 | PLDA | 4.75 | 0.229 | 0.352 | diff --git a/egs/voxceleb/ssl.v1/cmd.sh b/egs/voxceleb/ssl.v1/cmd.sh new file mode 100755 index 00000000..4efc96e1 --- /dev/null +++ b/egs/voxceleb/ssl.v1/cmd.sh @@ -0,0 +1,28 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
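+# For example (a hypothetical local setup, not part of the original recipe), to +# run all jobs on the local machine without a grid engine you could set: +# export train_cmd="run.pl" +# export cuda_cmd="run.pl" +# export cuda_eval_cmd="run.pl"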
+ +if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 40G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/voxceleb/ssl.v1/conf/clsp.conf b/egs/voxceleb/ssl.v1/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_cos_ahc_plda_ahc.yaml b/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_cos_ahc_plda_ahc.yaml new file mode 100644 index 00000000..fb6673df --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_cos_ahc_plda_ahc.yaml @@ -0,0 +1,18 @@ +pca: + pca_var_r: 0.995 +pre_kmeans: + samples_per_cluster: 4 + epochs: 10 + rtol: 0.01 + init_method: random +stop_criterion: threshold +threshold_stage_1: 0.875 +threshold_stage_2: -100. 
+plda: + plda_type: splda + y_dim: 100 +max_samples_per_cluster: 50 +min_samples_per_cluster: 8 +ahc_precision: single +num_workers: 32 +filter_by_gmm_post: 0.9 diff --git a/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc.yaml b/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc.yaml new file mode 100644 index 00000000..c1bf8c94 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc.yaml @@ -0,0 +1,12 @@ +pca: + pca_var_r: 0.995 +pre_kmeans: + samples_per_cluster: 4 + epochs: 10 + rtol: 0.01 + init_method: random +stop_criterion: threshold +threshold: 0.8 +ahc_precision: single +num_workers: 32 +filter_by_gmm_post: 0.9 diff --git a/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc_plda_ahc.yaml b/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc_plda_ahc.yaml new file mode 100644 index 00000000..1a04d084 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc_plda_ahc.yaml @@ -0,0 +1,20 @@ +pca: + pca_var_r: 0.995 +pre_kmeans: + samples_per_cluster: 4 + epochs: 10 + rtol: 0.01 + init_method: random +stop_criterion: threshold +#threshold_stage_1: 0.75 +#threshold_stage_2: 25 +threshold_stage_1: 0.8 +threshold_stage_2: 30 +plda: + plda_type: splda + y_dim: 100 +max_samples_per_cluster: 50 +min_samples_per_cluster: 8 +ahc_precision: single +num_workers: 32 +filter_by_gmm_post: 0.9 diff --git a/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml b/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml new file mode 100644 index 00000000..3740d0e7 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml @@ -0,0 +1,18 @@ +pca: + pca_var_r: 0.99 +pre_kmeans: + samples_per_cluster: 4 + epochs: 10 + rtol: 0.01 + init_method: random +stop_criterion: threshold +threshold_stage_1: 0.80 +threshold_stage_2: -400 +plda: + plda_type: splda + y_dim: 100 +max_samples_per_cluster: 50 +min_samples_per_cluster: 8 +ahc_precision: single +num_workers: 32 +filter_by_gmm_post: 0.9 diff --git a/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml b/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml new file mode 100644 index 00000000..9c446a2e --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml @@ -0,0 +1,12 @@ +pca: + pca_var_r: 0.99 +pre_kmeans: + samples_per_cluster: 4 + epochs: 10 + rtol: 0.01 + init_method: random +stop_criterion: threshold +threshold: 0.6 +ahc_precision: single +num_workers: 32 +filter_by_gmm_post: 0.9 diff --git a/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml b/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml new file mode 100644 index 00000000..cf3adf41 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml @@ -0,0 +1,18 @@ +pca: + pca_var_r: 0.99 +pre_kmeans: + samples_per_cluster: 4 + epochs: 10 + rtol: 0.01 + init_method: random +stop_criterion: threshold +threshold_stage_1: 0.6 +threshold_stage_2: 0 +plda: + plda_type: splda + y_dim: 100 +max_samples_per_cluster: 50 +min_samples_per_cluster: 8 +ahc_precision: single +num_workers: 32 +filter_by_gmm_post: 0.9 diff --git a/egs/voxceleb/ssl.v1/conf/coe_gpu_bigmem.conf b/egs/voxceleb/ssl.v1/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y 
-sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/voxceleb/ssl.v1/conf/coe_gpu_long.conf b/egs/voxceleb/ssl.v1/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/voxceleb/ssl.v1/conf/coe_gpu_rtx.conf b/egs/voxceleb/ssl.v1/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/voxceleb/ssl.v1/conf/coe_gpu_short.conf b/egs/voxceleb/ssl.v1/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/voxceleb/ssl.v1/conf/coe_gpu_v100.conf b/egs/voxceleb/ssl.v1/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/voxceleb/ssl.v1/conf/fbank80_specaug1_stmn_16k.yaml b/egs/voxceleb/ssl.v1/conf/fbank80_specaug1_stmn_16k.yaml new file mode 100644 index 00000000..8df42fc6 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/fbank80_specaug1_stmn_16k.yaml @@ -0,0 +1,24 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 16000 + frame_length: 25 + low_freq: 20 + high_freq: 7600 + num_filters: 80 + snip_edges: false + use_energy: false +spec_augment: + time_mask_prob: 1. 
+ time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. + freq_mask_min_width: 0 + freq_mask_max_width: 8 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +mvn: + context: 150 + norm_var: false diff --git a/egs/voxceleb/ssl.v1/conf/plda.yaml b/egs/voxceleb/ssl.v1/conf/plda.yaml new file mode 100644 index 00000000..bbb8f051 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/plda.yaml @@ -0,0 +1,11 @@ +class_name: cluster +pca: + #pca_var_r: 0.975 + pca_var_r: 0.99 +do_lda: true +lda: + lda_dim: 120 +plda: + plda_type: splda + y_dim: 100 + epochs: 20 diff --git a/egs/voxceleb/ssl.v1/conf/reverb_noise_aug.yaml b/egs/voxceleb/ssl.v1/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..86f55073 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/reverb_noise_aug.yaml @@ -0,0 +1,34 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: csv:data/rirs_smallroom/rirs.csv + rir_norm: max + mediumroom: + weight: 1 + rir_path: csv:data/rirs_mediumroom/rirs.csv + rir_norm: max + realroom: + weight: 1 + rir_path: csv:data/rirs_real/rirs.csv + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/recordings.csv + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/recordings.csv + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/recordings.csv + min_snr: 3 + max_snr: 18 diff --git a/egs/voxceleb/ssl.v1/conf/teacher_reverb_noise_aug.yaml b/egs/voxceleb/ssl.v1/conf/teacher_reverb_noise_aug.yaml new file mode 100644 index 00000000..6c2fecc0 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/teacher_reverb_noise_aug.yaml @@ -0,0 +1,26 @@ +reverb_aug: + reverb_prob: 0.3 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: csv:data/rirs_smallroom/rirs.csv + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/recordings.csv + min_snr: 10 + max_snr: 28 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/recordings.csv + min_snr: 13 + max_snr: 28 + babble: + weight: 1 + noise_path: data/musan_speech_babble/recordings.csv + min_snr: 13 + max_snr: 28 diff --git a/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_dino_v1.1.yaml b/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_dino_v1.1.yaml new file mode 100644 index 00000000..5dec90f3 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_dino_v1.1.yaml @@ -0,0 +1,115 @@ +data: + train: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 + val: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. 
+ num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 +student_model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + dropout_rate: 0.002 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + dropout_rate: 0.0 + norm_before: false + hid_act: swish + head_type: dino + embed_dim: 192 + num_embed_layers: 3 + loss_type: softmax + head_use_norm: true + head_hid_dim: 768 + head_bottleneck_dim: 192 + proj_head_use_norm: true + proj_head_norm_before: false +teacher_model: + xvector: + override_dropouts: true + dropout_rate: 0.0 +dino_loss: + num_classes: 65536 + temp_warmup_epochs: 0 + teacher_temp: 0.04 +trainer: + optim: + opt_type: adamw + lr: 0.005 + amsgrad: false + beta1: 0.9 + beta2: 0.99 + weight_decay: 1e-1 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 15000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + teacher_optim: + init_momentum: 0.996 + momentum: 1.0 + warmup_steps: 500000 + grad_clip: 25 + use_amp: false + log_interval: 1000 + epochs: 140 + eff_batch_size: 256 + train_mode: full + freeze_output_layer_steps: 1500 diff --git a/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_dino_v1.2.yaml b/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_dino_v1.2.yaml new file mode 100644 index 00000000..ae9ab1fa --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_dino_v1.2.yaml @@ -0,0 +1,115 @@ +data: + train: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 + val: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. 
+ num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 +student_model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + dropout_rate: 0.002 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + dropout_rate: 0.0 + norm_before: false + hid_act: swish + head_type: dino + embed_dim: 192 + num_embed_layers: 3 + loss_type: softmax + head_use_norm: true + head_hid_dim: 768 + head_bottleneck_dim: 192 + proj_head_use_norm: true + proj_head_norm_before: false +teacher_model: + xvector: + override_dropouts: true + dropout_rate: 0.0 +dino_loss: + num_classes: 65536 + temp_warmup_epochs: 0 + teacher_temp: 0.04 +trainer: + optim: + opt_type: adamw + lr: 0.005 + amsgrad: false + beta1: 0.9 + beta2: 0.99 + weight_decay: 1e-1 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 15000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + teacher_optim: + init_momentum: 0.996 + momentum: 1.0 + warmup_steps: 500000 + grad_clip: 15 + use_amp: false + log_interval: 1000 + epochs: 120 + eff_batch_size: 256 + train_mode: full + freeze_output_layer_steps: 1500 diff --git a/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_xvec_stage1.1_v1.2.yaml b/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_xvec_stage1.1_v1.2.yaml new file mode 100644 index 00000000..480ae04f --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_xvec_stage1.1_v1.2.yaml @@ -0,0 +1,68 @@ +data: + train: + dataset: + class_names: + - cluster + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - cluster + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: cluster + seg_weight_mode: data-prior + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - cluster + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - cluster + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: cluster + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 10 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: false + log_interval: 1000 + epochs: 30 + eff_batch_size: 256 + target_key: cluster + train_mode: ft-embed-affine + \ No newline at end of file diff --git a/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_xvec_stage1.2_v1.2.yaml b/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_xvec_stage1.2_v1.2.yaml new file mode 100644 
index 00000000..8a7a700c --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_xvec_stage1.2_v1.2.yaml @@ -0,0 +1,68 @@ +data: + train: + dataset: + class_names: + - cluster + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - cluster + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: cluster + seg_weight_mode: data-prior + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - cluster + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - cluster + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: cluster + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 10 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 256 + target_key: cluster + train_mode: full + \ No newline at end of file diff --git a/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.2.yaml b/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.2.yaml new file mode 100644 index 00000000..a12e05f0 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.2.yaml @@ -0,0 +1,97 @@ +data: + train: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 + val: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. 
+ num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 +student_model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + dropout_rate: 0.01 + norm_before: false + hid_act: swish + se_r: 4 + head_type: dino + embed_dim: 192 + num_embed_layers: 3 + loss_type: softmax + head_use_norm: true + head_hid_dim: 768 + head_bottleneck_dim: 192 + proj_head_use_norm: true + proj_head_norm_before: false +teacher_model: + xvector: + override_dropouts: true + dropout_rate: 0.0 +dino_loss: + num_classes: 65536 + temp_warmup_epochs: 0 + teacher_temp: 0.04 +cosine_loss: + warmup_epochs: 20 + scale: 0.1 +trainer: + optim: + opt_type: adamw + lr: 0.005 + amsgrad: false + beta1: 0.9 + beta2: 0.99 + weight_decay: 1e-1 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 15000 + min_lr: 1.0e-04 + warmup_steps: 15000 + update_lr_on_opt_step: true + teacher_optim: + init_momentum: 0.996 + momentum: 1.0 + warmup_steps: 500000 + # grad_clip: 15 + grad_clip: 5 + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 256 + train_mode: full + freeze_output_layer_steps: 1500 diff --git a/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.yaml b/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.yaml new file mode 100644 index 00000000..24d09678 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.yaml @@ -0,0 +1,93 @@ +data: + train: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 + val: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. 
+ num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 +student_model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + dropout_rate: 0.01 + norm_before: false + hid_act: swish + se_r: 4 + head_type: dino + embed_dim: 192 + num_embed_layers: 3 + loss_type: softmax + head_use_norm: true + head_hid_dim: 768 + head_bottleneck_dim: 192 + proj_head_use_norm: true + proj_head_norm_before: false +teacher_model: + xvector: + override_dropouts: true + dropout_rate: 0.0 +dino_loss: + num_classes: 65536 + temp_warmup_epochs: 0 + teacher_temp: 0.04 +trainer: + optim: + opt_type: adamw + lr: 0.005 + amsgrad: false + beta1: 0.9 + beta2: 0.99 + weight_decay: 1e-1 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 15000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + teacher_optim: + init_momentum: 0.996 + momentum: 1.0 + warmup_steps: 500000 + grad_clip: 15 + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 256 + train_mode: full + freeze_output_layer_steps: 1500 diff --git a/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.0.yaml b/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.0.yaml new file mode 100644 index 00000000..cb82c539 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.0.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: true + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 8.0 + min_chunk_length: 4.0 + data_loader: + num_workers: 8 + val: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. 
+ num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: true + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 8.0 + min_chunk_length: 4.0 + data_loader: + num_workers: 8 +student_model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + dropout_rate: 0.01 + norm_before: false + hid_act: swish + head_type: dino + embed_dim: 192 + num_embed_layers: 3 + loss_type: softmax + head_use_norm: true + head_hid_dim: 768 + head_bottleneck_dim: 192 + proj_head_use_norm: true + proj_head_norm_before: false +teacher_model: + xvector: + override_dropouts: true + dropout_rate: 0.0 +dino_loss: + num_classes: 65536 + temp_warmup_epochs: 0 + teacher_temp: 0.04 +trainer: + optim: + opt_type: adamw + lr: 0.0025 + amsgrad: false + beta1: 0.9 + beta2: 0.99 + weight_decay: 1e-1 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 15000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + teacher_optim: + init_momentum: 0.996 + momentum: 1.0 + warmup_steps: 500000 + grad_clip: 25 + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 256 + train_mode: full + freeze_output_layer_steps: 1500 diff --git a/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.1.yaml b/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.1.yaml new file mode 100644 index 00000000..ac185913 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.1.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 + val: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. 
+ num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 +student_model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + dropout_rate: 0.01 + norm_before: false + hid_act: swish + head_type: dino + embed_dim: 192 + num_embed_layers: 3 + loss_type: softmax + head_use_norm: true + head_hid_dim: 768 + head_bottleneck_dim: 192 + proj_head_use_norm: true + proj_head_norm_before: false +teacher_model: + xvector: + override_dropouts: true + dropout_rate: 0.0 +dino_loss: + num_classes: 65536 + temp_warmup_epochs: 0 + teacher_temp: 0.04 +trainer: + optim: + opt_type: adamw + lr: 0.0025 + amsgrad: false + beta1: 0.9 + beta2: 0.99 + weight_decay: 1e-1 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 15000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + teacher_optim: + init_momentum: 0.996 + momentum: 1.0 + warmup_steps: 500000 + grad_clip: 25 + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 256 + train_mode: full + freeze_output_layer_steps: 1500 diff --git a/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.2.yaml b/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.2.yaml new file mode 100644 index 00000000..fa6466ce --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.2.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 + val: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. 
+ num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 +student_model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + dropout_rate: 0.01 + norm_before: false + hid_act: swish + head_type: dino + embed_dim: 192 + num_embed_layers: 3 + loss_type: softmax + head_use_norm: true + head_hid_dim: 768 + head_bottleneck_dim: 192 + proj_head_use_norm: true + proj_head_norm_before: false +teacher_model: + xvector: + override_dropouts: true + dropout_rate: 0.0 +dino_loss: + num_classes: 65536 + temp_warmup_epochs: 0 + teacher_temp: 0.04 +trainer: + optim: + opt_type: adamw + lr: 0.0025 + amsgrad: false + beta1: 0.9 + beta2: 0.99 + weight_decay: 1e-1 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 15000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + teacher_optim: + init_momentum: 0.996 + momentum: 1.0 + warmup_steps: 500000 + grad_clip: 15 + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 256 + train_mode: full + freeze_output_layer_steps: 1500 diff --git a/egs/voxceleb/ssl.v1/conf/train_lresnet34_xvec_stage1.1_v1.2.yaml b/egs/voxceleb/ssl.v1/conf/train_lresnet34_xvec_stage1.1_v1.2.yaml new file mode 100644 index 00000000..945fd42b --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_lresnet34_xvec_stage1.1_v1.2.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - cluster + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - cluster + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: cluster + seg_weight_mode: data-prior + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - cluster + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - cluster + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: cluster + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 10 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.1 +trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 256 + target_key: cluster + train_mode: ft-embed-affine + \ No newline at end of file diff --git a/egs/voxceleb/ssl.v1/conf/train_lresnet34_xvec_stage1.2_v1.2.yaml b/egs/voxceleb/ssl.v1/conf/train_lresnet34_xvec_stage1.2_v1.2.yaml new file mode 100644 index 00000000..e8fd36a2 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_lresnet34_xvec_stage1.2_v1.2.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - cluster + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - cluster + sampler: + sampler_type: 
class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: cluster + seg_weight_mode: data-prior + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - cluster + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - cluster + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: cluster + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 10 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.01 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 70 + eff_batch_size: 256 + target_key: cluster + train_mode: full + \ No newline at end of file diff --git a/egs/voxceleb/ssl.v1/datapath.sh b/egs/voxceleb/ssl.v1/datapath.sh new file mode 100644 index 00000000..a7eb575c --- /dev/null +++ b/egs/voxceleb/ssl.v1/datapath.sh @@ -0,0 +1,23 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + # voxceleb1_root=/export/corpora5/VoxCeleb1_v1 #voxceleb1 v1 + voxceleb1_root=/export/corpora5/VoxCeleb1_v2 #voxceleb1 v2 + voxceleb2_root=/export/corpora5/VoxCeleb2 + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 + voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 + voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + voxsrc22_root=/exp/jvillalba/corpora/voxsrc22 + musan_root=/expscratch/dgromero/corpora-open/musan +else + echo "Put your database paths here" + exit 1 +fi + + diff --git a/egs/voxceleb/ssl.v1/default_config.sh b/egs/voxceleb/ssl.v1/default_config.sh new file mode 120000 index 00000000..f8aa12d5 --- /dev/null +++ b/egs/voxceleb/ssl.v1/default_config.sh @@ -0,0 +1 @@ +global_conf/config_fbank80_stmn_fwseresnet34.v1.2.1.sh \ No newline at end of file diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v1.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v1.2.sh new file mode 100644 index 00000000..0ecf904d --- /dev/null +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v1.2.sh @@ -0,0 +1,68 @@ +# ECAPA-TDNN 512x3 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn512x3_dino.v1.2 + +nnet_s1_base_cfg=conf/train_ecapatdnn512x3_dino_v1.2.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/teacher_model_ep0120.pth + +# clustering of dino embeddings +cluster_method=cos_ahc_plda_ahc +cluster_cfg=conf/cluster_ecapatdnn512x3_v1.2_cos_ahc_plda_ahc.yaml +cluster_name=${cluster_method} +cluster_dir=exp/clustering/$nnet_s1_name/$cluster_name + +# plda +plda_cfg=conf/plda.yaml + +# 
finetuning stage 1.1 +nnet_ft_s1_1_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1.1_v1.2.yaml +nnet_ft_s1_1_name=$nnet_name.s1.ft.s1.1 +nnet_ft_s1_1_dir=exp/xvector_nnets/$nnet_ft_s1_1_name +nnet_ft_s1_1=$nnet_ft_s1_1_dir/model_ep0030.pth + +# finetuning stage 1.2 +nnet_ft_s1_2_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1.2_v1.2.yaml +nnet_ft_s1_2_name=$nnet_name.s1.ft.s1.2 +nnet_ft_s1_2_dir=exp/xvector_nnets/$nnet_ft_s1_2_name +nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 1.2 +cluster_ft_s1_method=cos_ahc_plda_ahc +cluster_ft_s1_cfg=conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc_plda_ahc.yaml +cluster_ft_s1_name=${cluster_ft_s1_method} +cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name + + +# finetuning stage 2.1 +nnet_ft_s2_1_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1.1_v1.2.yaml +nnet_ft_s2_1_name=$nnet_name.s1.ft.s2.1 +nnet_ft_s2_1_dir=exp/xvector_nnets/$nnet_ft_s2_1_name +nnet_ft_s2_1=$nnet_ft_s2_1_dir/model_ep0030.pth + +# finetuning stage 2.2 +nnet_ft_s2_2_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1.2_v1.2.yaml +nnet_ft_s2_2_name=$nnet_name.s1.ft.s2.2 +nnet_ft_s2_2_dir=exp/xvector_nnets/$nnet_ft_s2_2_name +nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 2.2 +cluster_ft_s2_method=cos_ahc_plda_ahc +cluster_ft_s2_cfg=conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc_plda_ahc.yaml +cluster_ft_s2_name=${cluster_ft_s2_method} +cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name + + diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.1.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.1.sh new file mode 100644 index 00000000..c4f5c8c7 --- /dev/null +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.1.sh @@ -0,0 +1,66 @@ +# FW-SE ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_fwseresnet34_dino.v1.2 + +nnet_s1_base_cfg=conf/train_fwseresnet34_dino_v1.2.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/teacher_model_ep0034.pth + +# clustering of dino embeddings +cluster_method=cos_ahc_plda_ahc +cluster_cfg=conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml +cluster_name=${cluster_method} +cluster_dir=exp/clustering/$nnet_s1_name/$cluster_name + +# plda +plda_cfg=conf/plda.yaml + +# finetuning stage 1.1 +nnet_ft_s1_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s1_1_name=$nnet_name.s1.ft.s1.1 +nnet_ft_s1_1_dir=exp/xvector_nnets/$nnet_ft_s1_1_name +nnet_ft_s1_1=$nnet_ft_s1_1_dir/model_ep0030.pth + +# finetuning stage 1.2 +nnet_ft_s1_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s1_2_name=$nnet_name.s1.ft.s1.2 +nnet_ft_s1_2_dir=exp/xvector_nnets/$nnet_ft_s1_2_name +nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 1.2 +cluster_ft_s1_method=cos_ahc +cluster_ft_s1_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml +cluster_ft_s1_name=${cluster_ft_s1_method} +cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name + +# finetuning stage 2.1 +nnet_ft_s2_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s2_1_name=$nnet_name.1.s1.ft.s2.1 +nnet_ft_s2_1_dir=exp/xvector_nnets/$nnet_ft_s2_1_name
+nnet_ft_s2_1=$nnet_ft_s2_1_dir/model_ep0030.pth + +# finetuning stage 2.2 +nnet_ft_s2_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s2_2_name=$nnet_name.1.s1.ft.s2.2 +nnet_ft_s2_2_dir=exp/xvector_nnets/$nnet_ft_s2_2_name +nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 2.2 +cluster_ft_s2_method=cos_ahc +cluster_ft_s2_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml +cluster_ft_s2_name=${cluster_ft_s2_method} +cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name + diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh new file mode 100644 index 00000000..11aab111 --- /dev/null +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh @@ -0,0 +1,66 @@ +# FW-SE ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_fwseresnet34_dino.v1.2.2 + +nnet_s1_base_cfg=conf/train_fwseresnet34_dino_v1.2.2.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/teacher_model_ep0100.pth + +# clustering of dino embeddings +cluster_method=cos_ahc_plda_ahc +cluster_cfg=conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml +cluster_name=${cluster_method} +cluster_dir=exp/clustering/$nnet_s1_name/$cluster_name + +# plda +plda_cfg=conf/plda.yaml + +# finetuning stage 1.1 +nnet_ft_s1_1_base_cfg=conf/train_fwseresnet34_xvec_stage1.1_v1.2.2.yaml +nnet_ft_s1_1_name=$nnet_name.s1.ft.s1.1 +nnet_ft_s1_1_dir=exp/xvector_nnets/$nnet_ft_s1_1_name +nnet_ft_s1_1=$nnet_ft_s1_1_dir/model_ep0025.pth + +# finetuning stage 1.2 +nnet_ft_s1_2_base_cfg=conf/train_fwseresnet34_xvec_stage1.2_v1.2.2.yaml +nnet_ft_s1_2_name=$nnet_name.s1.ft.s1.2 +nnet_ft_s1_2_dir=exp/xvector_nnets/$nnet_ft_s1_2_name +nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 1.2 +cluster_ft_s1_method=cos_ahc +cluster_ft_s1_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml +cluster_ft_s1_name=${cluster_ft_s1_method} +cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name + +# finetuning stage 2.1 +nnet_ft_s2_1_base_cfg=conf/train_fwseresnet34_xvec_stage1.1_v1.2.2.yaml +nnet_ft_s2_1_name=$nnet_name.s1.ft.s2.1 +nnet_ft_s2_1_dir=exp/xvector_nnets/$nnet_ft_s2_1_name +nnet_ft_s2_1=$nnet_ft_s2_1_dir/model_ep0025.pth + +# finetuning stage 2.2 +nnet_ft_s2_2_base_cfg=conf/train_fwseresnet34_xvec_stage1.2_v1.2.2.yaml +nnet_ft_s2_2_name=$nnet_name.s1.ft.s2.2 +nnet_ft_s2_2_dir=exp/xvector_nnets/$nnet_ft_s2_2_name +nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 2.2 +cluster_ft_s2_method=cos_ahc +cluster_ft_s2_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml +cluster_ft_s2_name=${cluster_ft_s2_method} +cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name + diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.sh new file mode 100644 index 00000000..9fecaa96 --- /dev/null +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.sh @@ -0,0 +1,66 @@ +# FW-SE ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +#
x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_fwseresnet34_dino.v1.2 + +nnet_s1_base_cfg=conf/train_fwseresnet34_dino_v1.2.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/teacher_model_ep0034.pth + +# clustering of dino embeddings +cluster_method=cos_ahc_plda_ahc +cluster_cfg=conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml +cluster_name=${cluster_method} +cluster_dir=exp/clustering/$nnet_s1_name/$cluster_name + +# plda +plda_cfg=conf/plda.yaml + +# finetuning stage 1.1 +nnet_ft_s1_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s1_1_name=$nnet_name.s1.ft.s1.1 +nnet_ft_s1_1_dir=exp/xvector_nnets/$nnet_ft_s1_1_name +nnet_ft_s1_1=$nnet_ft_s1_1_dir/model_ep0030.pth + +# finetuning stage 1.2 +nnet_ft_s1_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s1_2_name=$nnet_name.s1.ft.s1.2 +nnet_ft_s1_2_dir=exp/xvector_nnets/$nnet_ft_s1_2_name +nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 1.2 +cluster_ft_s1_method=cos_ahc_plda_ahc +cluster_ft_s1_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml +cluster_ft_s1_name=${cluster_fs_s1_method} +cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name + +# finetuning stage 2.1 +nnet_ft_s2_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s2_1_name=$nnet_name.s1.ft.s2.1 +nnet_ft_s2_1_dir=exp/xvector_nnets/$nnet_ft_s2_1_name +nnet_ft_s2_1=$nnet_ft_s2_1_dir/model_ep0030.pth + +# finetuning stage 2.2 +nnet_ft_s2_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s2_2_name=$nnet_name.s1.ft.s2.2 +nnet_ft_s2_2_dir=exp/xvector_nnets/$nnet_ft_s2_2_name +nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 2.2 +cluster_ft_s2_method=cos_ahc_plda_ahc +cluster_ft_s2_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml +cluster_ft_s2_name=${cluster_ft_s2_method} +cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name + diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.0.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.0.sh new file mode 100644 index 00000000..0a621148 --- /dev/null +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.0.sh @@ -0,0 +1,52 @@ +# ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_lresnet34_dino.v1.0 + +nnet_s1_base_cfg=conf/train_lresnet34_dino_v1.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/teacher_model_ep0060.pth + +nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# clustering +cluster_method=cos_ahc +cluster_name=${cluster_method}_1 +cluster_cfg=conf/ahc.yaml + +# plda +plda_cfg=conf/plda.yaml + +# back-end +do_plda=false +# do_snorm=true +# do_qmf=true +# do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git 
a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.1.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.1.sh new file mode 100644 index 00000000..18fafd95 --- /dev/null +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.1.sh @@ -0,0 +1,60 @@ +# ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_lresnet34_dino.v1.1 + +nnet_s1_base_cfg=conf/train_lresnet34_dino_v1.1.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/teacher_model_ep0080.pth + +# clustering +cluster_method=cos_ahc +cluster_name=${cluster_method}_1 +cluster_cfg=conf/ahc.yaml + +# plda +plda_cfg=conf/plda.yaml + +# finetuning stage 1.1 +nnet_ft_s1_1_base_cfg=conf/train_lresnet34_stage1.1_v1.1.yaml +nnet_ft_s1_1_name=$nnet_name.s1.ft.s1.1 +nnet_ft_s1_1_dir=exp/xvector_nnets/$nnet_ft_s1_1_name +nnet_ft_s1_1=$nnet_ft_s1_1_dir/model_ep0010.pth + +# finetuning stage 1.2 +nnet_ft_s1_2_base_cfg=conf/train_lresnet34_stage1.2_v1.1.yaml +nnet_ft_s1_2_name=$nnet_name.s1.ft.s1.2 +nnet_ft_s1_2_dir=exp/xvector_nnets/$nnet_ft_s1_2_name +nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0080.pth + + +# # back-end +# do_plda=false +# # do_snorm=true +# # do_qmf=true +# # do_voxsrc22=true + +# plda_aug_config=conf/reverb_noise_aug.yaml +# plda_num_augs=0 +# if [ $plda_num_augs -eq 0 ]; then +# plda_data=voxceleb2cat_train +# else +# plda_data=voxceleb2cat_train_augx${plda_num_augs} +# fi +# plda_type=splda +# lda_dim=200 +# plda_y_dim=150 +# plda_z_dim=200 + diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.1.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.1.sh new file mode 100644 index 00000000..7b822cf4 --- /dev/null +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.1.sh @@ -0,0 +1,65 @@ +# ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_lresnet34_dino.v1.2 + +nnet_s1_base_cfg=conf/train_lresnet34_dino_v1.2.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/teacher_model_ep0100.pth + +# clustering of dino embeddings +cluster_method=cos_ahc_plda_ahc +cluster_cfg=conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml +cluster_name=${cluster_method} +cluster_dir=exp/clustering/$nnet_s1_name/$cluster_name + +# plda +plda_cfg=conf/plda.yaml + +# finetuning stage 1.1 +nnet_ft_s1_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s1_1_name=$nnet_name.s1.ft.s1.1 +nnet_ft_s1_1_dir=exp/xvector_nnets/$nnet_ft_s1_1_name +nnet_ft_s1_1=$nnet_ft_s1_1_dir/model_ep0030.pth + +# finetuning stage 1.2 +nnet_ft_s1_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s1_2_name=$nnet_name.s1.ft.s1.2 +nnet_ft_s1_2_dir=exp/xvector_nnets/$nnet_ft_s1_2_name +nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 1.2 +cluster_ft_s1_method=cos_ahc +cluster_ft_s1_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml +cluster_ft_s1_name=${cluster_ft_s1_method} +cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name + +# finetuning stage 2.1 
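+# (stage 2 repeats the supervised finetuning with pseudo-labels obtained by
+# re-clustering the stage-1.2 embeddings; as in stage 1, step 2.1 retrains the
+# output layer and embedding projection, and step 2.2 finetunes the full model)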
+nnet_ft_s2_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s2_1_name=$nnet_name.1.s1.ft.s2.1 +nnet_ft_s2_1_dir=exp/xvector_nnets/$nnet_ft_s2_1_name +nnet_ft_s2_1=$nnet_ft_s2_1_dir/model_ep0030.pth + +# finetuning stage 2.2 +nnet_ft_s2_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s2_2_name=$nnet_name.1.s1.ft.s2.2 +nnet_ft_s2_2_dir=exp/xvector_nnets/$nnet_ft_s2_2_name +nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 2.2 +cluster_ft_s2_method=cos_ahc +cluster_ft_s2_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml +cluster_ft_s2_name=${cluster_ft_s2_method} +cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.sh new file mode 100644 index 00000000..4d02e22d --- /dev/null +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.sh @@ -0,0 +1,65 @@ +# ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_lresnet34_dino.v1.2 + +nnet_s1_base_cfg=conf/train_lresnet34_dino_v1.2.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/teacher_model_ep0100.pth + +# clustering of dino embeddings +cluster_method=cos_ahc_plda_ahc +cluster_cfg=conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml +cluster_name=${cluster_method} +cluster_dir=exp/clustering/$nnet_s1_name/$cluster_name + +# plda +plda_cfg=conf/plda.yaml + +# finetuning stage 1.1 +nnet_ft_s1_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s1_1_name=$nnet_name.s1.ft.s1.1 +nnet_ft_s1_1_dir=exp/xvector_nnets/$nnet_ft_s1_1_name +nnet_ft_s1_1=$nnet_ft_s1_1_dir/model_ep0030.pth + +# finetuning stage 1.2 +nnet_ft_s1_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s1_2_name=$nnet_name.s1.ft.s1.2 +nnet_ft_s1_2_dir=exp/xvector_nnets/$nnet_ft_s1_2_name +nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 1.2 +cluster_ft_s1_method=cos_ahc_plda_ahc +cluster_ft_s1_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml +cluster_ft_s1_name=${cluster_ft_s1_method} +cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name + +# finetuning stage 2.1 +nnet_ft_s2_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s2_1_name=$nnet_name.s1.ft.s2.1 +nnet_ft_s2_1_dir=exp/xvector_nnets/$nnet_ft_s2_1_name +nnet_ft_s2_1=$nnet_ft_s2_1_dir/model_ep0030.pth + +# finetuning stage 2.2 +nnet_ft_s2_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s2_2_name=$nnet_name.s1.ft.s2.2 +nnet_ft_s2_2_dir=exp/xvector_nnets/$nnet_ft_s2_2_name +nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 2.2 +cluster_ft_s2_method=cos_ahc_plda_ahc +cluster_ft_s2_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml +cluster_ft_s2_name=${cluster_ft_s2_method} +cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name diff --git a/egs/voxceleb/ssl.v1/hyp_utils b/egs/voxceleb/ssl.v1/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/voxceleb/ssl.v1/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/voxceleb/ssl.v1/path.sh b/egs/voxceleb/ssl.v1/path.sh new file mode 
100755 index 00000000..6994fdab --- /dev/null +++ b/egs/voxceleb/ssl.v1/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. $TOOLS_ROOT/path.sh diff --git a/egs/voxceleb/ssl.v1/run_001_prepare_data.sh b/egs/voxceleb/ssl.v1/run_001_prepare_data.sh new file mode 100755 index 00000000..563d3c2d --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_001_prepare_data.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. datapath.sh +. $config_file + +if [ $stage -le 1 ];then + # Prepare the VoxCeleb2 dataset for training. + hyperion-prepare-data voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ + --cat-videos --use-kaldi-ids \ + --output-dir data/voxceleb2cat_train +fi + +if [ $stage -le 2 ];then + # prepare voxceleb1 for test + hyperion-prepare-data voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test +fi + +if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then + hyperion-prepare-data voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_dev +fi + +# if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + # hyperion-prepare-data voxsrc22 --subset test --corpus-dir $voxsrc22_root \ + # --vox1-corpus-dir $voxceleb1_root \ + # --output-dir data/voxsrc22_test +# fi + +if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then + # split vox2 into 2 parts, for cohort and qmf training + hyperion-split-dataset-into-trials-and-cohort --data-dir data/voxceleb2cat_train +fi diff --git a/egs/voxceleb/ssl.v1/run_002_compute_evad.sh b/egs/voxceleb/ssl.v1/run_002_compute_evad.sh new file mode 100755 index 00000000..acccace3 --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_002_compute_evad.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=fs01 +vad_dir=`pwd`/exp/vad_e +vad_config=conf/vad_16k.yaml +nj=40 + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ -z "$vad_config" ];then + echo "We are not using VAD in this configuration" + exit 0 +fi + +if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" +fi + + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + hyp_utils/create_data_split_dirs.sh \ + $vad_dir/$name \ + $USER/hyp-data/voxceleb/v1.2/vad $nodes + done +fi + +#Train datasets +if [ $stage -le 2 ];then + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_data_split_links.sh $vad_dir/$name/vad.JOB.ark $nj + echo "compute vad for $name" + $train_cmd JOB=1:$nj $vad_dir/$name/log/vad.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-compute-energy-vad --cfg $vad_config \ + --recordings-file data/$name/recordings.csv \ + --output-spec ark,csv:$vad_dir/$name/vad.JOB.ark,$vad_dir/$name/vad.JOB.csv \ + --part-idx JOB --num-parts $nj || exit 1 + + hyperion-tables cat \ + --table-type features \ + --output-file $vad_dir/$name/vad.csv --num-tables $nj + hyperion-dataset add_features \ + --dataset data/$name \ + --features-name vad \ + --features-file $vad_dir/$name/vad.csv + done +fi + + diff --git a/egs/voxceleb/ssl.v1/run_003_prepare_noises_rirs.sh b/egs/voxceleb/ssl.v1/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..73c7ed82 --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_003_prepare_noises_rirs.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nj=10 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh
+
+# We prepare the noise files and RIR for online speech augmentation
+if [ $stage -le 1 ]; then
+  for name in noise music speech
+  do
+    hyperion-prepare-data musan \
+      --corpus-dir $musan_root \
+      --subset $name \
+      --output-dir data/musan_$name
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  # # Prepare to distribute data over multiple machines
+  # # This only does something at CLSP grid
+  # hyp_utils/create_data_split_dirs.sh $vad_dir $USER/hyp-data/voxceleb/v1.2/vad $nodes
+
+  for name in musan_noise musan_music
+  do
+    input_data_dir=data/$name
+    output_data_dir=data/${name}_proc_audio
+    output_dir=exp/proc_audio/$name
+    $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \
+      hyp_utils/conda_env.sh \
+      hyperion-preprocess-audio-files \
+      --audio-format flac \
+      --part-idx JOB --num-parts $nj \
+      --recordings-file $input_data_dir/recordings.csv \
+      --output-path $output_dir \
+      --output-recordings-file $output_dir/recordings.JOB.csv
+
+    hyperion-tables cat \
+      --table-type recordings \
+      --output-file $output_dir/recordings.csv --num-tables $nj
+    hyperion-dataset set_recordings \
+      --dataset $input_data_dir \
+      --recordings-file $output_dir/recordings.csv \
+      --output-dataset $output_data_dir
+
+
+  done
+fi
+
+if [ $stage -le 3 ]; then
+  # Create Babble noise from MUSAN speech files
+  for name in musan_speech
+  do
+    input_data_dir=data/$name
+    output_data_dir=data/${name}_babble
+    output_dir=exp/proc_audio/${name}_babble
+    $train_cmd $output_dir/log/make_babble_noise_${name}.log \
+      hyp_utils/conda_env.sh \
+      hyperion-make-babble-noise-audio-files \
+      --audio-format flac \
+      --min-spks 3 --max-spks 10 --num-reuses 5 \
+      --recordings-file $input_data_dir/recordings.csv \
+      --output-path $output_dir \
+      --output-recordings-file $output_data_dir/recordings.csv
+    hyperion-dataset make_from_recordings \
+      --dataset $output_data_dir \
+      --recordings-file $output_data_dir/recordings.csv
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  if [ ! -d "RIRS_NOISES" ]; then
+    # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
+    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
+    unzip rirs_noises.zip
+  fi
+  hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom
+  hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom
+  hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real
+  for rirs in rirs_smallroom rirs_mediumroom rirs_real
+  do
+    output_dir=exp/rirs/$rirs
+    data_dir=data/$rirs
+    $train_cmd $output_dir/log/pack_rirs_${rirs}.log \
+      hyp_utils/conda_env.sh \
+      hyperion-pack-wav-rirs ${args} --input $data_dir/recordings.csv \
+      --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1;
+    hyperion-dataset add_features --dataset $data_dir \
+      --features-name rirs --features-file $output_dir/rirs.csv
+
+  done
+fi
+
diff --git a/egs/voxceleb/ssl.v1/run_004_prepare_xvec_train_data.sh b/egs/voxceleb/ssl.v1/run_004_prepare_xvec_train_data.sh
new file mode 100755
index 00000000..3b7b9083
--- /dev/null
+++ b/egs/voxceleb/ssl.v1/run_004_prepare_xvec_train_data.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# Copyright
+# 2020 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+nodes=b1
+nj=40
+stage=1
+config_file=default_config.sh
+
+. parse_options.sh || exit 1;
+. 
$config_file + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + hyp_utils/create_data_split_dirs.sh \ + exp/xvector_audios/$nnet_data \ + $USER/hyp-data/voxceleb/v1.2/xvector_audios/$nnet_data $nodes +fi + +if [ $stage -le 2 ];then + output_dir=exp/proc_audio/$nnet_data + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_audios_split_links.sh $output_dir data/$nnet_data/recordings.csv flac + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$nnet_data/vad.csv" + update_durs="--update-seg-durs" + fi + + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${nnet_data}.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-preprocess-audio-files \ + --audio-format flac --remove-dc-offset $vad_args \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$nnet_data/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + + hyperion-dataset set_recordings $update_durs \ + --dataset data/$nnet_data \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset data/${nnet_data}_proc_audio \ + --remove-features vad +fi + +if [ $stage -le 3 ];then + hyperion-dataset remove_short_segments \ + --dataset data/${nnet_data}_proc_audio \ + --output-dataset data/${nnet_data}_filtered \ + --length-name duration --min-length 2.0 + + hyperion-dataset remove_classes_few_segments \ + --dataset data/${nnet_data}_filtered \ + --class-name speaker --min-segs 4 +fi + +if [ $stage -le 4 ];then + hyperion-dataset split_train_val \ + --dataset data/${nnet_data}_filtered \ + --val-prob 0.03 \ + --seed 1123581321 \ + --train-dataset data/${nnet_data}_xvector_train \ + --val-dataset data/${nnet_data}_xvector_val +fi + diff --git a/egs/voxceleb/ssl.v1/run_005_train_dino.sh b/egs/voxceleb/ssl.v1/run_005_train_dino.sh new file mode 100755 index 00000000..58399159 --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_005_train_dino.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_data_dir=data/${nnet_data}_xvector_train +val_data_dir=data/${nnet_data}_xvector_val + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# # Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-train-dino-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu \ + +fi + + +# # Finetune full model +# if [ $stage -le 2 ]; then +# if [ "$use_wandb" == "true" ];then +# extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" +# fi +# mkdir -p $nnet_s2_dir/log +# $cuda_cmd \ +# --gpu $ngpu $nnet_s2_dir/log/train.log \ +# hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ +# hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ +# --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ +# --data.train.dataset.segments-file $train_data_dir/segments.csv \ +# --data.train.dataset.class-files $train_data_dir/speaker.csv \ +# --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ +# --data.val.dataset.segments-file $val_data_dir/segments.csv \ +# --in-model-file $nnet_s1 \ +# --trainer.exp-path $nnet_s2_dir \ +# --num-gpus $ngpu \ + +# fi + +# # Finetune full model +# if [ $stage -le 3 ]; then +# if [ "$use_wandb" == "true" ];then +# extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" +# fi +# mkdir -p $nnet_s3_dir/log +# $cuda_cmd \ +# --gpu $ngpu $nnet_s3_dir/log/train.log \ +# hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ +# hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ +# --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ +# --data.train.dataset.segments-file $train_data_dir/segments.csv \ +# --data.train.dataset.class-files $train_data_dir/speaker.csv \ +# --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ +# --data.val.dataset.segments-file $val_data_dir/segments.csv \ +# --in-model-file $nnet_s2 \ +# --trainer.exp-path $nnet_s3_dir \ +# --num-gpus $ngpu \ + +# fi diff --git a/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds_cluster_eval.sh b/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds_cluster_eval.sh new file mode 100755 index 00000000..5bf085ae --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds_cluster_eval.sh @@ -0,0 +1,242 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=1 +ft_stage=0 +config_file=default_config.sh +use_gpu=true +xvec_chunk_length=120.0 +do_clustering=true +. 
parse_options.sh || exit 1; +. $config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" + num_gpus=1 +else + xvec_cmd="$train_cmd --mem 12G" + num_gpus=0 +fi + +if [ $ft_stage -eq 0 ];then + if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name + elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name + fi +elif [ $ft_stage -eq 1 ];then + if [ $nnet_stage -eq 1 ];then + nnet=$nnet_ft_s1_1 + nnet_name=$nnet_ft_s1_1_name + elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_ft_s1_2 + nnet_name=$nnet_ft_s1_2_name + fi + cluster_method=$cluster_ft_s1_method + cluster_cfg=$cluster_ft_s1_cfg + cluster_name=$cluster_ft_s1_name + cluster_dir=$cluster_ft_s1_dir +elif [ $ft_stage -eq 2 ];then + if [ $nnet_stage -eq 1 ];then + nnet=$nnet_ft_s2_1 + nnet_name=$nnet_ft_s2_1_name + elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_ft_s2_2 + nnet_name=$nnet_ft_s2_2_name + fi + cluster_method=$cluster_ft_s2_method + cluster_cfg=$cluster_ft_s2_cfg + cluster_name=$cluster_ft_s2_name + cluster_dir=$cluster_ft_s2_dir +fi +xvector_dir=exp/xvectors/$nnet_name +score_dir=exp/scores/$nnet_name +score_cosine_dir=$score_dir/cosine +score_plda_dir=$score_dir/${cluster_name}_plda + +if [ $stage -le 1 ]; then + # Extracts x-vectors for evaluation + nj=100 + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data + do + num_segs=$(wc -l data/$name/segments.csv | awk '{ print $1-1}') + nj=$(($num_segs < 100 ? $num_segs:100)) + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + +if [ $stage -le 2 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_dir/voxceleb1_scores.csv \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + done + wait + hyperion-merge-scores --output-file $score_cosine_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_dir/voxceleb1_results.csv + + cat $score_cosine_dir/voxceleb1_results.csv +fi + +if [ "$do_clustering" == "false" ];then + exit 0 +fi + +if [ $stage -le 3 ]; then + # Extract xvectors for training LDA/PLDA + nj=100 + for name in 
voxceleb2cat_train_filtered + do + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2xvectors ${xvec_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --random-utt-length --min-utt-length 30 --max-utt-length 30 \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + + +if [ $stage -le 4 ];then + echo "Cluster Vox2" + mkdir -p $cluster_dir + $train_cmd --mem 50G --num-threads 32 $cluster_dir/clustering.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV \ + hyperion-cluster-embeddings $cluster_method --cfg $cluster_cfg \ + --segments-file data/voxceleb2cat_train_filtered/segments.csv \ + --feats-file csv:$xvector_dir/voxceleb2cat_train_filtered/xvector.csv \ + --output-file $cluster_dir/voxceleb2cat_train/segments.csv +fi + +if [ $stage -le 5 ];then + hyperion-dataset add_cols_to_segments \ + --dataset data/voxceleb2cat_train_filtered \ + --column-names cluster \ + --right-table $cluster_dir/voxceleb2cat_train/segments.csv \ + --output-dataset $cluster_dir/voxceleb2cat_train_clustered \ + --remove-missing --create-class-info + + hyperion-dataset remove_classes_few_toomany_segments \ + --dataset $cluster_dir/voxceleb2cat_train_clustered \ + --class-name cluster \ + --min-segs 10 \ + --max-segs 50 \ + --rebuild-idx \ + --output-dataset $cluster_dir/voxceleb2cat_train_clustered_filtered +fi + +if [ $stage -le 6 ];then + echo "Train PLDA" + $train_cmd $cluster_dir/plda.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV \ + hyperion-train-plda --cfg $plda_cfg \ + --segments-file $cluster_dir/voxceleb2cat_train_clustered_filtered/segments.csv \ + --feats-file csv:$xvector_dir/voxceleb2cat_train_filtered/xvector.csv \ + --preproc-file $cluster_dir/plda/preproc.h5 \ + --plda-file $cluster_dir/plda/plda.h5 +fi + +if [ $stage -le 7 ];then + + echo "Eval Voxceleb 1 with PLDA" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_plda_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-plda-backend \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_plda_dir/voxceleb1_scores.csv \ + --preproc-file $cluster_dir/plda/preproc.h5 \ + --plda-file $cluster_dir/plda/plda.h5 \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + done + wait + hyperion-merge-scores --output-file $score_plda_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ + hyperion-eval-verification-metrics \ + --score-files $score_plda_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_plda_dir/voxceleb1_results.csv + + cat $score_plda_dir/voxceleb1_results.csv +fi + +if [ $stage -le 8 ];then + hyperion-dataset split_train_val \ + --dataset $cluster_dir/voxceleb2cat_train_clustered_filtered \ + --val-prob 0.03 \ + --seed 1123581321 \ + --train-dataset 
$cluster_dir/voxceleb2cat_train_clustered_train \ + --val-dataset $cluster_dir/voxceleb2cat_train_clustered_val +fi + diff --git a/egs/voxceleb/ssl.v1/run_007_train_xvector.sh b/egs/voxceleb/ssl.v1/run_007_train_xvector.sh new file mode 100755 index 00000000..9732078a --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_007_train_xvector.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +ft_stage=1 +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +if [ $ft_stage -eq 1 ];then + nnet_s1_base_cfg=$nnet_ft_s1_1_base_cfg + nnet_s2_base_cfg=$nnet_ft_s1_2_base_cfg + nnet_s1_dir=$nnet_ft_s1_1_dir + nnet_s2_dir=$nnet_ft_s1_2_dir + nnet_s0=$nnet_s1 + nnet_s1=$nnet_ft_s1_1 + nnet_s2=$nnet_ft_s1_2 + train_data_dir=$cluster_dir/${nnet_data}_clustered_train + val_data_dir=$cluster_dir/${nnet_data}_clustered_val +elif [ $ft_stage -eq 2 ];then + nnet_s1_base_cfg=$nnet_ft_s2_1_base_cfg + nnet_s2_base_cfg=$nnet_ft_s2_2_base_cfg + nnet_s1_dir=$nnet_ft_s2_1_dir + nnet_s2_dir=$nnet_ft_s2_2_dir + nnet_s0=$nnet_ft_s1_2 + nnet_s1=$nnet_ft_s2_1 + nnet_s2=$nnet_ft_s2_2 + train_data_dir=$cluster_ft_s1_dir/${nnet_data}_clustered_train + val_data_dir=$cluster_ft_s1_dir/${nnet_data}_clustered_val +fi + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-ssl.v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Fine-tune last layer and embedding projection +if [ $stage -le 1 ]; then + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-finetune-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/cluster.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --trainer.exp-path $nnet_s1_dir \ + --in-model-file $nnet_s0 \ + --num-gpus $ngpu +fi + + +# Fine-tune full model +if [ $stage -le 2 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-finetune-wav2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/cluster.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir \ + --num-gpus $ngpu + +fi diff --git 
a/egs/voxceleb/ssl.v1/run_008_extract_ft1_xvec_embeds_cluster_eval.sh b/egs/voxceleb/ssl.v1/run_008_extract_ft1_xvec_embeds_cluster_eval.sh new file mode 100755 index 00000000..71cab44a --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_008_extract_ft1_xvec_embeds_cluster_eval.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh +use_gpu=true +xvec_chunk_length=120.0 +do_clustering=true +. parse_options.sh || exit 1; + +./run_006_extract_dino_embeds_cluster_eval.sh \ + --config-file $config_file \ + --stage $stage \ + --nnet-stage $nnet_stage \ + --ft-stage 1 \ + --use-gpu $use_gpu \ + --xvec-chunk-length $xvec_chunk_length \ + --do-clustering $do_clustering diff --git a/egs/voxceleb/ssl.v1/run_009_finetune_xvector_s2.sh b/egs/voxceleb/ssl.v1/run_009_finetune_xvector_s2.sh new file mode 100755 index 00000000..ca7d058a --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_009_finetune_xvector_s2.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +. parse_options.sh || exit 1; + +./run_007_train_xvector.sh \ + --config-file $config_file \ + --ngpu $ngpu \ + --stage $stage \ + --ft-stage 2 \ + --interactive $interactive + diff --git a/egs/voxceleb/ssl.v1/run_010_extract_ft2_xvec_embeds_cluster_eval.sh b/egs/voxceleb/ssl.v1/run_010_extract_ft2_xvec_embeds_cluster_eval.sh new file mode 100755 index 00000000..4f09dfaf --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_010_extract_ft2_xvec_embeds_cluster_eval.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh +use_gpu=true +xvec_chunk_length=120.0 +do_clustering=true +. 
parse_options.sh || exit 1;
+
+./run_006_extract_dino_embeds_cluster_eval.sh \
+  --config-file $config_file \
+  --stage $stage \
+  --nnet-stage $nnet_stage \
+  --ft-stage 2 \
+  --use-gpu $use_gpu \
+  --xvec-chunk-length $xvec_chunk_length \
+  --do-clustering $do_clustering
diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md
index 5b5b93e5..efdb77c1 100644
--- a/egs/voxceleb/v1.1/README.md
+++ b/egs/voxceleb/v1.1/README.md
@@ -1,5 +1,7 @@
 # VoxCeleb V1.1
+This recipe will be deprecated; use V1.2 instead.
+
 Recipe for the VoxCeleb Speaker Verification Task
 
 ## Differences w.r.t VoxCeleb V1 recipe
@@ -87,6 +89,139 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr
 
 ### VoxCeleb 1 Original-Clean trial list
 
+| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) |
+| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: |
+| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.11 | 0.069 | 0.126 |
+| | | | Cosine + AS-Norm | 1.10 | 0.065 | 0.108 |
+| | | | Cosine + QMF | 0.95 | 0.059 | 0.084 |
+| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.68 | 0.052 | 0.088 |
+| | | | Cosine + AS-Norm | 0.63 | 0.049 | 0.083 |
+| | | | Cosine + QMF | 0.57 | 0.037 | 0.071 |
+| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.048 | 0.071 |
+| | | | Cosine + AS-Norm | 0.70 | 0.039 | 0.048 |
+| | | | Cosine + QMF | 0.62 | 0.034 | 0.042 |
+| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.76 | 0.048 | 0.071 |
+| | | | Cosine + AS-Norm | 0.70 | 0.041 | 0.061 |
+| | | | Cosine + QMF | 0.62 | 0.037 | 0.056 |
+| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.048 | 0.077 |
+| | | | Cosine + AS-Norm | 0.68 | 0.040 | 0.062 |
+| | | | Cosine + QMF | 0.62 | 0.036 | 0.063 |
+| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.053 | 0.082 |
+| | | | Cosine + AS-Norm | 0.70 | 0.043 | 0.076 |
+| | | | Cosine + QMF | 0.63 | 0.042 | 0.071 |
+| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.051 | 0.095 |
+| | | | Cosine + AS-Norm | 0.72 | 0.046 | 0.070 |
+| | | | Cosine + QMF | 0.67 | 0.039 | 0.074 |
+| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.040 | 0.065 | +| | | | Cosine + AS-Norm | 0.52 | 0.033 | 0.045 | +| | | | Cosine + QMF | 0.45 | 0.027 | 0.043 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.60 | 0.043 | 0.071 | +| | | | Cosine + AS-Norm | 0.53 | 0.034 | 0.063 | +| | | | Cosine + QMF | 0.49 | 0.033 | 0.054 | + + +### VoxCeleb 1 Entire-Clean trial list + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.16 | 0.073 | 0.130 | +| | | | Cosine + AS-Norm | 1.13 | 0.068 | 0.118 | +| | | | Cosine + QMF | 1.06 | 0.064 | 0.112 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.85 | 0.055 | 0.100 | +| | | | Cosine + AS-Norm | 0.80 | 0.050 | 0.087 | +| | | | Cosine + QMF | 0.76 | 0.047 | 0.083 | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.86 | 0.054 | 0.098 | +| | | | Cosine + AS-Norm | 0.81 | 0.049 | 0.087 | +| | | | Cosine + QMF | 0.77 | 0.046 | 0.082 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.89 | 0.058 | 0.098 | +| | | | Cosine + AS-Norm | 0.84 | 0.053 | 0.087| +| | | | Cosine + QMF | 0.80 | 0.050 | 0.081 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.83 | 0.053 | 0.098 | +| | | | Cosine + AS-Norm | 0.78 | 0.047| 0.085 | +| | | | Cosine + QMF | 0.74 | 0.045 | 0.081 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.91 | 0.057 | 0.100 | +| | | | Cosine + AS-Norm | 0.85 | 0.052 | 0.089 | +| | | | Cosine + QMF | 0.81 | 0.049 | 0.085 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.94 | 0.059 | 0.105 | +| | | | Cosine + AS-Norm | 0.88 | 0.053 | 0.093 | +| | | | Cosine + QMF | 0.84 | 0.051 | 0.088 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.71 | 0.044 | 0.076| +| | | | Cosine + AS-Norm | 0.66 | 0.040 | 0.069 | +| | | | Cosine + QMF | 0.63 | 0.037 | 0.067 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.75 | 0.047 | 0.077 | +| | | | Cosine + AS-Norm | 0.70 | 0.042 | 0.072 | +| | | | Cosine + QMF | 0.68 | 0.040 | 0.069 | + +### VoxCeleb 1 Hard-Clean trial list + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.10 | 0.128 | 0.209 | +| | | | Cosine + AS-Norm | 1.99 | 0.118 | 0.190 | +| | | | Cosine + QMF | 1.84 | 0.111 | 0.184 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 1.66 | 0.103 | 0.168 | +| | | | Cosine + AS-Norm | 1.53 | 0.091 | 0.151 | +| | | | Cosine + QMF | 1.44 | 0.087 | 0.145 | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.62 | 0.098 | 0.164 | +| | | | Cosine + AS-Norm | 1.45 | 0.085 | 0.142 | +| | | | Cosine + QMF | 1.36 | 0.082 | 0.137 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.70 | 0.1 | 0.165 | +| | | | Cosine + AS-Norm | 1.50 | 0.086 | 0.138 | +| | | | Cosine + QMF | 1.44 | 0.085 | 0.139 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.59 | 0.096 | 0.165 | +| | | | Cosine + AS-Norm | 1.41 | 0.083 | 0.143 | +| | | | Cosine + QMF | 1.34 | 0.079 | 0.136 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.75 | 0.104 | 0.171 | +| | | | Cosine + AS-Norm | 1.56 | 0.091 | 0.152 | +| | | | Cosine + QMF | 1.50 | 0.087 | 0.145 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.76 | 0.104 | 0.174 | +| | | | Cosine + AS-Norm | 1.58 | 0.092 | 0.152 | +| | | | Cosine + QMF | 1.51 | 0.089 | 0.149 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.30 | 0.076 | 0.125 | +| | | | Cosine + AS-Norm | 1.15 | 0.066 | 0.109 | +| | | | Cosine + QMF | 1.11 | 0.065 | 0.103 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.41 | 0.081 | 0.132 | +| | | | Cosine + AS-Norm | 1.28 | 0.071 | 0.116 | +| | | | Cosine + QMF | 1.21 | 0.069 | 0.113 | + + +### VoxSRC2022 dev + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.87 | 0.185 | 0.304 | +| | | | Cosine + AS-Norm | 2.84 | 0.182 | 0.304 | +| | | | Cosine + QMF | 2.61 | 0.172 | 0.283 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 2.33 | 0.156 | 0.260 | +| | | | Cosine + AS-Norm | 2.19 | 0.144 | 0.263 | +| | | | Cosine + QMF | 2.06 | 0.137 | 0.251 | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.19 | 0.142 | 0.242 | +| | | | Cosine + AS-Norm | 2.00 | 0.133 | 0.254 | +| | | | Cosine + QMF | 1.86 | 0.126 | 0.229 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.34 | 0.145 | 0.246 | +| | | | Cosine + AS-Norm | 2.10 | 0.135 | 0.248 | +| | | | Cosine + QMF | 2.01 | 0.127 | 0.218 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.239 | +| | | | Cosine + AS-Norm | 1.99 | 0.127 | 0.232 | +| | | | Cosine + QMF | 1.87 | 0.119 | 0.216 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.36 | 0.153 | 0.259 | +| | | | Cosine + AS-Norm | 2.18 | 0.139 | 0.249 | +| | | | Cosine + QMF | 2.08 | 0.128 | 0.222 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.49 | 0.158 | 0.265 | +| | | | Cosine + AS-Norm | 2.29 | 0.145 | 0.251 | +| | | | Cosine + QMF | 2.17 | 0.133 | 0.230 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.92 | 0.124 | 0.208 |
+| | | | Cosine + AS-Norm | 1.71 | 0.109 | 0.212 |
+| | | | Cosine + QMF | 1.62 | 0.103 | 0.192 |
+| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.96 | 0.124 | 0.211 |
+| | | | Cosine + AS-Norm | 1.79 | 0.118 | 0.239 |
+| | | | Cosine + QMF | 1.68 | 0.114 | 0.216 |
+
+
+## Results before 2023
+
+### VoxCeleb 1 Original-Clean trial list
+
 | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) |
 | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: |
 | config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.00 | 0.129 | 0.216 |
diff --git a/egs/voxceleb/v1.1/conf b/egs/voxceleb/v1.1/conf
deleted file mode 120000
index 25a735e3..00000000
--- a/egs/voxceleb/v1.1/conf
+++ /dev/null
@@ -1 +0,0 @@
-../v1/conf
\ No newline at end of file
diff --git a/egs/voxceleb/v1.1/conf/clsp.conf b/egs/voxceleb/v1.1/conf/clsp.conf
new file mode 100644
index 00000000..4ed38246
--- /dev/null
+++ b/egs/voxceleb/v1.1/conf/clsp.conf
@@ -0,0 +1,11 @@
+
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0 # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1 # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*'
+option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0'
diff --git a/egs/voxceleb/v1.1/conf/coe_gpu_bigmem.conf b/egs/voxceleb/v1.1/conf/coe_gpu_bigmem.conf
new file mode 100644
index 00000000..a7a2ce40
--- /dev/null
+++ b/egs/voxceleb/v1.1/conf/coe_gpu_bigmem.conf
@@ -0,0 +1,11 @@
+
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V
+option mem=* -l mem_free=$0
+option mem=0 # Do not add anything to qsub_opts
+option num_threads=* -l num_proc=$0
+option num_threads=1 # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]*
+option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]*
diff --git a/egs/voxceleb/v1.1/conf/coe_gpu_long.conf b/egs/voxceleb/v1.1/conf/coe_gpu_long.conf
new file mode 100644
index 00000000..b31c167c
--- /dev/null
+++ b/egs/voxceleb/v1.1/conf/coe_gpu_long.conf
@@ -0,0 +1,13 @@
+
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V
+option mem=* -l mem_free=$0
+option mem=0 # Do not add anything to qsub_opts
+option num_threads=* -l num_proc=$0
+option num_threads=1 # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]*
+option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]*
+
+
diff --git a/egs/voxceleb/v1.1/conf/coe_gpu_rtx.conf b/egs/voxceleb/v1.1/conf/coe_gpu_rtx.conf
new file mode 100644
index 00000000..ba6d9e56
--- /dev/null
+++ b/egs/voxceleb/v1.1/conf/coe_gpu_rtx.conf
@@ -0,0 +1,11 @@
+
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V
+option mem=* -l mem_free=$0
+option mem=0 # Do not add anything to qsub_opts
+option num_threads=* -l num_proc=$0
+option num_threads=1 # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
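+# jobs with gpu=0 fall back to the general CPU queue (all.q); GPU jobs are routed to the RTX nodes via gpu.q@@rtx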
+option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/voxceleb/v1.1/conf/coe_gpu_short.conf b/egs/voxceleb/v1.1/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/voxceleb/v1.1/conf/coe_gpu_v100.conf b/egs/voxceleb/v1.1/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/voxceleb/v1.1/conf/ecapatdnn_small.yaml b/egs/voxceleb/v1.1/conf/ecapatdnn_small.yaml new file mode 100644 index 00000000..fd386500 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/ecapatdnn_small.yaml @@ -0,0 +1,34 @@ +resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 +pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/voxceleb/v1.1/conf/efficientnet_b4.yaml b/egs/voxceleb/v1.1/conf/efficientnet_b4.yaml new file mode 100644 index 00000000..f87c1e02 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/efficientnet_b4.yaml @@ -0,0 +1,20 @@ +effnet_type: efficientnet-b4 +in_feats: 80 +in_channels: 1 +in_kernel_size: 3 +in_stride: 1 +se_r: 4 +fix_stem_head: true +mbconv_strides: +- 1 +- 1 +- 2 +- 2 +- 1 +- 2 +- 1 +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/voxceleb/v1.1/conf/efficientnet_b7.yaml b/egs/voxceleb/v1.1/conf/efficientnet_b7.yaml new file mode 100644 index 00000000..bae5c7cb --- /dev/null +++ b/egs/voxceleb/v1.1/conf/efficientnet_b7.yaml @@ -0,0 +1,22 @@ +effnet_type: efficientnet-b7 +in_feats: 80 +in_channels: 1 +in_kernel_size: 3 +in_stride: 1 +se_r: 4 +fix_stem_head: true +mbconv_strides: +- 1 +- 1 +- 2 +- 2 +- 1 +- 2 +- 1 +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 +norm_layer: instance-norm-affine +head_norm_layer: layer-norm diff --git a/egs/voxceleb/v1.1/conf/fbank64_8k.yaml b/egs/voxceleb/v1.1/conf/fbank64_8k.yaml new file mode 100644 index 00000000..a77eb899 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/fbank64_8k.yaml @@ -0,0 +1,7 @@ +sample_frequency: 8000 +frame_length: 25 +low_freq: 20 +high_freq: 3700 +num_filters: 64 +snip_edges: false 
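+# use_energy: false appends no log-energy term, so the features are the 64 log filter-bank values only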
+use_energy: false diff --git a/egs/voxceleb/v1.1/conf/fbank64_stmn_8k.yaml b/egs/voxceleb/v1.1/conf/fbank64_stmn_8k.yaml new file mode 100644 index 00000000..dfd0d3e5 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/fbank64_stmn_8k.yaml @@ -0,0 +1,12 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 8000 + frame_length: 25 + low_freq: 20 + high_freq: 3700 + num_filters: 64 + snip_edges: false + use_energy: false +mvn: + context: 150 + norm_var: false diff --git a/egs/voxceleb/v1.1/conf/fbank80_16k.yaml b/egs/voxceleb/v1.1/conf/fbank80_16k.yaml new file mode 100644 index 00000000..88bae69e --- /dev/null +++ b/egs/voxceleb/v1.1/conf/fbank80_16k.yaml @@ -0,0 +1,7 @@ +sample_frequency: 16000 +frame_length: 25 +low_freq: 20 +high_freq: 7600 +num_filters: 80 +snip_edges: false +use_energy: false diff --git a/egs/voxceleb/v1.1/conf/fbank80_specaug1_stmn_16k.yaml b/egs/voxceleb/v1.1/conf/fbank80_specaug1_stmn_16k.yaml new file mode 100644 index 00000000..8df42fc6 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/fbank80_specaug1_stmn_16k.yaml @@ -0,0 +1,24 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 16000 + frame_length: 25 + low_freq: 20 + high_freq: 7600 + num_filters: 80 + snip_edges: false + use_energy: false +spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. + freq_mask_min_width: 0 + freq_mask_max_width: 8 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +mvn: + context: 150 + norm_var: false diff --git a/egs/voxceleb/v1.1/conf/fbank80_stmn_16k.yaml b/egs/voxceleb/v1.1/conf/fbank80_stmn_16k.yaml new file mode 100644 index 00000000..f4091f5d --- /dev/null +++ b/egs/voxceleb/v1.1/conf/fbank80_stmn_16k.yaml @@ -0,0 +1,12 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 16000 + frame_length: 25 + low_freq: 20 + high_freq: 7600 + num_filters: 80 + snip_edges: false + use_energy: false +mvn: + context: 150 + norm_var: false diff --git a/egs/voxceleb/v1.1/conf/lrsched_exp_default.yaml b/egs/voxceleb/v1.1/conf/lrsched_exp_default.yaml new file mode 100644 index 00000000..fe08b704 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/lrsched_exp_default.yaml @@ -0,0 +1,7 @@ +lrsch_type: exp_lr +decay_rate: 0.5 +decay_steps: 8000 +hold_steps: 40000 +min_lr: 1.0e-05 +update_lr_on_opt_step: true +warmup_steps: 1000 diff --git a/egs/voxceleb/v1.1/conf/noise_aug.yaml b/egs/voxceleb/v1.1/conf/noise_aug.yaml new file mode 100644 index 00000000..7e575faf --- /dev/null +++ b/egs/voxceleb/v1.1/conf/noise_aug.yaml @@ -0,0 +1,19 @@ +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/voxceleb/v1.1/conf/online_pitch.conf b/egs/voxceleb/v1.1/conf/online_pitch.conf new file mode 100644 index 00000000..926bcfca --- /dev/null +++ b/egs/voxceleb/v1.1/conf/online_pitch.conf @@ -0,0 +1 @@ +--sample-frequency=8000 diff --git a/egs/voxceleb/v1.1/conf/optim_adam_default.yaml b/egs/voxceleb/v1.1/conf/optim_adam_default.yaml new file mode 100644 index 00000000..b6620069 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/optim_adam_default.yaml @@ -0,0 +1,6 @@ +opt_type: adam +lr: 0.05 +amsgrad: true +beta1: 0.9 +beta2: 0.95 +weight_decay: 1.0e-05 diff --git 
a/egs/voxceleb/v1.1/conf/res2net50.yaml b/egs/voxceleb/v1.1/conf/res2net50.yaml new file mode 100644 index 00000000..48067a3d --- /dev/null +++ b/egs/voxceleb/v1.1/conf/res2net50.yaml @@ -0,0 +1,13 @@ +resnet_type: res2net50 +in_channels: 1 +in_feats: 80 +in_kernel_size: 3 +in_stride: 1 +no_maxpool: true +res2net_width_factor: 3.25 +res2net_scale: 8 +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/voxceleb/v1.1/conf/resnet34.yaml b/egs/voxceleb/v1.1/conf/resnet34.yaml new file mode 100644 index 00000000..98695823 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/resnet34.yaml @@ -0,0 +1,11 @@ +resnet_type: resnet34 +in_channels: 1 +in_feats: 80 +in_kernel_size: 3 +in_stride: 1 +no_maxpool: true +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/voxceleb/v1.1/conf/reverb_noise_aug.yaml b/egs/voxceleb/v1.1/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/voxceleb/v1.1/conf/spinenet49.yaml b/egs/voxceleb/v1.1/conf/spinenet49.yaml new file mode 100644 index 00000000..66b8d517 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/spinenet49.yaml @@ -0,0 +1,11 @@ +spinenet_type: spinenet49 +in_channels: 1 +in_feats: 80 +in_kernel_size: 3 +in_stride: 1 +no_maxpool: true +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..727f40a3 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: cfwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 32 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + 
decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..9a9dfc06 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..e2fb4c40 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: seresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 32 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml 
b/egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..9a9dfc06 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_data_default.yaml b/egs/voxceleb/v1.1/conf/train_data_default.yaml new file mode 100644 index 00000000..1f96d1f6 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_data_default.yaml @@ -0,0 +1,19 @@ +dataset: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v2.0.yaml new file mode 100644 index 00000000..c4de614e --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v2.0.yaml @@ -0,0 +1,91 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + 
pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 + norm_before: false +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 40 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..1633f4a2 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml @@ -0,0 +1,95 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + dropout_rate: 0.2 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.2 + norm_before: false +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + #min_lr: 1.0e-05 + min_lr: 1.0e-06 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml new file mode 100644 index 00000000..a2e63b54 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + # max_chunk_length: 6.0 + # min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior 
+ num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + swa_start: 31 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..f3573b4a --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0.25 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v2.0.yaml new file mode 100644 index 00000000..f5a7dcb1 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v2.0.yaml @@ -0,0 +1,89 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + 
norm_before: false + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 + norm_before: false +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 40 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..f15d453d --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml @@ -0,0 +1,93 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + dropout_rate: 0.002 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 40 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml new file mode 100644 index 00000000..bb9c8c79 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + 
num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + swa_start: 31 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..13f9cd9a --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0. 
+trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + swa_start: 31 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn_xvec_default.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn_xvec_default.yaml new file mode 100644 index 00000000..46298946 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: ecapatdnn_small.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v1.1/conf/train_effnetb4_xvec_default.yaml b/egs/voxceleb/v1.1/conf/train_effnetb4_xvec_default.yaml new file mode 100644 index 00000000..1bc74de6 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_effnetb4_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: efficientnet_b4.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..db559c14 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: fwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 4 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..9a9dfc06 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + 
num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v2.0.yaml new file mode 100644 index 00000000..b7f02a47 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v2.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: fwseidrndresnet100 + in_channels: 1 + in_feats: 80 + conv_channels: 128 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 + se_r: 4 + norm_before: false +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 20 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..1016087d --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: fwseidrndresnet100 + in_channels: 1 + in_feats: 80 + conv_channels: 128 
+ in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.05 + se_r: 4 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml new file mode 100644 index 00000000..65cd737c --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 5 + eff_batch_size: 256 + swa_start: 31 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..c7437e94 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 
+trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage1_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage1_v2.0.yaml new file mode 100644 index 00000000..bff34263 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage1_v2.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: fwseidrndresnet100 + in_channels: 1 + in_feats: 80 + conv_channels: 128 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 + se_r: 4 + norm_before: false +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 40 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage2_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage2_v2.0.yaml new file mode 100644 index 00000000..09a5345f --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage2_v2.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + swa_start: 31 + 
swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50_xvec_default.yaml b/egs/voxceleb/v1.1/conf/train_res2net50_xvec_default.yaml new file mode 100644 index 00000000..c7eb6ee1 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: res2net50.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..5dda7913 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..c7437e94 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 
32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..e98d6c13 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..63a5cb25 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + # dropout_rate: 0.0 + dropout_rate: 0.2 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + 
swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_resnet34_xvec_default.yaml b/egs/voxceleb/v1.1/conf/train_resnet34_xvec_default.yaml new file mode 100644 index 00000000..1d387790 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_resnet34_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: resnet34.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v1.1/conf/train_resnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_resnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..bff4a00b --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_resnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: resnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_spinenet49_xvec_default.yaml b/egs/voxceleb/v1.1/conf/train_spinenet49_xvec_default.yaml new file mode 100644 index 00000000..07167987 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_spinenet49_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: spinenet49.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..31dcaf9a --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: tseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 
192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 256 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..9a9dfc06 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/trainer_default.yaml b/egs/voxceleb/v1.1/conf/trainer_default.yaml new file mode 100644 index 00000000..86dcc2e4 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/trainer_default.yaml @@ -0,0 +1,6 @@ +optim: optim_adam_default.yaml +lrsched: lrsched_exp_default.yaml +use_amp: true +log_interval: 1000 +epochs: 70 +eff_batch_size: 512 diff --git a/egs/voxceleb/v1.1/conf/trainer_swa_default.yaml b/egs/voxceleb/v1.1/conf/trainer_swa_default.yaml new file mode 100644 index 00000000..0cafad01 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/trainer_swa_default.yaml @@ -0,0 +1,9 @@ +optim: optim_adam_default.yaml +lrsched: lrsched_exp_default.yaml +use_amp: true +log_interval: 1000 +epochs: 80 +eff_batch_size: 512 +swa_start: 60 +swa_lr: 1e-3 +swa_anneal_epochs: 5 diff --git a/egs/voxceleb/v1.1/conf/vad_16k.yaml b/egs/voxceleb/v1.1/conf/vad_16k.yaml new file mode 100644 index 00000000..a8d7b4d4 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/vad_16k.yaml @@ -0,0 +1,9 @@ +sample_frequency: 16000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 +wav_scale: 32767 diff --git a/egs/voxceleb/v1.1/conf/vad_8k.yaml b/egs/voxceleb/v1.1/conf/vad_8k.yaml new file mode 100644 index 00000000..7592c9d1 --- /dev/null +++ 
b/egs/voxceleb/v1.1/conf/vad_8k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 8000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/voxceleb/v1.1/conf/val_data_default.yaml b/egs/voxceleb/v1.1/conf/val_data_default.yaml new file mode 100644 index 00000000..1f96d1f6 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/val_data_default.yaml @@ -0,0 +1,19 @@ +dataset: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 diff --git a/egs/voxceleb/v1.1/datapath.sh b/egs/voxceleb/v1.1/datapath.sh index 9a2f7529..a7eb575c 100644 --- a/egs/voxceleb/v1.1/datapath.sh +++ b/egs/voxceleb/v1.1/datapath.sh @@ -13,6 +13,7 @@ elif [ "$(hostname --domain)" == "cm.gemini" ];then # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + voxsrc22_root=/exp/jvillalba/corpora/voxsrc22 musan_root=/expscratch/dgromero/corpora-open/musan else echo "Put your database paths here" diff --git a/egs/voxceleb/v1.1/default_config.sh b/egs/voxceleb/v1.1/default_config.sh deleted file mode 100644 index 652b4d61..00000000 --- a/egs/voxceleb/v1.1/default_config.sh +++ /dev/null @@ -1,54 +0,0 @@ -# Default parameters -# LResNet34 x-vector without mixed precision training - -# acoustic features -feat_config=conf/fbank80_stmn_16k.yaml -feat_type=fbank80_stmn - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=128 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=lresnet34 #light resnet -dropout=0 -embed_dim=256 - -s=30 -margin_warmup=20 -margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}.v1 -nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=6 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 - diff --git a/egs/voxceleb/v1.1/default_config.sh b/egs/voxceleb/v1.1/default_config.sh new file mode 120000 index 00000000..fd0e1bb1 --- /dev/null +++ b/egs/voxceleb/v1.1/default_config.sh @@ -0,0 +1 @@ +global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh \ No newline at end of file diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh 
b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh new file mode 100644 index 00000000..fdb3147f --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# Channel-freq-wise SE ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_cfwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh new file mode 100644 index 00000000..dbbf6fa7 --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh @@ -0,0 +1,45 @@ +# Channel-wise SE ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_cwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_cwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + + +nnet_s2_base_cfg=conf/train_cwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v2.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v2.0.sh new file mode 100644 index 00000000..0532754f --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v2.0.sh @@ -0,0 +1,53 @@ +# ECAPA-TDNN large + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn2048x4.v2.0 + +nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v2.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0040.pth +nnet_s1=$nnet_s1_dir/model_ep0030.pth +nnet_s1=$nnet_s1_dir/model_ep0020.pth +#nnet_s1=$nnet_s1_dir/model_ep0010.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/model_ep0010.pth +#nnet_s2=$nnet_s2_dir/model_ep0020.pth +#nnet_s2=$nnet_s2_dir/model_ep0010.pth +#nnet_s2=$nnet_s2_dir/model_ep0005.pth +#nnet_s2=$nnet_s2_dir/model_ep0002.pth +#nnet_s2=$nnet_s2_dir/model_ep0001.pth +#nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh new file mode 100644 index 00000000..f2622b0e --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh @@ -0,0 +1,44 @@ +# ECAPA-TDNN large + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn2048x4.v3.0 + +nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v2.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v2.0.sh new file mode 100644 index 00000000..1f6eb371 --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v2.0.sh @@ -0,0 +1,45 @@ +# ECAPA-TDNN small + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn512x3.v2.0 + +nnet_s1_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1_v2.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0040.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0030.pth +nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh new file mode 100644 index 00000000..a3ad0c29 --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh @@ -0,0 
+1,45 @@ +# ECAPA-TDNN small + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn512x3.v3.0 + +nnet_s1_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0040.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0030.pth +nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh index 2b4f07a7..ecd076c8 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -1,4 +1,4 @@ -# Time SE Res2Net50 w26s4 x-vector with mixed precision training +# ECAPA-TDNN small # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,56 +9,26 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg nnet_type=resnet1d -block_type=seres2bn # squeeze-excitation res2net bottleneck -channels=512 -ep_channels=1536 -width_factor=1 -scale=8 -se_r=4 -dropout=0 -attstats_inner=128 +batch_size_1gpu=32 +eff_batch_size=512 # effective batch size +dropout=0 embed_dim=256 +lr=0.05 s=30 margin_warmup=20 margin=0.3 +nnet_num_epochs=70 -nnet_opt="--resnet_enc.in-feats 80 \ - --resnet_enc.in-conv-channels $channels \ - --resnet_enc.in-kernel-size 5 \ - --resnet_enc.in-stride 1 \ - --resnet_enc.resb-type $block_type \ - --resnet_enc.resb-repeats 1 1 1 \ - --resnet_enc.resb-channels $channels \ - --resnet_enc.resb-kernel-sizes 3 \ - --resnet_enc.resb-dilations 2 3 4 \ - --resnet_enc.resb-strides 1 \ - --resnet_enc.res2net-width-factor $width_factor \ - --resnet_enc.res2net-scale $scale \ - --resnet_enc.se-r $se_r \ - --resnet_enc.multilayer \ - --resnet_enc.multilayer-concat \ - --resnet_enc.endpoint-channels $ep_channels \ - --pool_net.pool-type ch-wise-att-mean+stddev \ - --pool_net.inner-feats $attstats_inner \ - --embed-dim $embed_dim" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +xvec_train_base_cfg=conf/train_ecapatdnn_xvec_default.yaml +xvec_train_args="--data.train.sampler.min-batch-size $batch_size_1gpu 
--data.val.sampler.min-batch-size $batch_size_1gpu" nnet_name=${feat_type}_ecapatdnn512x3_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 + nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh index 0765b60d..aae5f68e 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh @@ -9,35 +9,29 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.01 +# x-vector cfg +nnet_type=efficientnet -nnet_type=efficientnet-b4 +effnet_type=efficientnet-b4 dropout=0 embed_dim=256 -se_r=4 - s=30 margin_warmup=20 margin=0.3 +se_r=4 -nnet_opt="--effnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --se-r $se_r --fix-stem-head --mbconv-strides 1 1 2 2 1 2 1" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}_is1_mbs1122121_ser${se_r}_fixsh_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +batch_size_1gpu=16 +eff_batch_size=512 # effective batch size +lr=0.01 nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_base_cfg=conf/train_effnetb4_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr" + +nnet_s1_name=${feat_type}_${effnet_type}_is1_mbs1122121_ser${se_r}_fixsh_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml @@ -51,4 +45,3 @@ plda_type=splda lda_dim=200 plda_y_dim=150 plda_z_dim=200 - diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh index 7d1fd1dc..6ddb2b5e 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh @@ -9,35 +9,29 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=2 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.01 +# x-vector cfg +nnet_type=efficientnet -nnet_type=efficientnet-b7 +effnet_type=efficientnet-b7 dropout=0 embed_dim=256 -se_r=4 - s=30 margin_warmup=20 margin=0.3 +se_r=4 -nnet_opt="--effnet-type 
$nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --se-r $se_r --fix-stem-head --mbconv-strides 1 1 2 2 1 2 1 --norm-layer instance-norm-affine --head-norm-layer layer-norm"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-
-nnet_name=${feat_type}_${nnet_type}_is1_mbs1122121_ser${se_r}_fixsh_e${embed_dim}_eina_hln_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1
+batch_size_1gpu=2
+eff_batch_size=512 # effective batch size
+lr=0.01
 nnet_num_epochs=70
-nnet_dir=exp/xvector_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0070.pth
+nnet_s1_base_cfg=conf/train_effnetb4_xvec_default.yaml
+nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model $PWD/conf/efficientnet_b7.yaml --trainer.optim.lr $lr"
+
+nnet_s1_name=${feat_type}_${effnet_type}_is1_mbs1122121_ser${se_r}_fixsh_e${embed_dim}_eina_hln_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0070.pth
 # back-end
 plda_aug_config=conf/reverb_noise_aug.yaml
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh
new file mode 100644
index 00000000..7aa61f00
--- /dev/null
+++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh
@@ -0,0 +1,44 @@
+# FwSE ResNet34 (freq-wise squeeze-excitation)
+
+# acoustic features
+feat_config=conf/fbank80_stmn_16k.yaml
+feat_type=fbank80_stmn
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+nnet_type=resnet
+nnet_name=${feat_type}_fwseresnet34.v3.0
+
+nnet_s1_base_cfg=conf/train_fwseresnet34_xvec_stage1_v3.0.yaml
+nnet_s1_name=$nnet_name.s1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0035.pth
+
+nnet_s2_base_cfg=conf/train_fwseresnet34_xvec_stage2_v3.0.yaml
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth
+
+# back-end
+do_plda=false
+do_snorm=false #true
+do_qmf=false #true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+    plda_data=voxceleb2cat_train
+else
+    plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v2.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v2.0.sh
new file mode 100644
index 00000000..f71545b7
--- /dev/null
+++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v2.0.sh
@@ -0,0 +1,44 @@
+# IdRnd-style ResNet100
+
+# acoustic features
+feat_config=conf/fbank80_stmn_16k.yaml
+feat_type=fbank80_stmn
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+nnet_type=resnet
+nnet_name=${feat_type}_idrnd_resnet100.v2.0
+
+nnet_s1_base_cfg=conf/train_idrnd_resnet100_xvec_stage1_v2.0.yaml
+nnet_s1_name=$nnet_name.s1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0020.pth
+
+nnet_s2_base_cfg=conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml
+nnet_s2_name=${nnet_name}.s2
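+# stage-2 output dir and checkpoint (stage 2 is presumably a fine-tuning
+# pass over the stage-1 model, per the stage2 yaml above)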
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/model_ep0005.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+    plda_data=voxceleb2cat_train
+else
+    plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh
new file mode 100644
index 00000000..003bf978
--- /dev/null
+++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh
@@ -0,0 +1,44 @@
+# IdRnd-style ResNet100
+
+# acoustic features
+feat_config=conf/fbank80_stmn_16k.yaml
+feat_type=fbank80_stmn
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+nnet_type=resnet
+nnet_name=${feat_type}_idrnd_resnet100.v3.0
+
+nnet_s1_base_cfg=conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml
+nnet_s1_name=$nnet_name.s1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0029.pth
+
+nnet_s2_base_cfg=conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+    plda_data=voxceleb2cat_train
+else
+    plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet202.v2.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet202.v2.0.sh
new file mode 100644
index 00000000..3de2f432
--- /dev/null
+++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet202.v2.0.sh
@@ -0,0 +1,45 @@
+# Voxsrc22 Ravana ResNet202 network
+
+# acoustic features
+feat_config=conf/fbank80_stmn_16k.yaml
+feat_type=fbank80_stmn
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+nnet_type=resnet
+nnet_name=${feat_type}_idrnd_resnet202.v2.0
+
+nnet_s1_base_cfg=conf/train_idrnd_resnet202_xvec_stage1_v2.0.yaml
+nnet_s1_name=$nnet_name.s1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0040.pth
+
+nnet_s2_base_cfg=conf/train_idrnd_resnet202_xvec_stage2_v2.0.yaml
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/model_ep0030.pth
+nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+    plda_data=voxceleb2cat_train
+else
+    plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lresnet34_345_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lresnet34_345_arcs30m0.3_adam_lr0.05_amp.v1.sh
index dbab12ae..9bfb7bb7 100644
--- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lresnet34_345_arcs30m0.3_adam_lr0.05_amp.v1.sh
+++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lresnet34_345_arcs30m0.3_adam_lr0.05_amp.v1.sh
@@ -1,38 +1,35 @@
-# LResNet34_345 (multi-level feature) x-vector with mixed precision training
+# LResNet34-345 multi-level-feature x-vector with mixed precision training
 # acoustic features
 feat_config=conf/fbank80_stmn_16k.yaml
 feat_type=fbank80_stmn
+#vad
+vad_config=conf/vad_16k.yaml
 # x-vector training
 nnet_data=voxceleb2cat_train
-nnet_num_augs=6
-aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml"
-batch_size_1gpu=64
-eff_batch_size=512 # effective batch size
-ipe=$nnet_num_augs
-min_chunk=4
-max_chunk=4
-lr=0.05
+# x-vector cfg
+
+nnet_type=resnet
-nnet_type=lresnet34_345
+resnet_type=lresnet34_345
+batch_size_1gpu=128
+eff_batch_size=512 # effective batch size
 dropout=0
 embed_dim=256
-
-loss_type=arc-softmax
+lr=0.05
 s=30
 margin_warmup=20
 margin=0.3
+nnet_num_epochs=70
-nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool"
+xvec_train_base_cfg=conf/train_resnet34_xvec_default.yaml
+xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type"
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
+nnet_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1
-nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth @@ -52,3 +47,4 @@ lda_dim=200 plda_y_dim=150 plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lspinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lspinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh index 9ba45ab5..2afe35ef 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lspinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lspinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -4,35 +4,31 @@ feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +#vad +vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg +nnet_type=spinenet + +spinenet_type=lspinenet49 batch_size_1gpu=64 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=lspinenet49 dropout=0 embed_dim=256 - -loss_type=arc-softmax +lr=0.05 s=30 margin_warmup=20 margin=0.3 +nnet_num_epochs=70 -nnet_opt="--spinenet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" +xvec_train_base_cfg=conf/train_spinenet49_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${spinenet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth @@ -49,4 +45,3 @@ plda_type=splda lda_dim=200 plda_y_dim=150 plda_z_dim=200 - diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh index 7cfe8894..9082799e 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -9,37 +9,31 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=64 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg -nnet_type=res2net34 +nnet_type=resnet + +resnet_type=res2net34 +batch_size_1gpu=32 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 -width_factor=1 -scale=4 -ws_tag=w16s4 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" - 
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +width_factor=1 +scale=4 +ws_tag=w16s4 nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_base_cfg=conf/train_res2net50_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" + +nnet_s1_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh index 75f3bbbd..f2e22b45 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -1,4 +1,4 @@ -# Res2Net34 w26 s4 x-vector with mixed precision training +# Res2Net34 w26s4 x-vector with mixed precision training # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,37 +9,31 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=64 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg -nnet_type=res2net34 +nnet_type=resnet + +resnet_type=res2net34 +batch_size_1gpu=32 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +width_factor=1.625 +scale=4 +ws_tag=w26s4 nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_base_cfg=conf/train_res2net50_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" + +nnet_s1_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name 
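+# final stage-1 checkpoint (epoch 70); the w26s4 tag follows the Res2Net
+# paper naming, i.e. base width 26 at scale 4, obtained here from
+# width_factor=1.625 and scale=4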
+nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh index cbd13a22..bc828375 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -9,37 +9,32 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg -nnet_type=res2net50 +nnet_type=resnet + +resnet_type=res2net50 +batch_size_1gpu=16 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 -width_factor=1.625 -scale=8 -ws_tag=w13s8 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 +width_factor=1.625 +scale=8 +ws_tag=w13s8 +nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_s1_base_cfg=conf/train_res2net50_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh index 4c7e6fc5..0c2e825a 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -9,37 +9,31 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg + +nnet_type=resnet -nnet_type=res2net50 +resnet_type=res2net50 +batch_size_1gpu=16 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 
--no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +width_factor=1.625 +scale=4 +ws_tag=w26s4 nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_base_cfg=conf/train_res2net50_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" + +nnet_s1_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml @@ -54,3 +48,4 @@ lda_dim=200 plda_y_dim=150 plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh index db3bfea8..49fd61fa 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -9,37 +9,31 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=24 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg -nnet_type=res2net50 +nnet_type=resnet + +resnet_type=res2net50 +batch_size_1gpu=16 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +width_factor=3.25 +scale=8 +ws_tag=w26s8 nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_base_cfg=conf/train_res2net50_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" + +nnet_s1_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end 
 plda_aug_config=conf/reverb_noise_aug.yaml
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh
index c2191649..505ed8bc 100644
--- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh
+++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh
@@ -1,4 +1,4 @@
-# Res2Net50 w26s8 x-vector with mixed precision training
+# Res2Net50 w26s8 x-vector with mixed precision training and SWA

 # acoustic features
 feat_config=conf/fbank80_stmn_16k.yaml
@@ -9,37 +9,31 @@ vad_config=conf/vad_16k.yaml
 # x-vector training
 nnet_data=voxceleb2cat_train
-nnet_num_augs=6
-aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml"
-batch_size_1gpu=24
-eff_batch_size=512 # effective batch size
-ipe=$nnet_num_augs
-min_chunk=4
-max_chunk=4
-lr=0.05
+# x-vector cfg
-nnet_type=res2net50
+nnet_type=resnet
+
+resnet_type=res2net50
+batch_size_1gpu=16
+eff_batch_size=512 # effective batch size
 dropout=0
 embed_dim=256
-width_factor=3.25
-scale=8
-ws_tag=w26s8
-
+lr=0.05
 s=30
 margin_warmup=20
 margin=0.3
-
-nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 70 --swa-lr 1e-3 --swa-anneal-epochs 5"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-
-nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp_swa.v1
+width_factor=3.25
+scale=8
+ws_tag=w26s8
 nnet_num_epochs=90
-nnet_dir=exp/xvector_nnets/$nnet_name
-nnet=$nnet_dir/swa_model_ep0091.pth
+nnet_s1_base_cfg=conf/train_res2net50_xvec_default.yaml
+nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --trainer.epochs $nnet_num_epochs --trainer.swa-start 70 --trainer.swa-lr 1e-3 --trainer.swa-anneal-epochs 5"
+
+nnet_s1_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp_swa.v1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/swa_model_ep0091.pth
 # back-end
 plda_aug_config=conf/reverb_noise_aug.yaml
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh
new file mode 100644
index 00000000..b194d1bd
--- /dev/null
+++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh
@@ -0,0 +1,44 @@
+# ResNet34
+
+# acoustic features
+feat_config=conf/fbank80_stmn_16k.yaml
+feat_type=fbank80_stmn
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+nnet_type=resnet
+nnet_name=${feat_type}_resnet34.v3.0
+
+nnet_s1_base_cfg=conf/train_resnet34_xvec_stage1_v3.0.yaml
+nnet_s1_name=$nnet_name.s1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0035.pth
+
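+# stage 2 continues from the stage-1 model; its final checkpoint is a
+# stochastic weight averaging (SWA) model, hence the swa_ prefix below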
+nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh index fc62c86b..9c787210 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -9,33 +9,27 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg +nnet_type=resnet + +resnet_type=resnet34 batch_size_1gpu=32 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=resnet34 dropout=0 embed_dim=256 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 +nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_s1_base_cfg=conf/train_resnet34_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu" -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh index d5f9e623..48dc3c90 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh @@ -1,4 +1,4 @@ -# ResNet34 x-vector with mixed precision training +# ResNet34 x-vector with mixed precision training and SWA # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,33 +9,28 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg + +nnet_type=resnet + +resnet_type=resnet34 batch_size_1gpu=32 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=resnet34 dropout=0 embed_dim=256 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 
+nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" +nnet_s1_base_cfg=conf/train_resnet34_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer $PWD/conf/trainer_swa_default.yaml" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 60 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp_swa.v1 -nnet_num_epochs=80 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/swa_model_ep0081.pth +nnet_s1_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp_swa.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/swa_model_ep0081.pth # back-end @@ -51,3 +46,4 @@ lda_dim=200 plda_y_dim=150 plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_sharded_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_sharded_amp.v1.sh index b172ad91..838a41ae 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_sharded_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_sharded_amp.v1.sh @@ -1,4 +1,4 @@ -# ResNet34 x-vector with mixed precision training +# ResNet34 x-vector with mixed precision training and sharded distrib. 
data parallel # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,34 +9,28 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg + +nnet_type=resnet + +resnet_type=resnet34 batch_size_1gpu=32 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.01 - -nnet_type=resnet34 dropout=0 embed_dim=256 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 +nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --ddp-type oss_sharded_ddp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_s1_base_cfg=conf/train_resnet34_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.ddp-type oss_sharded_ddp" -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_sharded_ddp_amp.v1 -nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/swa_model_ep0071.pth -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_sharded_ddp_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh index f9b8c038..003c8aae 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -1,4 +1,5 @@ -# LResNet34 x-vector with mixed precision training +# ResNet50 x-vector with mixed precision training + # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -8,46 +9,41 @@ feat_type=fbank80_stmn vad_config=conf/vad_16k.yaml # x-vector training -nnet_data=voxceleb2cat -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=resnet +resnet_type=resnet50 batch_size_1gpu=32 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=resnet50 dropout=0 embed_dim=256 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_base_cfg=conf/train_resnet34_xvec_default.yaml 
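+# reuse the ResNet34 training defaults; the args below only override the
+# per-GPU batch size and swap the model type to resnet50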
+nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type"
+
+nnet_s1_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0070.pth
 # back-end
 plda_aug_config=conf/reverb_noise_aug.yaml
 plda_num_augs=6
 if [ $plda_num_augs -eq 0 ]; then
-    plda_data=voxceleb2cat
+    plda_data=voxceleb2cat_train
 else
-    plda_data=voxceleb2cat_augx${plda_num_augs}
+    plda_data=voxceleb2cat_train_augx${plda_num_augs}
 fi
 plda_type=splda
 lda_dim=200
 plda_y_dim=150
 plda_z_dim=200
+
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh
index 8ee1b484..08669114 100644
--- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh
+++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh
@@ -1,4 +1,4 @@
-# SE ResNet34 x-vector with mixed precision training
+# Squeeze-Excitation Res2Net50 w26s4 x-vector with mixed precision training

 # acoustic features
 feat_config=conf/fbank80_stmn_16k.yaml
@@ -9,39 +9,34 @@ vad_config=conf/vad_16k.yaml
 # x-vector training
 nnet_data=voxceleb2cat_train
-nnet_num_augs=6
-aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml"
-batch_size_1gpu=32
-eff_batch_size=512 # effective batch size
-ipe=$nnet_num_augs
-min_chunk=4
-max_chunk=4
-lr=0.05
+# x-vector cfg
-nnet_type=seres2net50
+nnet_type=resnet
+
+resnet_type=seres2net50
+batch_size_1gpu=24
+eff_batch_size=512 # effective batch size
 dropout=0
 embed_dim=256
+lr=0.05
+s=30
+margin_warmup=20
+margin=0.3
 width_factor=1.625
 scale=4
 ws_tag=w26s4
+nnet_num_epochs=70
 se_r=16
-s=30
-margin_warmup=20
-margin=0.3
-
-nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r"
+xvec_train_base_cfg=conf/train_res2net50_xvec_default.yaml
+xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --model.se-r $se_r"
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
+nnet_name=${feat_type}_${resnet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1
-nnet_name=${feat_type}_${nnet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1
-nnet_num_epochs=70
 nnet_dir=exp/xvector_nnets/$nnet_name
 nnet=$nnet_dir/model_ep0070.pth
-
 # back-end
 plda_aug_config=conf/reverb_noise_aug.yaml
 plda_num_augs=6
@@ -55,3 +50,4 @@ lda_dim=200
 plda_y_dim=150
 plda_z_dim=200
+
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh
index 5ea146b6..3a764519 100644
---
a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -4,41 +4,35 @@ feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +#vad +vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg +nnet_type=spinenet + +spinenet_type=spine2net49 batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=spine2net49 dropout=0 embed_dim=256 - -loss_type=arc-softmax +lr=0.05 s=30 margin_warmup=20 margin=0.3 - width_factor=1.625 scale=4 ws_tag=w26s4 - -nnet_opt="--spinenet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}_${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_base_cfg=conf/train_spinenet49_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" + +nnet_s1_name=${feat_type}_${spinenet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml @@ -52,4 +46,3 @@ plda_type=splda lda_dim=200 plda_y_dim=150 plda_z_dim=200 - diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh index 6aa20991..e12ab940 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -4,41 +4,35 @@ feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +#vad +vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg +nnet_type=spinenet + +spinenet_type=spine2net49s batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=spine2net49s dropout=0 embed_dim=256 - -loss_type=arc-softmax +lr=0.05 s=30 margin_warmup=20 margin=0.3 - width_factor=1.625 scale=4 ws_tag=w26s4 - -nnet_opt="--spinenet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr 
--lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-
-nnet_name=${feat_type}_${nnet_type}_${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1
 nnet_num_epochs=70
-nnet_dir=exp/xvector_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0070.pth
+nnet_s1_base_cfg=conf/train_spinenet49_xvec_default.yaml
+nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale"
+
+nnet_s1_name=${feat_type}_${spinenet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0070.pth
 # back-end
 plda_aug_config=conf/reverb_noise_aug.yaml
@@ -52,4 +46,3 @@ plda_type=splda
 lda_dim=200
 plda_y_dim=150
 plda_z_dim=200
-
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh
index 28418a2e..f452baae 100644
--- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh
+++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh
@@ -4,37 +4,32 @@ feat_config=conf/fbank80_stmn_16k.yaml
 feat_type=fbank80_stmn
+#vad
+vad_config=conf/vad_16k.yaml
 # x-vector training
 nnet_data=voxceleb2cat_train
-nnet_num_augs=6
-aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml"
+# x-vector cfg
+nnet_type=spinenet
+
+spinenet_type=spinenet49
 batch_size_1gpu=16
 eff_batch_size=512 # effective batch size
-ipe=$nnet_num_augs
-min_chunk=4
-max_chunk=4
-lr=0.05
-
-nnet_type=spinenet49
 dropout=0
 embed_dim=256
-
-loss_type=arc-softmax
+lr=0.05
 s=30
 margin_warmup=20
 margin=0.3
+nnet_num_epochs=70
-nnet_opt="--spinenet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
+nnet_s1_base_cfg=conf/train_spinenet49_xvec_default.yaml
+nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type"
-nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1
-nnet_num_epochs=70
-nnet_dir=exp/xvector_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0070.pth
+nnet_s1_name=${feat_type}_${spinenet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0070.pth
 # back-end
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh
index 8be0e057..d17e2862 100644
--- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh
+++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh
@@ -4,38 +4,32 @@ feat_config=conf/fbank80_stmn_16k.yaml
feat_type=fbank80_stmn +#vad +vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=64 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg +nnet_type=spinenet -nnet_type=spinenet49s +spinenet_type=spinenet49s +batch_size_1gpu=32 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 - -loss_type=arc-softmax +lr=0.05 s=30 margin_warmup=20 margin=0.3 - -nnet_opt="--spinenet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_base_cfg=conf/train_spinenet49_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type" + +nnet_s1_name=${feat_type}_${spinenet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml @@ -49,4 +43,3 @@ plda_type=splda lda_dim=200 plda_y_dim=150 plda_z_dim=200 - diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh index 7a3b0351..547020b1 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -1,4 +1,4 @@ -# Time SE Res2Net50 w26s4 x-vector with mixed precision training +# Time-Squeeze-Excitation Res2Net50 w26s4 x-vector with mixed precision training # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,38 +9,32 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg -nnet_type=tseres2net50 +nnet_type=resnet + +resnet_type=tseres2net50 +batch_size_1gpu=24 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 +lr=0.05 +s=30 +margin_warmup=20 +margin=0.3 width_factor=1.625 scale=4 ws_tag=w26s4 +nnet_num_epochs=70 se_r=256 -s=30 -margin_warmup=20 -margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 
1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_base_cfg=conf/train_res2net50_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --model.se-r $se_r" +nnet_s1_name=${feat_type}_${resnet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh new file mode 100644 index 00000000..00622772 --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# Time-SE ResNet34 x-vector v3.0, two-stage training + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_tseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_tseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + +nnet_s2_base_cfg=conf/train_tseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=false #true +do_qmf=false #true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh index 35a146a5..63cde868 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -9,34 +9,30 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg + +nnet_type=resnet + +resnet_type=tseresnet34 batch_size_1gpu=32 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=tseresnet34 dropout=0 embed_dim=256 se_r=16 +lr=0.05 s=30 margin_warmup=20 margin=0.3 +nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --se-r $se_r" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" 
+nnet_s1_base_cfg=conf/train_resnet34_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.se-r $se_r" -nnet_name=${feat_type}_${nnet_type}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_name=${feat_type}_${resnet_type}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh index 48f54f8b..e465c525 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -4,41 +4,36 @@ feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +# vad +vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg +nnet_type=spinenet + +spinenet_type=tsespine2net49 batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=tsespine2net49 dropout=0 embed_dim=256 - -loss_type=arc-softmax +lr=0.05 s=30 margin_warmup=20 margin=0.3 - width_factor=1.625 scale=4 -se_r=256 ws_tag=w26s4 +se_r=256 +nnet_num_epochs=70 -nnet_opt="--spinenet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_s1_base_cfg=conf/train_spinenet49_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --model.se-r $se_r" -nnet_name=${feat_type}_${nnet_type}_${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_name=${feat_type}_${spinenet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh index 6253ee54..975e2aba 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -4,41 +4,36 @@ feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +# vad 
+vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg +nnet_type=spinenet + +spinenet_type=tsespine2net49s batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=tsespine2net49s dropout=0 embed_dim=256 - -loss_type=arc-softmax +lr=0.05 s=30 margin_warmup=20 margin=0.3 - width_factor=1.625 scale=4 -se_r=256 ws_tag=w26s4 +se_r=256 +nnet_num_epochs=70 -nnet_opt="--spinenet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_s1_base_cfg=conf/train_spinenet49_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --model.se-r $se_r" -nnet_name=${feat_type}_${nnet_type}_${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_name=${feat_type}_${spinenet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end @@ -53,4 +48,3 @@ plda_type=splda lda_dim=200 plda_y_dim=150 plda_z_dim=200 - diff --git a/egs/voxceleb/v1.1/local b/egs/voxceleb/v1.1/local deleted file mode 120000 index 740b697d..00000000 --- a/egs/voxceleb/v1.1/local +++ /dev/null @@ -1 +0,0 @@ -../v1/local/ \ No newline at end of file diff --git a/egs/voxceleb/v1.1/local/attack_analysis.py b/egs/voxceleb/v1.1/local/attack_analysis.py new file mode 100755 index 00000000..2e0fdb42 --- /dev/null +++ b/egs/voxceleb/v1.1/local/attack_analysis.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + + +import sys +import os +import argparse +import time +import logging + +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.np.metrics.verification_evaluator import ( + VerificationAdvAttackEvaluator as Eval, +) + + +def evaluate_attacks( + key_file, + clean_score_file, + attack_score_files, + attack_stats_files, + output_path, + prior, +): + + output_dir = os.path.dirname(output_path) + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + + evaluator = Eval( + key_file, clean_score_file, attack_score_files, attack_stats_files, prior + ) + + # performance vs SNR + logging.info("compute perf vs snr for all trials") + df_clean = evaluator.compute_dcf_eer(return_df=True) + df_clean.insert(0, "snr", np.inf) + + df = evaluator.compute_dcf_eer_vs_stats( + "snr", + [-10, 0, 10, 20, 30, 40, 50, 60], + "all", + higher_better=True, + return_df=True, + ) + file_path = "%s_attack_all_snr_results.csv" % (output_path) + df = pd.concat([df_clean, df], ignore_index=True) + 
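+    # Row 0 of the concatenated frame is the clean (no-attack) operating
+    # point, tagged with snr=inf above; plot_dcf_eer_vs_stat_v1 reads it
+    # through clean_ref=0 to anchor the degradation curves.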
df.to_csv(file_path) + file_path = "%s_attack_all_snr" % (output_path) + evaluator.plot_dcf_eer_vs_stat_v1( + df, "snr", file_path, clean_ref=0, xlabel="SNR(dB)", higher_better=True + ) + + logging.info("compute perf vs snr for tar trials") + df = evaluator.compute_dcf_eer_vs_stats( + "snr", + [-10, 0, 10, 20, 30, 40, 50, 60], + "tar", + higher_better=True, + return_df=True, + ) + file_path = "%s_attack_tar_snr_results.csv" % (output_path) + df = pd.concat([df_clean, df], ignore_index=True) + df.to_csv(file_path) + file_path = "%s_attack_tar_snr" % (output_path) + evaluator.plot_dcf_eer_vs_stat_v1( + df, "snr", file_path, clean_ref=0, xlabel="SNR(dB)", higher_better=True + ) + + logging.info("compute perf vs snr for non trials") + df = evaluator.compute_dcf_eer_vs_stats( + "snr", + [-10, 0, 10, 20, 30, 40, 50, 60], + "non", + higher_better=True, + return_df=True, + ) + file_path = "%s_attack_non_snr_results.csv" % (output_path) + df = pd.concat([df_clean, df], ignore_index=True) + df.to_csv(file_path) + file_path = "%s_attack_non_snr" % (output_path) + evaluator.plot_dcf_eer_vs_stat_v1( + df, "snr", file_path, clean_ref=0, xlabel="SNR(dB)", higher_better=True + ) + + logging.info("find best attacks from snr point of view") + for i in range(len(attack_score_files)): + file_path = "%s_best_snr_tar_attacks_%d.csv" % (output_path, i) + evaluator.save_best_attacks( + file_path, + "snr", + "tar", + num_best=10, + min_delta=1, + attack_idx=i, + higher_better=True, + ) + + file_path = "%s_best_snr_non_attacks_%d.csv" % (output_path, i) + evaluator.save_best_attacks( + file_path, + "snr", + "non", + num_best=10, + min_delta=1, + attack_idx=i, + higher_better=True, + ) + + # performance vs Linf + logging.info("compute perf vs linf for all trials") + eps = np.ceil(np.asarray([0, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]) * 2 ** 15) + df = evaluator.compute_dcf_eer_vs_stats( + "n_linf", eps, "all", higher_better=False, return_df=True + ) + file_path = "%s_attack_all_linf_results.csv" % (output_path) + df.to_csv(file_path) + file_path = "%s_attack_all_linf" % (output_path) + evaluator.plot_dcf_eer_vs_stat_v1( + df, "n_linf", file_path, clean_ref=0, xlabel=r"$L_{\infty}$", log_x=True + ) + + logging.info("compute perf vs linf for tar trials") + df = evaluator.compute_dcf_eer_vs_stats( + "n_linf", eps, "tar", higher_better=False, return_df=True + ) + file_path = "%s_attack_tar_linf_results.csv" % (output_path) + df.to_csv(file_path) + file_path = "%s_attack_tar_linf" % (output_path) + evaluator.plot_dcf_eer_vs_stat_v1( + df, "n_linf", file_path, clean_ref=0, xlabel=r"$L_{\infty}$", log_x=True + ) + + logging.info("compute perf vs linf for non trials") + df = evaluator.compute_dcf_eer_vs_stats( + "n_linf", eps, "non", higher_better=False, return_df=True + ) + file_path = "%s_attack_non_linf_results.csv" % (output_path) + df.to_csv(file_path) + file_path = "%s_attack_non_linf" % (output_path) + evaluator.plot_dcf_eer_vs_stat_v1( + df, "n_linf", file_path, clean_ref=0, xlabel=r"$L_{\infty}$", log_x=True + ) + + # find the best attacks in terms of linf + logging.info("find best attacks from linf point of view") + for i in range(len(attack_score_files)): + file_path = "%s_best_linf_tar_attacks_%d.csv" % (output_path, i) + evaluator.save_best_attacks( + file_path, + "n_linf", + "tar", + num_best=10, + min_delta=1, + attack_idx=i, + higher_better=False, + ) + + file_path = "%s_best_linf_non_attacks_%d.csv" % (output_path, i) + evaluator.save_best_attacks( + file_path, + "n_linf", + "non", + num_best=10, + min_delta=1, + 
attack_idx=i, + higher_better=False, + ) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Analyses performance of adversarial attacks for spk. verif.", + ) + + parser.add_argument("--key-file", required=True) + parser.add_argument("--clean-score-file", required=True) + parser.add_argument("--attack-score-files", required=True, nargs="+") + parser.add_argument("--attack-stats-files", required=True, nargs="+") + parser.add_argument("--output-path", required=True) + parser.add_argument("--prior", default=0.05, type=float) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + evaluate_attacks(**vars(args)) diff --git a/egs/voxceleb/v1.1/local/attack_analysis.sh b/egs/voxceleb/v1.1/local/attack_analysis.sh new file mode 100755 index 00000000..42249873 --- /dev/null +++ b/egs/voxceleb/v1.1/local/attack_analysis.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# Copyright 2020 Johns Hopkins University (Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +cmd=run.pl +prior=0.05 +. parse_options.sh || exit 1; + +if [ $# -ne 5 ]; then + echo "Usage: $0 <key> <clean-scores> <adv-scores> <adv-stats> <output-path>" + exit 1; +fi + +set -e + +key=$1 +clean_scores=$2 +adv_scores="$3" +adv_stats="$4" +output_path=$5 + +output_dir=$(dirname $output_path) +base=$(basename $output_path) +logdir=$output_dir/log +mkdir -p $logdir + +if [ "$(hostname --domain)" == "cm.gemini" ];then + module load texlive +fi + +$cmd $logdir/analysis_${base}.log \ + local/attack_analysis.py \ + --key-file $key \ + --clean-score-file $clean_scores \ + --attack-score-files $adv_scores \ + --attack-stats-files $adv_stats \ + --output-path $output_path + +scores_v=($adv_scores) +for((i=0;i<${#scores_v[@]};i++)) +do + scores_dir=$(dirname ${scores_v[$i]}) + wav_out_dir0=${output_path}_wavs + + for t in tar non + do + if [ "$t" == "tar" ];then + t2=tar2non + else + t2=non2tar + fi + wav_in_dir=$scores_dir/wav/$t2 + if [ ! -d "$wav_in_dir" ];then + continue + fi + for m in snr linf + do + best_file=${output_path}_best_${m}_${t}_attacks_$i.csv + if [ ! -f $best_file ];then + continue + fi + wav_out_dir=${wav_out_dir0}/best_${m}_${t}_attacks_$i + mkdir -p $wav_out_dir + for f in $(awk -F "," 'BEGIN{getline;}{ print $2"-"$3".wav"}' $best_file) + do + ff=$wav_in_dir/$f + if [ -f $ff ];then + cp -v $ff $wav_out_dir > $logdir/copywavs_${base}.log 2>&1 + fi + done + done + done +done + + diff --git a/egs/voxceleb/v1.1/local/calibrate_voxceleb1_o_clean.sh b/egs/voxceleb/v1.1/local/calibrate_voxceleb1_o_clean.sh new file mode 100755 index 00000000..736c3fb0 --- /dev/null +++ b/egs/voxceleb/v1.1/local/calibrate_voxceleb1_o_clean.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright 2019 Johns Hopkins University (Jesus Villalba) +# Apache 2.0. +# + +set -e + +cmd=run.pl +prior=0.05 +l2_reg=1e-5 + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; + + +if [ $# -ne 1 ]; then + echo "Usage: $0 <score-dir>" + exit 1; +fi + +score_dir=$1 +cal_score_dir=${score_dir}_cal_v1 + +mkdir -p $cal_score_dir + +echo "$0 train calibration on VoxCeleb1 Original Clean" + +model_file=$cal_score_dir/cal_tel.h5 +train_scores=$score_dir/voxceleb1_scores +train_key=data/voxceleb1_test/trials_o_clean + +$cmd $cal_score_dir/train_cal_tel.log \ + steps_be/train-calibration-v1.py --score-file $train_scores \ + --key-file $train_key --model-file $model_file --prior $prior --lambda-reg $l2_reg + +ndxs=(voxceleb1_test/trials_o_clean) +scores=(voxceleb1) +n_ndx=${#ndxs[*]} +for((i=0;i<$n_ndx;i++)) +do + echo "$0 eval calibration on ${scores[$i]}" + scores_in=$score_dir/${scores[$i]}_scores + scores_out=$cal_score_dir/${scores[$i]}_scores + ndx=data/${ndxs[$i]} + $cmd $cal_score_dir/eval_cal_${scores[$i]}.log \ + steps_be/eval-calibration-v1.py --in-score-file $scores_in \ + --ndx-file $ndx --model-file $model_file --out-score-file $scores_out & + +done +wait + + + + + diff --git a/egs/voxceleb/v1.1/local/make_musan.py b/egs/voxceleb/v1.1/local/make_musan.py new file mode 100755 index 00000000..b0ae6846 --- /dev/null +++ b/egs/voxceleb/v1.1/local/make_musan.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +# Copyright 2015 David Snyder +# Copyright 2019 Johns Hopkins University (Jesus Villalba) (added fs support) +# Apache 2.0. +# +# This file is meant to be invoked by make_musan.sh. + +import os, sys + + +def process_music_annotations(path): + utt2spk = {} + utt2vocals = {} + lines = open(path, "r").readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musician ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals + + +def prepare_music(root_dir, fs, use_vocals): + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In music directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def prepare_speech(root_dir, fs): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + 
utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In speech directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def prepare_noise(root_dir, fs): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In noise directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def main(): + in_dir = sys.argv[1] + fs = int(sys.argv[2]) + out_dir = sys.argv[3] + use_vocals = sys.argv[4] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), "w") + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), "w") + utt2spk_fi.write(utt2spk) + + +if __name__ == "__main__": + main() diff --git a/egs/voxceleb/v1.1/local/make_musan.sh b/egs/voxceleb/v1.1/local/make_musan.sh new file mode 100755 index 00000000..4a6d30f9 --- /dev/null +++ b/egs/voxceleb/v1.1/local/make_musan.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Copyright 2019 Johns Hopkins University (Jesus Villalba) (added fs support) +# Apache 2.0. +# +# This script, called by ../run.sh, creates the MUSAN +# data directory. The required dataset is freely available at +# http://www.openslr.org/17/ + +set -e +use_vocals='Y' + +. parse_options.sh || exit 1; + +if [ $# -ne 3 ];then + echo "Usage: $0 [options] <in-dir> <fs 8/16> <out-data-dir>"; + echo "e.g.: $0 /export/corpora/JHU/musan 8 data" + exit 1; +fi + +in_dir=$1 +fs=$2 +data_dir=$3 + +mkdir -p $data_dir/musan.tmp + +echo "Preparing ${data_dir}/musan..." 
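+# Illustrative invocation (the corpus path is an example, not fixed by this
+# script): local/make_musan.sh /export/corpora/JHU/musan 16 data
+# It builds data/musan plus the music/speech/noise subsets created below.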
+mkdir -p ${data_dir}/musan +local/make_musan.py ${in_dir} $fs ${data_dir}/musan ${use_vocals} + +utils/fix_data_dir.sh ${data_dir}/musan + +grep "music" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_music +grep "speech" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_speech +grep "noise" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_noise +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_music \ + ${data_dir}/musan ${data_dir}/musan_music +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_speech \ + ${data_dir}/musan ${data_dir}/musan_speech +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_noise \ + ${data_dir}/musan ${data_dir}/musan_noise + +utils/fix_data_dir.sh ${data_dir}/musan_music +utils/fix_data_dir.sh ${data_dir}/musan_speech +utils/fix_data_dir.sh ${data_dir}/musan_noise + +rm -rf $data_dir/musan.tmp + diff --git a/egs/voxceleb/v1.1/local/make_rirs_data.sh b/egs/voxceleb/v1.1/local/make_rirs_data.sh new file mode 100755 index 00000000..c6652eda --- /dev/null +++ b/egs/voxceleb/v1.1/local/make_rirs_data.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# +# Copyright 2020 Johns Hopkins University (Jesus Villalba) +# +# Apache 2.0. +set -e + +if [ $# != 3 ]; then + echo "Usage: $0 <rir-dir> <fs 8/16> <data-dir>" + echo "e.g.: $0 RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom" + exit 1; +fi + +rir_dir=$1 +fs=$2 +data_dir=$3 + +mkdir -p $data_dir + +rir_list=$rir_dir/rir_list +if [ "$fs" -eq 16 ];then + awk '{ key=$5; sub(/.*\//,"",key); print key,$5 }' $rir_list > $data_dir/wav.scp +else + awk '{ +key=$5; sub(/.*\//,"",key); +print key,"sox "$5" -r 8000 -t wav -b 16 -e signed-integer - |" }' \ + $rir_list > $data_dir/wav.scp +fi +awk '{ key=$5; sub(/.*\//,"",key); print key,$4 }' $rir_list > $data_dir/rir2room + diff --git a/egs/voxceleb/v1/local/make_some_figs.py b/egs/voxceleb/v1.1/local/make_some_figs.py similarity index 99% rename from egs/voxceleb/v1/local/make_some_figs.py rename to egs/voxceleb/v1.1/local/make_some_figs.py index 207cab20..a4117aba 100755 --- a/egs/voxceleb/v1/local/make_some_figs.py +++ b/egs/voxceleb/v1.1/local/make_some_figs.py @@ -9,7 +9,7 @@ import pandas as pd from hyperion.hyp_defs import float_cpu, config_logger -from hyperion.metrics.verification_evaluator import ( +from hyperion.np.metrics.verification_evaluator import ( VerificationAdvAttackEvaluator as Eval, ) diff --git a/egs/voxceleb/v1/local/make_train_lists_sup_embed_with_augm.sh b/egs/voxceleb/v1.1/local/make_train_lists_sup_embed_with_augm.sh similarity index 100% rename from egs/voxceleb/v1/local/make_train_lists_sup_embed_with_augm.sh rename to egs/voxceleb/v1.1/local/make_train_lists_sup_embed_with_augm.sh diff --git a/egs/voxceleb/v1.1/local/make_trials_subset.py b/egs/voxceleb/v1.1/local/make_trials_subset.py new file mode 100755 index 00000000..da230842 --- /dev/null +++ b/egs/voxceleb/v1.1/local/make_trials_subset.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import logging +import numpy as np + +from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.utils import SparseTrialKey + + +def make_trials(in_key_file, out_key_file, ntar, nnon, seed): + + rng = np.random.RandomState(seed=seed) + + logging.info("Load key: %s" % in_key_file) + key = SparseTrialKey.load_txt(in_key_file) + + nz_idx = 
key.tar.nonzero() + nnz = len(nz_idx[0]) + p = rng.permutation(nnz)[ntar:] + nz_idx = (nz_idx[0][p], nz_idx[1][p]) + key.tar[nz_idx] = False + + nz_idx = key.non.nonzero() + nnz = len(nz_idx[0]) + p = rng.permutation(nnz)[nnon:] + nz_idx = (nz_idx[0][p], nz_idx[1][p]) + key.non[nz_idx] = False + + logging.info("Saving key: %s" % out_key_file) + key.save_txt(out_key_file) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Makes a subset of a trial key") + + parser.add_argument("--in-key-file", required=True) + parser.add_argument("--out-key-file", required=True) + parser.add_argument("--ntar", required=True, type=int) + parser.add_argument("--nnon", required=True, type=int) + parser.add_argument("--seed", default=112358, type=int) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + make_trials(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1.1/local/make_vox2_trials.py b/egs/voxceleb/v1.1/local/make_vox2_trials.py new file mode 100755 index 00000000..95a69cf1 --- /dev/null +++ b/egs/voxceleb/v1.1/local/make_vox2_trials.py @@ -0,0 +1,83 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict +import logging +from pathlib import Path +import math +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger +from hyperion.utils.segment_set import SegmentSet + + +def make_trials_single_gender(ft, fm, fs, segments, num_tar_trials, num_spks): + + # select spks + rng = np.random.RandomState(seed=1123) + spks = segments["class_id"].unique() + spks = rng.choice(spks, size=(num_spks,), replace=False) + snorm_segments = segments[~segments["class_id"].isin(spks)] + for seg, spk in zip(snorm_segments["id"], snorm_segments["class_id"]): + fs.write("%s %s\n" % (seg, spk)) + + segments = segments[segments["class_id"].isin(spks)] + num_segs_per_spk = int( + math.ceil((1 + math.sqrt(1 + 8 * num_tar_trials // num_spks)) / 2) + ) + + n = num_spks * num_segs_per_spk + print(num_segs_per_spk, n, num_tar_trials // num_spks, num_spks, len(spks)) + seg_ids = rng.choice(segments["id"], size=(n,), replace=False) + segments = segments[segments["id"].isin(seg_ids)] + seg_ids = segments["id"].values + class_ids = segments["class_id"].values + ntar = 0 + nnon = 0 + for i in range(n - 1): + for j in range(i + 1, n): + t = "target" if class_ids[i] == class_ids[j] else "nontarget" + ft.write("%s %s %s\n" % (seg_ids[i], seg_ids[j], t)) + if t == "target": + ntar += 1 + else: + nnon += 1 + + logging.info("Got ntar=%d and nnon=%d", ntar, nnon) + for i in range(n - 1): + fm.write("%s %s\n" % (seg_ids[i], seg_ids[i])) + + +def make_trials(data_dir, num_1k_tar_trials, num_spks): + config_logger(1) + logging.info("Making trial list for %s", data_dir) + data_dir = Path(data_dir) + segments = SegmentSet.load(data_dir / "utt2spk") + gender = SegmentSet.load(data_dir / "spk2gender") + segments["gender"] = gender.loc[segments["class_id"], "class_id"].values + + num_tar_trials = num_1k_tar_trials * 1000 // 2 + num_spks = num_spks // 2 + with open(data_dir / "trials", "w") as ft, open( + data_dir / "utt2model", "w" + ) as fm, open(data_dir / "snorm_utt2spk", "w") as fs: + segs_m = SegmentSet(segments.loc[segments["gender"] == "m"]) + make_trials_single_gender(ft, fm, 
fs, segs_m, num_tar_trials, num_spks) + segs_f = SegmentSet(segments.loc[segments["gender"] == "f"]) + make_trials_single_gender(ft, fm, fs, segs_f, num_tar_trials, num_spks) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="makes a trial list for vox2 dev") + + parser.add_argument("--data-dir", required=True, help="Path to dataset") + parser.add_argument( + "--num-1k-tar-trials", type=int, default=30, help="thousands of target trials" + ) + parser.add_argument("--num-spks", type=int, default=1000, help="number of speakers") + args = parser.parse_args() + make_trials(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1.1/local/make_voxceleb1_o.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_o.pl new file mode 100755 index 00000000..dce92245 --- /dev/null +++ b/egs/voxceleb/v1.1/local/make_voxceleb1_o.pl @@ -0,0 +1,180 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# 2020 Jesus Villalba +# +# Usage: make_voxceleb1_o.pl /export/voxceleb1 data/ +# Create trial lists for Voxceleb1 original, +# with cleaned and non-cleaned versions +# Attention: +# - This script is for the old version of the dataset without anonymized speaker-ids +# - This script assumes that the voxceleb1 dataset has all speaker directories +# dumped in the same wav directory, NOT separated dev and test directories + + +if (@ARGV != 2) { + print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n"; + print STDERR "e.g. $0 /export/voxceleb1 data/\n"; + exit(1); +} + +($data_base, $out_dir) = @ARGV; +my $out_dir = "$out_dir/voxceleb1_test"; + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} + +my $url_base="http://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta"; +my @trials_basename = ("very_test.txt", "very_test2.txt"); +my @trials_url = ("$url_base/veri_test.txt", "$url_base/veri_test2.txt"); +my @trials = ("trials_o", "trials_o_clean"); + +my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv"; +my $meta_path = "$data_base/vox1_meta.csv"; +if (! -e "$meta_path") { + $meta_path = "$out_dir/vox1_meta.csv"; + system("wget -O $meta_path $meta_url"); +} + +open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path"; +my %id2spkr = (); +my %spkr2gender = (); +my %spkr2nation = (); +while (<META_IN>) { + chomp; + my ($vox_id, $spkr_id, $gender, $nation, $set) = split "\t"; + $id2spkr{$vox_id} = $spkr_id; + $spkr2gender{$spkr_id} = $gender; + $nation =~ s@ @-@g; + $spkr2nation{$spkr_id} = $nation; +} +close(META_IN) or die; + +my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv"; +my $lid_path = "$data_base/lang_vox1_final.csv"; +if (! -e "$lid_path") { + $lid_path = "$out_dir/lang_vox1_final.csv"; + system("wget -O $lid_path $lid_url"); +} +open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; +my %utt2lang = (); +while (<LID_IN>) { + chomp; + my ($utt_id, $lang, $score) = split ','; + my ($vox_id, $vid_id, $file_id) = split '/', $utt_id; + my $spkr_id = $id2spkr{$vox_id}; + my $utt_id = "$spkr_id-$vid_id-00$file_id"; + $utt_id =~ s@\.wav$@@; + $utt2lang{$utt_id} = $lang; +} +close(LID_IN) or die; + +#download trials from voxceleb web page +for($i = 0; $i <= $#trials; $i++) { + + my $file_i = "$out_dir/$trials_basename[$i]"; + my $url_i = $trials_url[$i]; + my $trial_i = "$out_dir/$trials[$i]"; + if (! 
-e $file_i) { + system("wget -O $file_i $url_i"); + } + #mapping from new speaker ids and file-names to old ones + open(TRIAL_IN, "<", "$file_i") or die "Could not open the verification trials file $file_i"; + open(TRIAL_OUT, ">", "$trial_i") or die "Could not open the output file $trial_i"; + while (<TRIAL_IN>) { + chomp; + my ($tar_or_non, $path1, $path2) = split; + + # Create entry for left-hand side of trial + my ($vox_id, $rec_id, $segment) = split('/', $path1); + $segment =~ s/\.wav$//; + my $spkr_id = $id2spkr{$vox_id}; + my $utt_id1 = "$spkr_id-$rec_id-00$segment"; + + # Create entry for right-hand side of trial + my ($vox_id, $rec_id, $segment) = split('/', $path2); + $segment =~ s/\.wav$//; + my $spkr_id = $id2spkr{$vox_id}; + my $utt_id2 = "$spkr_id-$rec_id-00$segment"; + + my $target = "nontarget"; + if ($tar_or_non eq "1") { + $target = "target"; + } + print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; + } + + close(TRIAL_IN) or die; + close(TRIAL_OUT) or die; + +} + + +opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; +open(NAT, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation"; +open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang"; + +foreach (@spkr_dirs) { + my $spkr_id = $_; + my $new_spkr_id = $spkr_id; + # If we're using a newer version of VoxCeleb1, we need to "deanonymize" + # the speaker labels. 
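+  # vox1_meta.csv, loaded into %id2spkr above, provides the mapping from
+  # the anonymized idXXXXX directory names back to the original speaker names.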
+ if (exists $id2spkr{$spkr_id}) { + $new_spkr_id = $id2spkr{$spkr_id}; + } + print GENDER "$new_spkr_id $spkr2gender{$new_spkr_id}\n"; + print NAT "$new_spkr_id $spkr2nation{$new_spkr_id}\n"; + + opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $filename = $_; + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav"; + my $utt_id = "$new_spkr_id-$rec_id-$segment"; + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $new_spkr_id", "\n"; + if (exists $utt2lang{$utt_id}) { + print LANG "$utt_id", " $utt2lang{$utt_id}", "\n"; + } + else { + print LANG "$utt_id N/A\n"; + } + } +} + +close(SPKR) or die; +close(WAV) or die; +close(LANG) or die; +close(GENDER) or die; +close(NAT) or die; + +if (system( + "cat $out_dir/trials_* | sort -u > $out_dir/trials") != 0) { + die "Error creating trials file in directory $out_dir"; +} + +if (system( + "awk '{ print \$1,\$1 }' $out_dir/trials | sort -u > $out_dir/utt2model") != 0) { + die "Error creating utt2model file in directory $out_dir"; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} + diff --git a/egs/voxceleb/v1/local/make_voxceleb1_oeh.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_oeh.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_oeh.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_oeh.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_old.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_old.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_old.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_old.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_orig.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_orig.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_orig.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_orig.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_orig_v2.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_orig_v2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_orig_v2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_orig_v2.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_v2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_v2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_v2.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2_o.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_v2_o.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_v2_o.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_v2_o.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2_oeh.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_v2_oeh.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_v2_oeh.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_v2_oeh.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1cat.pl b/egs/voxceleb/v1.1/local/make_voxceleb1cat.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1cat.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1cat.pl diff --git 
a/egs/voxceleb/v1/local/make_voxceleb1cat_v2.pl b/egs/voxceleb/v1.1/local/make_voxceleb1cat_v2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1cat_v2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1cat_v2.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb2.pl b/egs/voxceleb/v1.1/local/make_voxceleb2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb2.pl diff --git a/egs/voxceleb/v1.1/local/make_voxceleb2cat.pl b/egs/voxceleb/v1.1/local/make_voxceleb2cat.pl new file mode 100755 index 00000000..93b6ad5a --- /dev/null +++ b/egs/voxceleb/v1.1/local/make_voxceleb2cat.pl @@ -0,0 +1,136 @@ +#!/usr/bin/perl +# +# Copyright 2018 Johns Hopkins University (Jesus Villalba) +# Copyright 2018 Ewald Enzinger +# +# Apache 2.0 +# Usage: make_voxceleb2cat.pl /export/voxceleb2cat_train dev 16 data/dev +# +# Note: This script requires ffmpeg to be installed and its location included in $PATH. + +if (@ARGV != 4) { + print STDERR "Usage: $0 <path-to-voxceleb2> <dev|test> <fs 8/16> <path-to-data-dir>\n"; + print STDERR "e.g. $0 /export/voxceleb2 dev 16 data/dev\n"; + exit(1); +} + +# Check that ffmpeg is installed. +if (`which ffmpeg` eq "") { + die "Error: this script requires that ffmpeg is installed."; +} + +($data_base, $dataset, $fs, $out_dir) = @ARGV; + +print "Preparing VoxCeleb2 Cat in $out_dir \n"; + +if ("$dataset" ne "dev" && "$dataset" ne "test") { + die "dataset parameter must be 'dev' or 'test'!"; +} + +my $dataset_path = "" ; +if ( -d "$data_base/$dataset/aac" ){ + $dataset_path = "$data_base/$dataset/aac" +} +else { + $dataset_path = "$data_base/$dataset" +} + + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} + +if (system("mkdir -p $out_dir/lists_cat") != 0) { + die "Error making directory $out_dir/lists_cat"; +} + +print "Reading metadata\n"; +my $meta_url = "https://www.openslr.org/resources/49/vox2_meta.csv"; +my $meta_path = "$data_base/vox2_meta.csv"; +if (! -e "$meta_path") { + $meta_path = "$out_dir/vox2_meta.csv"; + system("wget --no-check-certificate -O $meta_path $meta_url"); +} +open(META_IN, "<", "$meta_path") or die "Could not open the output file $meta_path"; +my %spkr2gender = (); +while (<META_IN>) { + chomp; + my ($spkr, $vox_id, $vgg_id, $gender, $set) = split; + $spkr2gender{$vox_id} = $gender; +} +close(META_IN) or die; + +print "Reading languages estimated voxlingua \n"; +my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox2_final.csv"; +my $lid_path = "$data_base/lang_vox2_final.csv"; +if (! -e "$lid_path") { + $lid_path = "$out_dir/lang_vox2_final.csv"; + system("wget -O $lid_path $lid_url"); +} +open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; +my %utt2lang = (); +while (<LID_IN>) { + chomp; + my ($utt_id, $lang, $score) = split ','; + $utt_id =~ s@/@-@g; + $utt_id =~ s@-[^-]*\.wav$@@; + $utt2lang{$utt_id} = $lang; +} +close(LID_IN) or die; + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang"; +open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; + +opendir my $dh, "$dataset_path" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$dataset_path/$_" && ! 
/^\.{1,2}$/} readdir($dh); +closedir $dh; + +my $num_spkrs = @spkr_dirs; +my $count = 0; +foreach (@spkr_dirs) { + my $spkr_id = $_; + + $count++ ; + print " processing speaker $spkr_id $count / $num_spkrs \n"; + print GENDER "$spkr_id $spkr2gender{$spkr_id}\n"; + + opendir my $dh, "$dataset_path/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$dataset_path/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh); + closedir $dh; + + foreach (@rec_dirs) { + my $rec_id = $_; + my $utt_id = "$spkr_id-$rec_id"; + my $file_list = "$out_dir/lists_cat/$utt_id.txt"; + if (system("find $dataset_path/$spkr_id/$rec_id -name \"*.m4a\" -printf \"file %p\\n\" > $file_list") != 0){ + die "Error creating $file_list"; + } + my $wav = "ffmpeg -v 8 -f concat -safe 0 -i $file_list -f wav -acodec pcm_s16le -|"; + if($fs == 8){ + $wav = $wav." sox -t wav - -t wav -r 8k - |" + } + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $spkr_id", "\n"; + if (exists $utt2lang{$utt_id}) { + print LANG "$utt_id", " $utt2lang{$utt_id}", "\n"; + } + else { + print LANG "$utt_id N/A\n"; + } + } +} +close(SPKR) or die; +close(WAV) or die; +close(LANG) or die; +close(GENDER) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/voxceleb/v1.1/local/prepare_voxsrc22_dev.py b/egs/voxceleb/v1.1/local/prepare_voxsrc22_dev.py new file mode 100755 index 00000000..915de676 --- /dev/null +++ b/egs/voxceleb/v1.1/local/prepare_voxsrc22_dev.py @@ -0,0 +1,88 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict +import logging +from pathlib import Path +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + + +def prepare_voxsrc22_dev(vox1_corpus_dir, voxsrc22_corpus_dir, output_dir, verbose): + config_logger(verbose) + logging.info( + "Preparing corpus %s + %s -> %s", + vox1_corpus_dir, + voxsrc22_corpus_dir, + output_dir, + ) + vox1_corpus_dir = Path(vox1_corpus_dir) + voxsrc22_corpus_dir = Path(voxsrc22_corpus_dir) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + trials_file = voxsrc22_corpus_dir / "voxsrc2022_dev.txt" + df_trials = pd.read_csv( + trials_file, header=None, names=["target", "enroll", "test"], sep=" ", + ) + + trials_file = output_dir / "trials" + logging.info("creating trials file %s", trials_file) + with open(trials_file, "w") as f: + for _, row in df_trials.iterrows(): + t = "target" if row["target"] == 1 else "nontarget" + f.write("%s %s %s\n" % (row["enroll"], row["test"], t)) + + enroll_file = output_dir / "utt2model" + logging.info("creating enrollment file %s", enroll_file) + file_ids = df_trials["enroll"].unique() + with open(enroll_file, "w") as f: + for file_id in file_ids: + f.write("%s %s\n" % (file_id, file_id)) + + u2s_file = output_dir / "utt2spk" + logging.info("creating utt2spk file %s", u2s_file) + file_ids = np.unique(np.concatenate((df_trials["enroll"], df_trials["test"]))) + with open(u2s_file, "w") as f: + for file_id in file_ids: + f.write("%s %s\n" % (file_id, file_id)) + + s2u_file = output_dir / "spk2utt" + 
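+    # VoxSRC22 trials reference raw file ids rather than speaker labels, so
+    # utt2spk and spk2utt are identity mappings over the same file_ids list.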
logging.info("creating spk2utt file %s", s2u_file) + with open(s2u_file, "w") as f: + for file_id in file_ids: + f.write("%s %s\n" % (file_id, file_id)) + + wav_file = output_dir / "wav.scp" + logging.info("creating wav.scp file %s", wav_file) + with open(wav_file, "w") as f: + for file_id in file_ids: + if "VoxSRC2022_dev" in file_id: + wav_file = voxsrc22_corpus_dir / file_id + else: + wav_file = vox1_corpus_dir / "wav" / file_id + + f.write("%s %s\n" % (file_id, wav_file)) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares VoxSRC22 Track1/2 validation data") + + parser.add_argument( + "--vox1-corpus-dir", required=True, help="Path to voxceleb1 v2 dataset" + ) + parser.add_argument( + "--voxsrc22-corpus-dir", required=True, help="Path to voxsrc22 dataset" + ) + + parser.add_argument("--output-dir", required=True, help="Ouput data path prefix") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + prepare_voxsrc22_dev(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1.1/local/prepare_voxsrc22_test.py b/egs/voxceleb/v1.1/local/prepare_voxsrc22_test.py new file mode 100755 index 00000000..e3421fe1 --- /dev/null +++ b/egs/voxceleb/v1.1/local/prepare_voxsrc22_test.py @@ -0,0 +1,73 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict +import logging +from pathlib import Path +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + + +def prepare_voxsrc22_test(corpus_dir, output_dir, verbose): + config_logger(verbose) + logging.info( + "Preparing corpus %s -> %s", corpus_dir, output_dir, + ) + corpus_dir = Path(corpus_dir) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + trials_file = corpus_dir / "Track12_blind.txt" + df_trials = pd.read_csv( + trials_file, header=None, names=["enroll", "test"], sep=" ", + ) + trials_file = output_dir / "trials" + logging.info("creating trials file %s", trials_file) + with open(trials_file, "w") as f: + for _, row in df_trials.iterrows(): + f.write("%s %s\n" % (row["enroll"], row["test"])) + + enroll_file = output_dir / "utt2model" + logging.info("creating enrollment file %s", enroll_file) + file_ids = df_trials["enroll"].unique() + with open(enroll_file, "w") as f: + for file_id in file_ids: + f.write("%s %s\n" % (file_id, file_id)) + + u2s_file = output_dir / "utt2spk" + logging.info("creating utt2spk file %s", u2s_file) + file_ids = np.unique(np.concatenate((df_trials["enroll"], df_trials["test"]))) + with open(u2s_file, "w") as f: + for file_id in file_ids: + f.write("%s %s\n" % (file_id, file_id)) + + s2u_file = output_dir / "spk2utt" + logging.info("creating spk2utt file %s", s2u_file) + with open(s2u_file, "w") as f: + for file_id in file_ids: + f.write("%s %s\n" % (file_id, file_id)) + + wav_file = output_dir / "wav.scp" + logging.info("creating wav.scp file %s", wav_file) + with open(wav_file, "w") as f: + for file_id in file_ids: + wav_file = corpus_dir / "Track12_test_data" / file_id + f.write("%s %s\n" % (file_id, wav_file)) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares VoxSRC22 Track1/2 test data") + + parser.add_argument("--corpus-dir", required=True, help="Path to voxsrc22 dataset") + + parser.add_argument("--output-dir", required=True, help="Ouput data path 
prefix") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + prepare_voxsrc22_test(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1.1/local/score_dcf.py b/egs/voxceleb/v1.1/local/score_dcf.py new file mode 100755 index 00000000..3524d222 --- /dev/null +++ b/egs/voxceleb/v1.1/local/score_dcf.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import argparse +import time +import logging + +import numpy as np + +from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.utils import SparseTrialScores, SparseTrialKey +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval + + +def score_dcf(key_file, score_file, output_path): + + logging.info("Load key: %s" % key_file) + key = SparseTrialKey.load_txt(key_file) + logging.info("Load scores: %s" % score_file) + scr = SparseTrialScores.load_txt(score_file) + logging.info("separating tar/non") + tar, non = scr.get_tar_non(key) + logging.info("computing EER/DCF") + priors = np.array([0.001, 0.005, 0.01, 0.05]) + min_dcf, act_dcf, eer, _, min_pmiss, min_pfa, act_pmiss, act_pfa = fast_eval( + tar, non, priors, return_probs=True + ) + + output_dir = os.path.dirname(output_path) + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + + ntar = len(tar) + nnon = len(non) + + output_file = output_path + "_results" + with open(output_file, "w") as f: + s = "EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f} ntar: {9:d} nnon: {10:d}\n".format( + eer * 100, + min_dcf[3], + act_dcf[3], + min_dcf[2], + act_dcf[2], + min_dcf[1], + act_dcf[1], + min_dcf[0], + act_dcf[0], + ntar, + nnon, + ) + f.write(s) + logging.info(s) + s = "min-pmiss={} min-pfa={} act-pmiss={} act-pfa={}".format( + min_pmiss, min_pfa, act_pmiss, act_pfa + ) + logging.info(s) + s = "min-Nmiss={} min-Nfa={} act-Nmiss={} act-Nfa={}".format( + min_pmiss * ntar, min_pfa * nnon, act_pmiss * ntar, act_pfa * nnon + ) + logging.info(s) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Computes EER and DCF", + ) + + parser.add_argument("--key-file", required=True) + parser.add_argument("--score-file", required=True) + parser.add_argument("--output-path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + score_dcf(**vars(args)) diff --git a/egs/voxceleb/v1.1/local/score_voxceleb1.sh b/egs/voxceleb/v1.1/local/score_voxceleb1.sh new file mode 100755 index 00000000..f12b18eb --- /dev/null +++ b/egs/voxceleb/v1.1/local/score_voxceleb1.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Copyright 2020 Johns Hopkins University (Jesus Villalba) +# Apache 2.0. 
+# +if [ $# -ne 2 ] && [ $# -ne 3 ]; then + echo "Usage: $0 <data-dir> <score-dir> [suffix]" + exit 1; +fi + +set -e + +data_dir=$1 +score_dir=$2 +suffix=$3 + +for cond in o o_clean e e_clean h h_clean +do + echo "Voxceleb1 $cond" + key=$data_dir/trials_$cond + #Compute performance + python local/score_dcf.py --key-file $key --score-file $score_dir/voxceleb1_scores$suffix --output-path $score_dir/voxceleb1${suffix}_${cond} & +done +wait + diff --git a/egs/voxceleb/v1.1/local/score_voxceleb1_o_clean.sh b/egs/voxceleb/v1.1/local/score_voxceleb1_o_clean.sh new file mode 100755 index 00000000..b8247efc --- /dev/null +++ b/egs/voxceleb/v1.1/local/score_voxceleb1_o_clean.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Copyright 2020 Johns Hopkins University (Jesus Villalba) +# Apache 2.0. +# +if [ $# -ne 2 ]; then + echo "Usage: $0 <data-dir> <score-dir>" + exit 1; +fi + +set -e + +data_dir=$1 +score_dir=$2 + +for cond in o_clean +do + echo "Voxceleb $cond" + key=$data_dir/trials_$cond + #Compute performance + python local/score_dcf.py --key-file $key --score-file $score_dir/voxceleb1_scores --output-path $score_dir/voxceleb1_${cond} & +done +wait + diff --git a/egs/voxceleb/v1.1/local/score_voxceleb1_single_cond.sh b/egs/voxceleb/v1.1/local/score_voxceleb1_single_cond.sh new file mode 100755 index 00000000..7531037e --- /dev/null +++ b/egs/voxceleb/v1.1/local/score_voxceleb1_single_cond.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright 2020 Johns Hopkins University (Jesus Villalba) +# Apache 2.0. +# +if [ $# -ne 3 ]; then + echo "Usage: $0 <data-dir> <condition> <score-dir>" + exit 1; +fi + +set -e + +data_dir=$1 +cond=$2 +score_dir=$3 + +echo "Voxceleb $cond" +key=$data_dir/trials_$cond +#Compute performance +python local/score_dcf.py --key-file $key --score-file $score_dir/voxceleb1_scores --output-path $score_dir/voxceleb1_${cond} + + diff --git a/egs/voxceleb/v1.1/local/score_voxsrc22_dev.sh b/egs/voxceleb/v1.1/local/score_voxsrc22_dev.sh new file mode 100755 index 00000000..f4649fb7 --- /dev/null +++ b/egs/voxceleb/v1.1/local/score_voxsrc22_dev.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright 2020 Johns Hopkins University (Jesus Villalba) +# Apache 2.0. +# +if [ $# -ne 2 ] && [ $# -ne 3 ]; then + echo "Usage: $0 <data-dir> <score-dir> [suffix]" + exit 1; +fi + +set -e + +data_dir=$1 +score_dir=$2 +suffix=$3 + +echo "Score voxsrc22 dev" +key=$data_dir/trials +#Compute performance +python local/score_dcf.py --key-file $key --score-file $score_dir/voxsrc22_dev_scores$suffix --output-path $score_dir/voxsrc22_dev$suffix + + diff --git a/egs/voxceleb/v1.1/run_001_prepare_data.sh b/egs/voxceleb/v1.1/run_001_prepare_data.sh index 7bf15448..44385610 100755 --- a/egs/voxceleb/v1.1/run_001_prepare_data.sh +++ b/egs/voxceleb/v1.1/run_001_prepare_data.sh @@ -12,7 +12,7 @@ config_file=default_config.sh . parse_options.sh || exit 1; . datapath.sh - +. $config_file if [ $stage -le 1 ];then # Prepare the VoxCeleb2 dataset for training. 
@@ -26,3 +26,21 @@ if [ $stage -le 2 ];then # Use this for the newer version of voxceleb1: local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data fi + +if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then + local/prepare_voxsrc22_dev.py \ + --vox1-corpus-dir $voxceleb1_root \ + --voxsrc22-corpus-dir $voxsrc22_root \ + --output-dir data/voxsrc22_dev +fi + +# if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then +# local/prepare_voxsrc22_test.py \ +# --corpus-dir $voxsrc22_root \ +# --output-dir data/voxsrc22_test +# fi + +if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then + # # split vox2 into 2 parts, for cohort and qmf training + local/make_vox2_trials.py --data-dir data/voxceleb2cat_train +fi diff --git a/egs/voxceleb/v1.1/run_002_compute_evad.sh b/egs/voxceleb/v1.1/run_002_compute_evad.sh index eeae00ac..27260be3 100755 --- a/egs/voxceleb/v1.1/run_002_compute_evad.sh +++ b/egs/voxceleb/v1.1/run_002_compute_evad.sh @@ -19,39 +19,41 @@ config_file=default_config.sh if [ $stage -le 1 ]; then - # Prepare to distribute data over multiple machines - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then - dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage - if [ "$nodes" == "b0" ];then - utils/create_split_dir.pl \ - utils/create_split_dir.pl \ - /export/b{04,05,06,07}/$dir_name $vaddir/storage - elif [ "$nodes" == "b1" ];then - utils/create_split_dir.pl \ - /export/b{14,15,16,17}/$dir_name $vaddir/storage - elif [ "$nodes" == "c0" ];then - utils/create_split_dir.pl \ - /export/c{06,07,08,09}/$dir_name $vaddir/storage - elif [ "$nodes" == "fs01" ];then - utils/create_split_dir.pl \ - /export/fs01/$dir_name $vaddir/storage - else - echo "we don't distribute data between multiple machines" - fi + # Prepare to distribute data over multiple machines + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then + dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage + if [ "$nodes" == "b0" ];then + utils/create_split_dir.pl \ + /export/b{04,05,06,07}/$dir_name $vaddir/storage + elif [ "$nodes" == "b1" ];then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$dir_name $vaddir/storage + elif [ "$nodes" == "c0" ];then + utils/create_split_dir.pl \ + /export/c{06,07,08,09}/$dir_name $vaddir/storage + elif [ "$nodes" == "fs01" ];then + utils/create_split_dir.pl \ + /export/fs01/$dir_name $vaddir/storage + else + echo "we don't distribute data between multiple machines" fi + fi fi -#Train datasets -if [ $stage -le 2 ];then - for name in voxceleb2cat_train voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 40 ? $num_spk:40)) - hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ - --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ - data/${name} exp/make_vad/$name $vaddir - utils/fix_data_dir.sh data/${name} - done +if [ $stage -le 2 ];then + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 40 ? 
$num_spk:40)) + hyp_utils/feats/make_evad.sh \ + --write-utt2num-frames true \ + --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ + data/${name} exp/make_vad/$name $vaddir + utils/fix_data_dir.sh data/${name} + done fi diff --git a/egs/voxceleb/v1.1/run_011_train_xvector.sh b/egs/voxceleb/v1.1/run_011_train_xvector.sh index 0b0e4d50..c8ab552e 100755 --- a/egs/voxceleb/v1.1/run_011_train_xvector.sh +++ b/egs/voxceleb/v1.1/run_011_train_xvector.sh @@ -10,9 +10,8 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 +num_workers="" use_tb=false use_wandb=false @@ -20,20 +19,17 @@ use_wandb=false . $config_file . datapath.sh -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/${nnet_data}_proc_audio_no_sil -args="" -if [ "$resume" == "true" ];then - args="--resume" +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$use_tb" == "true" ];then - args="$args --use-tensorboard" + extra_args="$extra_args --trainer.use-tensorboard" fi if [ "$use_wandb" == "true" ];then - args="$args --use-wandb --wandb.project voxceleb-v1.1 --wandb.name $nnet_name.$(date -Iminutes)" + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" fi if [ "$interactive" == "true" ];then @@ -42,48 +38,44 @@ fi # Network Training if [ $stage -le 1 ]; then - - if [[ ${nnet_type} =~ resnet1d ]]; then - train_exec=torch-train-resnet1d-xvec-from-wav.py - elif [[ ${nnet_type} =~ resnet ]] || [[ ${nnet_type} =~ resnext ]] || [[ ${nnet_type} =~ res2net ]] || [[ ${nnet_type} =~ res2next ]]; then - train_exec=torch-train-resnet-xvec-from-wav.py - elif [[ ${nnet_type} =~ efficientnet ]]; then - train_exec=torch-train-efficientnet-xvec-from-wav.py - elif [[ ${nnet_type} =~ tdnn ]]; then - train_exec=torch-train-tdnn-xvec-from-wav.py - elif [[ ${nnet_type} =~ transformer ]]; then - train_exec=torch-train-transformer-xvec-v1-from-wav.py - elif [[ ${nnet_type} =~ spinenet ]] || [[ ${nnet_type} =~ spine2net ]] || [[ ${nnet_type} =~ r0_sp53 ]]; then - train_exec=torch-train-spinenet-xvec-from-wav.py - else - echo "$nnet_type not supported" - exit 1 - fi - mkdir -p $nnet_dir/log + mkdir -p $nnet_s1_dir/log $cuda_cmd \ - --gpu $ngpu $nnet_dir/log/train.log \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - $train_exec --feats $feat_config $aug_opt \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --min-chunk-length $min_chunk --max-chunk-length $max_chunk \ - --iters-per-epoch $ipe \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --embed-dim $embed_dim $nnet_opt $opt_opt $lrs_opt \ - --epochs $nnet_num_epochs \ - --cos-scale $s --margin $margin --margin-warmup-epochs $margin_warmup \ - --dropout-rate $dropout \ + train_xvector_from_wav.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + 
--data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_s1_dir \ --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $nnet_dir $args fi -exit +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir \ + --num-gpus $ngpu \ + +fi diff --git a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh index 3abf2ff6..f933a7b2 100755 --- a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh +++ b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh @@ -8,6 +8,7 @@ set -e stage=1 +nnet_stage=2 config_file=default_config.sh use_gpu=false xvec_chunk_length=12800 @@ -21,41 +22,67 @@ else xvec_cmd="$train_cmd --mem 12G" fi +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + xvector_dir=exp/xvectors/$nnet_name -if [ $stage -le 1 ]; then - # Extract xvectors for training LDA/PLDA - for name in voxceleb2cat_train - do - if [ $plda_num_augs -eq 0 ]; then - steps_xvec/extract_xvectors_from_wav.sh --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ - --random-utt-length true --min-utt-length 400 --max-utt-length 14000 \ - --feat-config $feat_config \ - $nnet data/${name} \ - $xvector_dir/${name} - else - steps_xvec/extract_xvectors_from_wav.sh --cmd "$xvec_cmd" --nj 300 ${xvec_args} \ - --random-utt-length true --min-utt-length 400 --max-utt-length 14000 \ - --feat-config $feat_config --aug-config $plda_aug_config --num-augs $plda_num_augs \ - $nnet data/${name} \ - $xvector_dir/${name}_augx${plda_num_augs} \ - data/${name}_augx${plda_num_augs} - fi - done +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then + # Extract xvectors for training LDA/PLDA + for name in voxceleb2cat_train + do + if [ $plda_num_augs -eq 0 ]; then + steps_xvec/extract_xvectors_from_wav.sh \ + --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ + --random-utt-length true --min-utt-length 200 --max-utt-length 3000 \ + --feat-config $feat_config \ + $nnet data/${name} \ + $xvector_dir/${name} + else + steps_xvec/extract_xvectors_from_wav.sh \ + --cmd "$xvec_cmd" --nj 300 ${xvec_args} \ + 
--random-utt-length true --min-utt-length 200 --max-utt-length 3000 \ + --feat-config $feat_config --aug-config $plda_aug_config --num-augs $plda_num_augs \ + $nnet data/${name} \ + $xvector_dir/${name}_augx${plda_num_augs} \ + data/${name}_augx${plda_num_augs} + fi + done fi if [ $stage -le 2 ]; then - # Extracts x-vectors for evaluation - for name in voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 100 ? $num_spk:100)) - steps_xvec/extract_xvectors_from_wav.sh --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ - --feat-config $feat_config \ - $nnet data/$name \ - $xvector_dir/$name - done + # Extracts x-vectors for evaluation + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 100 ? $num_spk:100)) + steps_xvec/extract_xvectors_from_wav.sh \ + --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ + --feat-config $feat_config \ + $nnet data/$name \ + $xvector_dir/$name + done fi -exit + diff --git a/egs/voxceleb/v1.1/run_040_eval_be.sh b/egs/voxceleb/v1.1/run_040_eval_be.sh index cd168180..6bdbdf92 100755 --- a/egs/voxceleb/v1.1/run_040_eval_be.sh +++ b/egs/voxceleb/v1.1/run_040_eval_be.sh @@ -8,12 +8,34 @@ set -e stage=1 +nnet_stage=2 config_file=default_config.sh + . parse_options.sh || exit 1; . $config_file . datapath.sh +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + plda_label=${plda_type}y${plda_y_dim}_v1 be_name=lda${lda_dim}_${plda_label}_${plda_data} @@ -22,39 +44,92 @@ be_dir=exp/be/$nnet_name/$be_name score_dir=exp/scores/$nnet_name/${be_name} score_plda_dir=$score_dir/plda score_cosine_dir=exp/scores/$nnet_name/cosine +score_cosine_snorm_dir=exp/scores/$nnet_name/cosine_snorm +score_cosine_qmf_dir=exp/scores/$nnet_name/cosine_qmf -if [ $stage -le 1 ]; then +if [ "$do_plda" == "true" ];then + if [ $stage -le 1 ]; then echo "Train PLDA on Voxceleb2" - steps_be/train_be_v1.sh --cmd "$train_cmd" \ - --lda_dim $lda_dim \ - --plda_type $plda_type \ - --y_dim $plda_y_dim --z_dim $plda_z_dim \ - $xvector_dir/$plda_data/xvector.scp \ - data/$plda_data \ - $be_dir & + steps_be/train_be_v1.sh \ + --cmd "$train_cmd" \ + --lda_dim $lda_dim \ + --plda_type $plda_type \ + --y_dim $plda_y_dim --z_dim $plda_z_dim \ + $xvector_dir/$plda_data/xvector.scp \ + data/$plda_data \ + $be_dir + + fi + + + if [ $stage -le 2 ];then + echo "Eval Voxceleb 1 with LDA+CentWhiten+LNorm+PLDA" + steps_be/eval_be_v1.sh \ + --cmd "$train_cmd" --plda_type $plda_type \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $be_dir/lda_lnorm.h5 \ + $be_dir/plda.h5 \ + $score_plda_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + fi +fi - wait + +if [ $stage -le 3 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos.sh \ + --cmd "$train_cmd" \ + 
data/voxceleb1_test/trials \
+    data/voxceleb1_test/utt2model \
+    $xvector_dir/voxceleb1_test/xvector.scp \
+    $score_cosine_dir/voxceleb1_scores
+
+  $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \
+    local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_dir
+
+  for f in $(ls $score_cosine_dir/*_results);
+  do
+    echo $f
+    cat $f
+    echo ""
+  done
 fi
 
+if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then
-if [ $stage -le 2 ];then
+  echo "Eval voxsrc22 dev with Cosine scoring"
+  steps_be/eval_be_cos.sh --cmd "$train_cmd" \
+    data/voxsrc22_dev/trials \
+    data/voxsrc22_dev/utt2model \
+    $xvector_dir/voxsrc22_dev/xvector.scp \
+    $score_cosine_dir/voxsrc22_dev_scores &
-  echo "Eval Voxceleb 1 with LDA+CentWhiten+LNorm+PLDA"
-  steps_be/eval_be_v1.sh --cmd "$train_cmd" --plda_type $plda_type \
-    data/voxceleb1_test/trials \
-    data/voxceleb1_test/utt2model \
-    $xvector_dir/voxceleb1_test/xvector.scp \
-    $be_dir/lda_lnorm.h5 \
-    $be_dir/plda.h5 \
-    $score_plda_dir/voxceleb1_scores
+  # steps_be/eval_be_cos.sh --cmd "$train_cmd" \
+  #   data/voxsrc22_test/trials \
+  #   data/voxsrc22_test/utt2model \
+  #   $xvector_dir/voxsrc22_test/xvector.scp \
+  #   $score_cosine_dir/voxsrc22_test_scores
-  $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \
-    local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir
+  wait
+  $train_cmd --mem 10G --num-threads 1 $score_cosine_dir/log/score_voxsrc22_dev.log \
+    local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_dir
-  for f in $(ls $score_plda_dir/*_results);
+  for f in $(ls $score_cosine_dir/voxsrc22_dev_results);
   do
     echo $f
     cat $f
@@ -64,62 +139,197 @@ if [ $stage -le 2 ];then
 fi
 
-score_plda_dir=$score_cosine_dir
-
-if [ $stage -le 3 ];then
-
-  echo "Eval Voxceleb 1 with Cosine scoring"
-  steps_be/eval_be_cos.sh --cmd "$train_cmd" \
-    data/voxceleb1_test/trials \
-    data/voxceleb1_test/utt2model \
-    $xvector_dir/voxceleb1_test/xvector.scp \
-    $score_plda_dir/voxceleb1_scores
+if [ "$do_snorm" == "true" ];then
+  if [ $stage -le 5 ];then
+    echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm"
+    steps_be/eval_be_cos_snorm.sh \
+      --cmd "$train_cmd --mem 22G" --coh-nbest 1000 \
+      data/voxceleb1_test/trials \
+      data/voxceleb1_test/utt2model \
+      $xvector_dir/voxceleb1_test/xvector.scp \
+      data/voxceleb2cat_train/utt2spk \
+      $xvector_dir/voxceleb2cat_train/xvector.scp \
+      $score_cosine_snorm_dir/voxceleb1_scores
+
+    $train_cmd --mem 10G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \
+      local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_snorm_dir
+
+    for f in $(ls $score_cosine_snorm_dir/*_results);
+    do
+      echo $f
+      cat $f
+      echo ""
+    done
+  fi
+
+  if [ $stage -le 6 ];then
+    echo "Eval voxsrc22 dev with Cosine scoring + Adaptive SNorm"
+    steps_be/eval_be_cos_snorm.sh \
+      --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \
+      data/voxsrc22_dev/trials \
+      data/voxsrc22_dev/utt2model \
+      $xvector_dir/voxsrc22_dev/xvector.scp \
+      data/voxceleb2cat_train/utt2spk \
+      $xvector_dir/voxceleb2cat_train/xvector.scp \
+      $score_cosine_snorm_dir/voxsrc22_dev_scores &
+
+    # steps_be/eval_be_cos_snorm.sh --cmd "$train_cmd" \
+    #   data/voxsrc22_test/trials \
+    #   data/voxsrc22_test/utt2model \
+    #   $xvector_dir/voxsrc22_test/xvector.scp \
+    #   data/voxceleb2cat_train/utt2spk \
+    #   $xvector_dir/voxceleb2cat_train/xvector.scp \
+    #   $score_cosine_snorm_dir/voxsrc22_test_scores
-  $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \
-    local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir
+    wait
+    
$train_cmd --mem 10G --num-threads 1 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \
+      local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_snorm_dir
-  for f in $(ls $score_plda_dir/*_results);
+    for f in $(ls $score_cosine_snorm_dir/voxsrc22_dev_results);
     do
       echo $f
       cat $f
       echo ""
     done
-
+  fi
 fi
-be_dir=exp/be/$nnet_name/cw
-score_plda_dir=$score_dir/cw_cosine
-if [ $stage -le 4 ]; then
-  echo "Train centering+whitening on Voxceleb2"
-  steps_be/train_be_v2.sh --cmd "$train_cmd" \
-    $xvector_dir/$plda_data/xvector.scp \
-    data/$plda_data \
-    $be_dir
-fi
+if [ "$do_qmf" == "true" ];then
+  if [ $stage -le 7 ];then
+    echo "Train QMF in Vox2"
+    steps_be/train_be_cos_qmf.sh \
+      --cmd "$train_cmd" --coh-nbest 1000 \
+      data/voxceleb2cat_train/trials \
+      data/voxceleb2cat_train/utt2model \
+      $xvector_dir/voxceleb2cat_train/xvector.scp \
+      $xvector_dir/voxceleb2cat_train/utt2num_frames \
+      data/voxceleb2cat_train/snorm_utt2spk \
+      $xvector_dir/voxceleb2cat_train/xvector.scp \
+      $score_cosine_qmf_dir/voxceleb2_qmf_scores
+  fi
-if [ $stage -le 5 ];then
+  if [ $stage -le 8 ];then
-  echo "Eval Voxceleb 1 with CentWhiten + Cosine scoring"
-  steps_be/eval_be_v2.sh --cmd "$train_cmd" \
-    data/voxceleb1_test/trials \
-    data/voxceleb1_test/utt2model \
-    $xvector_dir/voxceleb1_test/xvector.scp \
-    $be_dir/cw.h5 \
-    $score_plda_dir/voxceleb1_scores
+    echo "Eval Voxceleb 1 with Cosine + QMF scoring"
+    steps_be/eval_be_cos_qmf.sh \
+      --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \
+      data/voxceleb1_test/trials \
+      data/voxceleb1_test/utt2model \
+      $xvector_dir/voxceleb1_test/xvector.scp \
+      $xvector_dir/voxceleb1_test/utt2num_frames \
+      data/voxceleb2cat_train/utt2spk \
+      $xvector_dir/voxceleb2cat_train/xvector.scp \
+      $score_cosine_qmf_dir/qmf.h5 \
+      $score_cosine_qmf_dir/voxceleb1_scores
+
+    $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1.log \
+      local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir
+    $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_snorm.log \
+      local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _snorm
+    $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_qmf.log \
+      local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _qmf
+
+    for f in $(ls $score_cosine_qmf_dir/voxceleb1{,_snorm,_qmf}_[oeh]_clean_results);
+    do
+      echo $f
+      cat $f
+      echo ""
+    done
-  $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \
-    local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir
+  fi
+
+  if [ $stage -le 9 ];then
+    echo "Eval voxsrc22 dev with Cosine + QMF scoring"
+    steps_be/eval_be_cos_qmf.sh \
+      --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \
+      data/voxsrc22_dev/trials \
+      data/voxsrc22_dev/utt2model \
+      $xvector_dir/voxsrc22_dev/xvector.scp \
+      $xvector_dir/voxsrc22_dev/utt2num_frames \
+      data/voxceleb2cat_train/utt2spk \
+      $xvector_dir/voxceleb2cat_train/xvector.scp \
+      $score_cosine_qmf_dir/qmf.h5 \
+      $score_cosine_qmf_dir/voxsrc22_dev_scores &
+
+    # steps_be/eval_be_cos_qmf.sh --cmd "$train_cmd" \
+    #   data/voxsrc22_test/trials \
+    #   data/voxsrc22_test/utt2model \
+    #   $xvector_dir/voxsrc22_test/xvector.scp \
+    #   $xvector_dir/voxsrc22_test/utt2num_frames \
+    #   data/voxceleb2cat_train/utt2spk \
+    #   $xvector_dir/voxceleb2cat_train/xvector.scp \
+    #   $score_cosine_qmf_dir/qmf.h5 \
+    #   $score_cosine_qmf_dir/voxsrc22_test_scores
-  for f in $(ls $score_plda_dir/*_results);
+    wait
+    $train_cmd --mem 10G --num-threads 1 
$score_cosine_qmf_dir/log/score_voxsrc22_dev.log \
+      local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir
+    $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_snorm.log \
+      local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _snorm
+    $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_qmf.log \
+      local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _qmf
+
+    for f in $(ls $score_cosine_qmf_dir/voxsrc22_dev{,_snorm,_qmf}_results);
    do
      echo $f
      cat $f
      echo ""
    done
+  fi
+
+fi
+if [ "$do_pca" != "true" ];then
+  exit 0
 fi
-exit
+be_name=pca_r${pca_var_r}
+
+xvector_dir=exp/xvectors/$nnet_name
+be_dir=exp/be/$nnet_name/$be_name
+score_dir=exp/scores/$nnet_name/${be_name}
+score_cosine_dir=exp/scores/$nnet_name/$be_name/cosine
+score_cosine_snorm_dir=exp/scores/$nnet_name/$be_name/cosine_snorm
+score_cosine_qmf_dir=exp/scores/$nnet_name/$be_name/cosine_qmf
+
+if [ $stage -le 10 ]; then
+  echo "Train projection on Voxceleb2"
+  $train_cmd $be_dir/log/train_be.log \
+    hyp_utils/conda_env.sh \
+    steps_be/train_be_proj_v1.py \
+    --v-file scp:$xvector_dir/$plda_data/xvector.scp \
+    --train-list data/$plda_data/utt2spk \
+    --output-path $be_dir \
+    --pca.pca-var-r $pca_var_r
+
+fi
+
+
+if [ $stage -le 11 ];then
+
+  echo "Eval Voxceleb 1 with PCA + Cosine scoring"
+  steps_be/eval_be_cos.sh \
+    --cmd "$train_cmd" \
+    --preproc-file $be_dir/preproc.h5 \
+    data/voxceleb1_test/trials \
+    data/voxceleb1_test/utt2model \
+    $xvector_dir/voxceleb1_test/xvector.scp \
+    $score_cosine_dir/voxceleb1_scores
+
+  $train_cmd --mem 10G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \
+    local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_dir
+
+  for f in $(ls $score_cosine_dir/*_results);
+  do
+    echo $f
+    cat $f
+    echo ""
+  done
+
+fi
diff --git a/egs/voxceleb/v1.2/README.md b/egs/voxceleb/v1.2/README.md
new file mode 100644
index 00000000..f6fec0a6
--- /dev/null
+++ b/egs/voxceleb/v1.2/README.md
@@ -0,0 +1,314 @@
+# VoxCeleb V1.2
+
+Recipe for the VoxCeleb Speaker Verification Task
+
+## Differences w.r.t VoxCeleb V1 recipe
+
+In recipe version V1:
+ - We compute speech augmentations and acoustic features offline and dump them to disk.
+ - Augmentation is performed using Kaldi scripts and the wav-reverberate tool.
+ - Babble noise is created on-the-fly when computing features by mixing 3-7 single speaker files.
+
+In V1.1:
+ - Speech augmentations and acoustic features are always computed on-the-fly; we don't dump any features to disk.
+ - Augmentation is performed using the Hyperion SpeechAugment class.
+ - The behavior of this class is controlled
+   by the configuration file `conf/reverb_noise_aug.yaml`,
+   which mimics the proportions of noise and RIR types and the SNRs used in V1 of the recipe.
+ - Babble noise is created offline by mixing 3-10 single speaker files.
+
+In V1.2:
+ - The feature extractor is embedded into the PyTorch model in classes derived from the Wav2XVector base class.
+ - The Kaldi format is replaced by a new format based on pandas tables.
+ - Kaldi-style bash scripts are removed and replaced by Python scripts.
+ - Most Python scripts are called using Hyperion entry points.
+
+## Citing
+
+## Training Data
+
+ - x-Vector network is trained on Voxceleb2 dev + test with augmentations
+    - MUSAN noise
+    - RIR reverberation
+
+## Test data
+
+ - Test data is VoxCeleb 1
+ - We evaluate the 3 conditions (with cleaned lists):
+    - VoxCeleb-O (Original): Original VoxCeleb test set with 40 speakers
+    - VoxCeleb-E (Entire): List using all utterances of VoxCeleb1
+    - VoxCeleb-H (Hard): List of hard trials between all utterances of VoxCeleb1; same gender and nationality trials.
+
+
+## Usage
+
+ - Run the run_0*.sh scripts in sequence
+ - By default, they use the Light ResNet (16 base channels)
+ - For better performance, use the full ResNet (64 base channels) via the `config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` file, as in
+```bash
+run_005_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh
+run_006_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true
+run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh
+```
+
+ - To train with mixed precision, use the config file `config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh`
+
+## Recipe Steps:
+
+ - `run_001_prepare_data.sh`
+   - Data preparation script to generate Kaldi style data directories for
+     - VoxCeleb2 train+test
+     - VoxCeleb1 O/E/H eval sets
+
+ - `run_002_compute_evad.sh`
+   - Computes Energy VAD for all datasets
+
+ - `run_003_prepare_noises_rirs.sh`
+   - Prepares MUSAN noises and music to be used by the SpeechAugment class.
+   - Creates babble noise from MUSAN speech to be used by the SpeechAugment class.
+   - Prepares RIRs by compacting them into HDF5 files, to be used by the SpeechAugment class.
+
+ - `run_004_prepare_xvec_train_data.sh`
+   - Transforms all the audio files that we are going to use to train the x-vector into a common format, e.g., .flac.
+   - Removes silence from the audio files.
+   - Removes utterances shorter than 4 seconds and speakers with fewer than 8 utterances.
+   - Creates training and validation lists for x-vector training
+
+ - `run_005_train_xvector.sh`
+   - Trains the x-vector network
+
+ - `run_006_extract_xvectors.sh`
+   - Extracts x-vectors for VoxCeleb2, or VoxCeleb2+augmentation, for PLDA training
+   - Extracts x-vectors for the VoxCeleb1 test sets
+
+ - `run_007_eval_be.sh`
+   - Trains PLDA and evaluates the PLDA and cosine-scoring back-ends
+
+## Results
+
+### VoxCeleb 1 Original-Clean trial list
+
+| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) |
+| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: |
+| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.11 | 0.069 | 0.126 |
+| | | | Cosine + AS-Norm | 1.10 | 0.065 | 0.108 |
+| | | | Cosine + QMF | 0.95 | 0.059 | 0.084 |
+| config_fbank80_stmn_ecapatdnn512x3.v3.1.sh | ECAPA-TDNN 512x3 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.02 | 0.067 | 0.109 |
+| | | | Cosine + AS-Norm | 0.98 | 0.062 | 0.092 |
+| | | | Cosine + QMF | 0.85 | 0.061 | 0.091 |
+| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.68 | 0.052 | 0.088 |
+| | | | Cosine + AS-Norm | 0.63 | 0.049 | 0.083 |
+| | | | Cosine + QMF | 0.57 | 0.037 | 0.071 |
+| config_fbank80_stmn_ecapatdnn2048x4.v3.1.sh | ECAPA-TDNN 2048x4 | Stage2: Subcenter ArcFace m=0.3/intertop_m=0.1/centers=2 Dropout=0.25 | Cosine | 0.62 | 0.049 | 0.076 |
+| | | | Cosine + AS-Norm | 0.61 | 0.044 | 0.075 |
+| | | | Cosine + QMF | 0.53 | 0.037 | 0.076 |
+| config_fbank80_stmn_lresnet34.v3.1.sh | Thin-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.59 | 0.100 | 0.172 |
+| | | | Cosine + AS-Norm | 1.54 | 0.927 | 0.140 |
+| | | | Cosine + QMF | 1.32 | 0.083 | 0.121 |
+| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.048 | 0.071 |
+| | | | Cosine + AS-Norm | 0.70 | 0.039 | 0.048 |
+| | | | Cosine + QMF | 0.62 | 0.034 | 0.042 |
+| config_fbank80_stmn_resnet34.v3.1.sh | ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.68 | 0.039 | 0.048 |
+| | | | Cosine + AS-Norm | 0.60 | 0.036 | 0.052 |
+| | | | Cosine + QMF | 0.53 | 0.033 | 0.050 |
+| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.76 | 0.048 | 0.071 |
+| | | | Cosine + AS-Norm | 0.70 | 0.041 | 0.061 |
+| | | | Cosine + QMF | 0.62 | 0.037 | 0.056 |
+| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.048 | 0.077 |
+| | | | Cosine + AS-Norm | 0.68 | 0.040 | 0.062 |
+| | | | Cosine + QMF | 0.62 | 0.036 | 0.063 |
+| config_fbank80_stmn_fwseresnet34.v3.1.sh | FwSE-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.66 | 0.046 | 0.060 |
+| | | | Cosine + AS-Norm | 0.61 | 0.040 | 0.052 |
+| | | | Cosine + QMF | 0.57 | 0.037 | 0.058 |
+| config_fbank80_stmn_fwseresnet34pe.v3.1.sh | FwSE-ResNet34-FPE | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.73 | 0.042 | 0.053 |
+| | | | Cosine + AS-Norm | 0.64 | 0.034 | 0.047 |
+| | | | Cosine + QMF | 0.60 | 0.033 | 0.044 |
+| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.053 | 0.082 |
+| | | | Cosine + AS-Norm | 0.70 | 0.043 | 0.076 |
+| | | | Cosine + QMF | 0.63 | 0.042 | 0.071 |
+| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.051 | 0.095 | +| | | | Cosine + AS-Norm | 0.72 | 0.046 | 0.070 | +| | | | Cosine + QMF | 0.67 | 0.039 | 0.074 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.040 | 0.065 | +| | | | Cosine + AS-Norm | 0.52 | 0.033 | 0.045 | +| | | | Cosine + QMF | 0.45 | 0.027 | 0.043 | +| config_fbank80_stmn_idrnd_resnet100.v3.1.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.50 | 0.035 | 0.038 | +| | | | Cosine + AS-Norm | 0.47 | 0.031 | 0.038 | +| | | | Cosine + QMF | 0.40 | 0.027 | 0.032 | +| config_fbank80_stmn_idrnd_resnet100.v3.2.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.49 | 0.032 | 0.038 | +| | | | Cosine + AS-Norm | 0.43 | 0.025 | 0.034 | +| | | | Cosine + QMF | 0.37 | 0.024 | 0.033 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.60 | 0.043 | 0.071 | +| | | | Cosine + AS-Norm | 0.53 | 0.034 | 0.063 | +| | | | Cosine + QMF | 0.49 | 0.033 | 0.054 | + +### VoxCeleb 1 Entire-Clean trial list + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.16 | 0.073 | 0.130 | +| | | | Cosine + AS-Norm | 1.13 | 0.068 | 0.118 | +| | | | Cosine + QMF | 1.06 | 0.064 | 0.112 | +| config_fbank80_stmn_ecapatdnn512x3.v3.1.sh | ECAPA-TDNN 512x3 | Stage2: SubCenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.06 | 0.066 | 0.116 | +| | | | Cosine + AS-Norm | 1.01 | 0.061 | 0.106 | +| | | | Cosine + QMF | 0.96 | 0.058 | 0.097 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.85 | 0.055 | 0.100 | +| | | | Cosine + AS-Norm | 0.80 | 0.050 | 0.087 | +| | | | Cosine + QMF | 0.76 | 0.047 | 0.083 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.1.sh | ECAPA-TDNN 2048x4 | Stage2: Subcenter ArcFace m=0.3/intertop_m=0.1/centers=2 Dropout=0.25 | Cosine | 0.83 | 0.052 | 0.096 | +| | | | Cosine + AS-Norm | 0.77 | 0.049 | 0.086 | +| | | | Cosine + QMF | 0.74 | 0.047 | 0.082 | +| config_fbank80_stmn_lresnet34.v3.1.sh | Thin-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.69 | 0.103 | 0.174 | +| | | | Cosine + AS-Norm | 1.62 | 0.096 | 0.156 | +| | | | Cosine + QMF | 1.51 | 0.091 | 0.152 | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.86 | 0.054 | 0.098 | +| | | | Cosine + AS-Norm | 0.81 | 0.049 | 0.087 | +| | | | Cosine + QMF | 0.77 | 0.046 | 0.082 | +| config_fbank80_stmn_resnet34.v3.1.sh | ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.80 | 0.049 | 0.094 | +| | | | Cosine + AS-Norm | 0.76 | 0.046 | 0.081 | +| | | | Cosine + QMF | 0.70 | 0.043 | 0.074 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.89 | 0.058 | 0.098 | +| | | | Cosine + AS-Norm | 0.84 | 0.053 | 0.087| +| | | | Cosine + QMF | 0.80 | 0.050 | 0.081 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace 
m=0.3/intertop_m=0.1 | Cosine | 0.83 | 0.053 | 0.098 |
+| | | | Cosine + AS-Norm | 0.78 | 0.047 | 0.085 |
+| | | | Cosine + QMF | 0.74 | 0.045 | 0.081 |
+| config_fbank80_stmn_fwseresnet34.v3.1.sh | FwSE-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.80 | 0.051 | 0.090 |
+| | | | Cosine + AS-Norm | 0.74 | 0.046 | 0.081 |
+| | | | Cosine + QMF | 0.70 | 0.044 | 0.076 |
+| config_fbank80_stmn_fwseresnet34pe.v3.1.sh | FwSE-ResNet34-FPE | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.80 | 0.052 | 0.094 |
+| | | | Cosine + AS-Norm | 0.76 | 0.047 | 0.081 |
+| | | | Cosine + QMF | 0.72 | 0.045 | 0.076 |
+| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.91 | 0.057 | 0.100 |
+| | | | Cosine + AS-Norm | 0.85 | 0.052 | 0.089 |
+| | | | Cosine + QMF | 0.81 | 0.049 | 0.085 |
+| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.94 | 0.059 | 0.105 |
+| | | | Cosine + AS-Norm | 0.88 | 0.053 | 0.093 |
+| | | | Cosine + QMF | 0.84 | 0.051 | 0.088 |
+| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.71 | 0.044 | 0.076 |
+| | | | Cosine + AS-Norm | 0.66 | 0.040 | 0.069 |
+| | | | Cosine + QMF | 0.63 | 0.037 | 0.067 |
+| config_fbank80_stmn_idrnd_resnet100.v3.1.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.69 | 0.043 | 0.074 |
+| | | | Cosine + AS-Norm | 0.65 | 0.039 | 0.068 |
+| | | | Cosine + QMF | 0.63 | 0.036 | 0.065 |
+| config_fbank80_stmn_idrnd_resnet100.v3.2.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.66 | 0.040 | 0.072 |
+| | | | Cosine + AS-Norm | 0.62 | 0.037 | 0.066 |
+| | | | Cosine + QMF | 0.59 | 0.035 | 0.064 |
+| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.75 | 0.047 | 0.077 |
+| | | | Cosine + AS-Norm | 0.70 | 0.042 | 0.072 |
+| | | | Cosine + QMF | 0.68 | 0.040 | 0.069 |
+
+
+### VoxCeleb 1 Hard-Clean trial list
+
+| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) |
+| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: |
+| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.10 | 0.128 | 0.209 |
+| | | | Cosine + AS-Norm | 1.99 | 0.118 | 0.190 |
+| | | | Cosine + QMF | 1.84 | 0.111 | 0.184 |
+| config_fbank80_stmn_ecapatdnn512x3.v3.1.sh | ECAPA-TDNN 512x3 | Stage2: SubCenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.93 | 0.120 | 0.198 |
+| | | | Cosine + AS-Norm | 1.84 | 0.113 | 0.184 |
+| | | | Cosine + QMF | 1.73 | 0.108 | 0.177 |
+| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 1.66 | 0.103 | 0.168 |
+| | | | Cosine + AS-Norm | 1.53 | 0.091 | 0.151 |
+| | | | Cosine + QMF | 1.44 | 0.087 | 0.145 |
+| config_fbank80_stmn_ecapatdnn2048x4.v3.1.sh | ECAPA-TDNN 2048x4 | Stage2: Subcenter ArcFace m=0.3/intertop_m=0.1/centers=2 Dropout=0.25 | Cosine | 1.65 | 0.101 | 0.169 |
+| | | | Cosine + AS-Norm | 1.53 | 0.090 | 0.149 |
+| | | | Cosine + QMF | 1.46 | 0.087 | 0.144 |
+| config_fbank80_stmn_lresnet34.v3.1.sh | Thin-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 2.84 | 0.167 | 0.267 |
+| | | | Cosine + AS-Norm | 2.58 | 0.150 | 0.252 |
+| | | | Cosine + QMF | 2.45 | 0.144 | 0.234 |
+| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.62 | 0.098 | 0.164 |
+| | | | Cosine + AS-Norm | 1.45 | 0.085 | 0.142 |
+| | | | Cosine + QMF | 1.36 | 0.082 | 0.137 |
+| config_fbank80_stmn_resnet34.v3.1.sh | ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.56 | 0.091 | 0.157 |
+| | | | Cosine + AS-Norm | 1.40 | 0.080 | 0.135 |
+| | | | Cosine + QMF | 1.33 | 0.076 | 0.128 |
+| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.70 | 0.100 | 0.165 |
+| | | | Cosine + AS-Norm | 1.50 | 0.086 | 0.138 |
+| | | | Cosine + QMF | 1.44 | 0.085 | 0.139 |
+| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.59 | 0.096 | 0.165 |
+| | | | Cosine + AS-Norm | 1.41 | 0.083 | 0.143 |
+| | | | Cosine + QMF | 1.34 | 0.079 | 0.136 |
+| config_fbank80_stmn_fwseresnet34.v3.1.sh | FwSE-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.58 | 0.096 | 0.162 |
+| | | | Cosine + AS-Norm | 1.43 | 0.083 | 0.140 |
+| | | | Cosine + QMF | 1.34 | 0.079 | 0.134 |
+| config_fbank80_stmn_fwseresnet34pe.v3.1.sh | FwSE-ResNet34-FPE | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.61 | 0.097 | 0.163 |
+| | | | Cosine + AS-Norm | 1.44 | 0.085 | 0.138 |
+| | | | Cosine + QMF | 1.37 | 0.080 | 0.132 |
+| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.75 | 0.104 | 0.171 |
+| | | | Cosine + AS-Norm | 1.56 | 0.091 | 0.152 |
+| | | | Cosine + QMF | 1.50 | 0.087 | 0.145 |
+| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.76 | 0.104 | 0.174 |
+| | | | Cosine + AS-Norm | 1.58 | 0.092 | 0.152 |
+| | | | Cosine + QMF | 1.51 | 0.089 | 0.149 |
+| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.30 | 0.076 | 0.125 |
+| | | | Cosine + AS-Norm | 1.15 | 0.066 | 0.109 |
+| | | | Cosine + QMF | 1.11 | 0.065 | 0.103 |
+| config_fbank80_stmn_idrnd_resnet100.v3.1.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.36 | 0.077 | 0.122 |
+| | | | Cosine + AS-Norm | 1.23 | 0.069 | 0.112 |
+| | | | Cosine + QMF | 1.17 | 0.065 | 0.110 |
+| config_fbank80_stmn_idrnd_resnet100.v3.2.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.27 | 0.072 | 0.121 | +| | | | Cosine + AS-Norm | 1.15 | 0.065 | 0.107 | +| | | | Cosine + QMF | 1.10 | 0.062 | 0.102 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.41 | 0.081 | 0.132 | +| | | | Cosine + AS-Norm | 1.28 | 0.071 | 0.116 | +| | | | Cosine + QMF | 1.21 | 0.069 | 0.113 | + + +### VoxSRC2022 dev + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.87 | 0.185 | 0.304 | +| | | | Cosine + AS-Norm | 2.84 | 0.182 | 0.304 | +| | | | Cosine + QMF | 2.61 | 0.172 | 0.283 | +| config_fbank80_stmn_ecapatdnn512x3.v3.1.sh | ECAPA-TDNN 512x3 | Stage2: SubCenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 2.60 | 0.174 | 0.287 | +| | | | Cosine + AS-Norm | 2.58 | 0.172 | 0.291 | +| | | | Cosine + QMF | 2.44 | 0.161 | 0.274 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 2.33 | 0.156 | 0.260 | +| | | | Cosine + AS-Norm | 2.19 | 0.144 | 0.263 | +| | | | Cosine + QMF | 2.06 | 0.137 | 0.251 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.1.sh | ECAPA-TDNN 2048x4 | Stage2: Subcenter ArcFace m=0.3/intertop_m=0.1/centers=2 Dropout=0.25 | Cosine | 2.34 | 0.152 | 0.275 | +| | | | Cosine + AS-Norm | 2.24 | 0.143 | 0.268 | +| | | | Cosine + QMF | 2.12 | 0.139 | 0.255 | +| config_fbank80_stmn_lresnet34.v3.1.sh | Thin-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 3.74 | 0.239 | 0.394 | +| | | | Cosine + AS-Norm | 3.45 | 0.225 | 0.377 | +| | | | Cosine + QMF | 3.27 | 0.213 | 0.356 | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.19 | 0.142 | 0.242 | +| | | | Cosine + AS-Norm | 2.00 | 0.133 | 0.254 | +| | | | Cosine + QMF | 1.86 | 0.126 | 0.229 | +| config_fbank80_stmn_resnet34.v3.1.sh | ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 2.15 | 0.135 | 0.233 | +| | | | Cosine + AS-Norm | 1.98 | 0.126 | 0.245 | +| | | | Cosine + QMF | 1.86 | 0.119 | 0.222 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.34 | 0.145 | 0.246 | +| | | | Cosine + AS-Norm | 2.10 | 0.135 | 0.248 | +| | | | Cosine + QMF | 2.01 | 0.127 | 0.218 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.239 | +| | | | Cosine + AS-Norm | 1.99 | 0.127 | 0.232 | +| | | | Cosine + QMF | 1.87 | 0.119 | 0.216 | +| config_fbank80_stmn_fwseresnet34.v3.1.sh | FwSE-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 2.14 | 0.134 | 0.228 | +| | | | Cosine + AS-Norm | 1.97 | 0.124 | 0.223 | +| | | | Cosine + QMF | 1.82 | 0.116 | 0.205 | +| config_fbank80_stmn_fwseresnet34pe.v3.1.sh | FwSE-ResNet34-FPE | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 2.27 | 0.138 | 0.238 | +| | | | Cosine + AS-Norm | 2.08 | 0.129 | 0.223 | +| | | | Cosine + QMF | 1.94 | 0.120 | 0.207 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.36 | 0.153 | 0.259 | +| | | | Cosine + AS-Norm | 2.18 | 0.139 | 0.249 | +| | | | Cosine + QMF | 
2.08 | 0.128 | 0.222 |
+| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.49 | 0.158 | 0.265 |
+| | | | Cosine + AS-Norm | 2.29 | 0.145 | 0.251 |
+| | | | Cosine + QMF | 2.17 | 0.133 | 0.230 |
+| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.92 | 0.124 | 0.208 |
+| | | | Cosine + AS-Norm | 1.71 | 0.109 | 0.212 |
+| | | | Cosine + QMF | 1.62 | 0.103 | 0.192 |
+| config_fbank80_stmn_idrnd_resnet100.v3.1.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 2.02 | 0.116 | 0.194 |
+| | | | Cosine + AS-Norm | 1.81 | 0.107 | 0.199 |
+| | | | Cosine + QMF | 1.72 | 0.099 | 0.186 |
+| config_fbank80_stmn_idrnd_resnet100.v3.2.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.91 | 0.111 | 0.192 |
+| | | | Cosine + AS-Norm | 1.75 | 0.105 | 0.194 |
+| | | | Cosine + QMF | 1.64 | 0.098 | 0.181 |
+| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.96 | 0.124 | 0.211 |
+| | | | Cosine + AS-Norm | 1.79 | 0.118 | 0.239 |
+| | | | Cosine + QMF | 1.68 | 0.114 | 0.216 |
diff --git a/egs/voxceleb/v1.2/cmd.sh b/egs/voxceleb/v1.2/cmd.sh
new file mode 100755
index 00000000..381b14e0
--- /dev/null
+++ b/egs/voxceleb/v1.2/cmd.sh
@@ -0,0 +1,28 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
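+#
+# Example (illustrative, not one of this recipe's defaults): with no queueing
+# system at all, the same variables can simply point at Kaldi's local runner:
+#   export train_cmd="run.pl"
+#   export cuda_cmd="run.pl"
+#   export cuda_eval_cmd="run.pl"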
+ +if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/voxceleb/v1.2/conf/clsp.conf b/egs/voxceleb/v1.2/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/voxceleb/v1.2/conf/coe_gpu_bigmem.conf b/egs/voxceleb/v1.2/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/voxceleb/v1.2/conf/coe_gpu_long.conf b/egs/voxceleb/v1.2/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/voxceleb/v1.2/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/voxceleb/v1.2/conf/coe_gpu_rtx.conf b/egs/voxceleb/v1.2/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/voxceleb/v1.2/conf/coe_gpu_short.conf b/egs/voxceleb/v1.2/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ 
b/egs/voxceleb/v1.2/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/voxceleb/v1.2/conf/coe_gpu_v100.conf b/egs/voxceleb/v1.2/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/voxceleb/v1.2/conf/fbank80_specaug1_stmn_16k.yaml b/egs/voxceleb/v1.2/conf/fbank80_specaug1_stmn_16k.yaml new file mode 100644 index 00000000..8df42fc6 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/fbank80_specaug1_stmn_16k.yaml @@ -0,0 +1,24 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 16000 + frame_length: 25 + low_freq: 20 + high_freq: 7600 + num_filters: 80 + snip_edges: false + use_energy: false +spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. + freq_mask_min_width: 0 + freq_mask_max_width: 8 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +mvn: + context: 150 + norm_var: false diff --git a/egs/voxceleb/v1.2/conf/fbank80_stmn_16k.yaml b/egs/voxceleb/v1.2/conf/fbank80_stmn_16k.yaml new file mode 100644 index 00000000..f4091f5d --- /dev/null +++ b/egs/voxceleb/v1.2/conf/fbank80_stmn_16k.yaml @@ -0,0 +1,12 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 16000 + frame_length: 25 + low_freq: 20 + high_freq: 7600 + num_filters: 80 + snip_edges: false + use_energy: false +mvn: + context: 150 + norm_var: false diff --git a/egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml b/egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..86f55073 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml @@ -0,0 +1,34 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: csv:data/rirs_smallroom/rirs.csv + rir_norm: max + mediumroom: + weight: 1 + rir_path: csv:data/rirs_mediumroom/rirs.csv + rir_norm: max + realroom: + weight: 1 + rir_path: csv:data/rirs_real/rirs.csv + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/recordings.csv + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/recordings.csv + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/recordings.csv + min_snr: 3 + max_snr: 18 diff --git a/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..132438bf --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,73 @@ 
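+# Stage-1 training config for the CwFwSE-ResNet34 x-vector (v3.0).
+# Reading guide: "data" configures the train/val segment-chunk samplers,
+# "model" selects the log-filterbank front-end and the x-vector network,
+# and "trainer" sets the Adam optimizer with an exponential-decay LR
+# schedule. The layout mirrors the other train_*_stage1 configs in this
+# directory.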
+data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: cfwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 32 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..34c0801e --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..f576e411 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + 
max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: seresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 32 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..34c0801e --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..23f03de7 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml @@ -0,0 +1,98 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + 
sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + dropout_rate: 0.2 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.2 + norm_before: false +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + #min_lr: 1.0e-05 + min_lr: 1.0e-06 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.1.yaml new file mode 100644 index 00000000..b7fab34b --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.1.yaml @@ -0,0 +1,101 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + num_augs: 4 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + dropout_rate: 0.2 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.2 + norm_before: false +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + #min_lr: 1.0e-05 + min_lr: 1.0e-06 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..79d510ae --- /dev/null +++ 
b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0.25 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.1.yaml new file mode 100644 index 00000000..e147dbb3 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.1.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0.25 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..03a7f736 --- /dev/null +++ 
b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml @@ -0,0 +1,96 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + dropout_rate: 0.002 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 40 + eff_batch_size: 256 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.1.yaml new file mode 100644 index 00000000..958c6237 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.1.yaml @@ -0,0 +1,99 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + num_augs: 4 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + dropout_rate: 0.002 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 
65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 40 + eff_batch_size: 256 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..9008a04c --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0. +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + swa_start: 31 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.1.yaml new file mode 100644 index 00000000..c19546e8 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.1.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0. 
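+# stage-2 fine-tuning: margin raised to 0.3, SGD at a low LR, SWA over the final epochs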
+trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + swa_start: 31 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..41748978 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 4 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.1.yaml new file mode 100644 index 00000000..ca15bbba --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.1.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + num_augs: 4 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 4 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 
1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..34c0801e --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.1.yaml new file mode 100644 index 00000000..ac859010 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.1.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + 
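# stochastic weight averaging runs from epoch 10; the matching global configs load swa_model_ep0016.pth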
swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34pe_xvec_stage1_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34pe_xvec_stage1_v3.1.yaml new file mode 100644 index 00000000..efa601c0 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_fwseresnet34pe_xvec_stage1_v3.1.yaml @@ -0,0 +1,78 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + num_augs: 4 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 4 + freq_pos_enc: true +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + target_key: speaker +master_port: 4567 \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..eff62765 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseidrndresnet100 + in_channels: 1 + in_feats: 80 + conv_channels: 128 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.05 + se_r: 4 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git 
a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.1.yaml new file mode 100644 index 00000000..03897a19 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.1.yaml @@ -0,0 +1,77 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + num_augs: 4 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseidrndresnet100 + in_channels: 1 + in_feats: 80 + conv_channels: 128 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.05 + se_r: 4 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..571411ca --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker + \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.1.yaml 
b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.1.yaml new file mode 100644 index 00000000..3b8d716a --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.1.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker + \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.2.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.2.yaml new file mode 100644 index 00000000..99fbf196 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.2.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.025 +trainer: + optim: + opt_type: sgd + lr: 1e-4 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 11 + eff_batch_size: 256 + swa_start: 20 + swa_lr: 1e-5 + swa_anneal_epochs: 2 + target_key: speaker + \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_lresnet34_xvec_stage1_v3.1.yaml 
b/egs/voxceleb/v1.2/conf/train_lresnet34_xvec_stage1_v3.1.yaml new file mode 100644 index 00000000..2244fd38 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_lresnet34_xvec_stage1_v3.1.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + num_augs: 4 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_lresnet34_xvec_stage2_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_lresnet34_xvec_stage2_v3.1.yaml new file mode 100644 index 00000000..ac859010 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_lresnet34_xvec_stage2_v3.1.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..e35b273a --- /dev/null +++ 
b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: resnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.1.yaml new file mode 100644 index 00000000..0ec78598 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.1.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + num_augs: 4 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: resnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..34c0801e --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: 
class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.1.yaml new file mode 100644 index 00000000..ac859010 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.1.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..f4c381d6 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + 
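# worker processes for the PyTorch data loader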
num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: tseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 256 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..34c0801e --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/vad_16k.yaml b/egs/voxceleb/v1.2/conf/vad_16k.yaml new file mode 100644 index 00000000..e5a6bb82 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/vad_16k.yaml @@ -0,0 +1,9 @@ +sample_frequency: 16000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: -4.89 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 +wav_scale: 1 diff --git a/egs/voxceleb/v1.2/datapath.sh b/egs/voxceleb/v1.2/datapath.sh new file mode 100644 index 00000000..a7eb575c --- /dev/null +++ b/egs/voxceleb/v1.2/datapath.sh @@ -0,0 +1,23 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" 
];then + # voxceleb1_root=/export/corpora5/VoxCeleb1_v1 #voxceleb1 v1 + voxceleb1_root=/export/corpora5/VoxCeleb1_v2 #voxceleb1 v2 + voxceleb2_root=/export/corpora5/VoxCeleb2 + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 + voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 + voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + voxsrc22_root=/exp/jvillalba/corpora/voxsrc22 + musan_root=/expscratch/dgromero/corpora-open/musan +else + echo "Put your database paths here" + exit 1 +fi + + diff --git a/egs/voxceleb/v1.2/default_config.sh b/egs/voxceleb/v1.2/default_config.sh new file mode 120000 index 00000000..fd0e1bb1 --- /dev/null +++ b/egs/voxceleb/v1.2/default_config.sh @@ -0,0 +1 @@ +global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh \ No newline at end of file diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh new file mode 100644 index 00000000..56d18bd0 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# Channel-freq-wise-SE-ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_cfwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh new file mode 100644 index 00000000..68849f78 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh @@ -0,0 +1,45 @@ +# Channel-wise-SE ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_cwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_cwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + + +nnet_s2_base_cfg=conf/train_cwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git
a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh new file mode 100644 index 00000000..f2622b0e --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh @@ -0,0 +1,44 @@ +# ECAPA-TDNN large + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn2048x4.v3.0 + +nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.1.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.1.sh new file mode 100644 index 00000000..5a9b6028 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.1.sh @@ -0,0 +1,44 @@ +# ECAPA-TDNN large + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn2048x4.v3.1 + +nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v3.1.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v3.1.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh new file mode 100644 index 00000000..a3ad0c29 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh @@ -0,0 +1,45 @@ +# ECAPA-TDNN small + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn512x3.v3.0 + +nnet_s1_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0040.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0030.pth 
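+# the SWA checkpoint on the next line supersedes the epoch-30 model above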
+nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.1.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.1.sh new file mode 100644 index 00000000..05aa4033 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.1.sh @@ -0,0 +1,46 @@ +# ECAPA-TDNN small + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn512x3.v3.1 + +nnet_s1_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1_v3.1.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0030.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn512x3_xvec_stage2_v3.1.yaml +nnet_name=${feat_type}_ecapatdnn512x3.v3.1 +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0030.pth +nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh new file mode 100644 index 00000000..f962c2b3 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# Freq-wise-SE ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_fwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_fwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_fwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=false #true +do_qmf=false #true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.1.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.1.sh new file mode 100644 index 00000000..19f90be6 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.1.sh @@ -0,0 +1,44 @@ +# Freq-wise-SE ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training 
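+# data used to train the x-vector network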
diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.1.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.1.sh
new file mode 100644
index 00000000..19f90be6
--- /dev/null
+++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.1.sh
@@ -0,0 +1,44 @@
+# Freq-wise-SE ResNet34
+
+# acoustic features
+feat_config=conf/fbank80_stmn_16k.yaml
+feat_type=fbank80_stmn
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+nnet_type=resnet
+nnet_name=${feat_type}_fwseresnet34.v3.1
+
+nnet_s1_base_cfg=conf/train_fwseresnet34_xvec_stage1_v3.1.yaml
+nnet_s1_name=$nnet_name.s1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0035.pth
+
+nnet_s2_base_cfg=conf/train_fwseresnet34_xvec_stage2_v3.1.yaml
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+  plda_data=voxceleb2cat_train
+else
+  plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34pe.v3.1.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34pe.v3.1.sh
new file mode 100644
index 00000000..62092708
--- /dev/null
+++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34pe.v3.1.sh
@@ -0,0 +1,44 @@
+# Freq-wise-SE ResNet34
+
+# acoustic features
+feat_config=conf/fbank80_stmn_16k.yaml
+feat_type=fbank80_stmn
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+nnet_type=resnet
+nnet_name=${feat_type}_fwseresnet34pe.v3.1
+
+nnet_s1_base_cfg=conf/train_fwseresnet34pe_xvec_stage1_v3.1.yaml
+nnet_s1_name=$nnet_name.s1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0035.pth
+
+nnet_s2_base_cfg=conf/train_fwseresnet34_xvec_stage2_v3.1.yaml
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+  plda_data=voxceleb2cat_train
+else
+  plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh
new file mode 100644
index 00000000..6ea334b4
--- /dev/null
+++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh
@@ -0,0 +1,44 @@
+# IdRnd ResNet100
+
+# acoustic features
+feat_config=conf/fbank80_stmn_16k.yaml
+feat_type=fbank80_stmn
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+nnet_type=resnet
+nnet_name=${feat_type}_idrnd_resnet100.v3.0
+
+nnet_s1_base_cfg=conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml
+nnet_s1_name=$nnet_name.s1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0029.pth
+
+nnet_s2_base_cfg=conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+  plda_data=voxceleb2cat_train
+else
+  plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.1.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.1.sh
new file mode 100644
index 00000000..f06bcbea
--- /dev/null
+++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.1.sh
@@ -0,0 +1,44 @@
+# IdRnd ResNet100
+
+# acoustic features
+feat_config=conf/fbank80_stmn_16k.yaml
+feat_type=fbank80_stmn
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+nnet_type=resnet
+nnet_name=${feat_type}_idrnd_resnet100.v3.1
+
+nnet_s1_base_cfg=conf/train_idrnd_resnet100_xvec_stage1_v3.1.yaml
+nnet_s1_name=$nnet_name.s1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0029.pth
+
+nnet_s2_base_cfg=conf/train_idrnd_resnet100_xvec_stage2_v3.1.yaml
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+  plda_data=voxceleb2cat_train
+else
+  plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.2.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.2.sh
new file mode 100644
index 00000000..4dbee17d
--- /dev/null
+++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.2.sh
@@ -0,0 +1,45 @@
+# IdRnd ResNet100
+
+# acoustic features
+feat_config=conf/fbank80_stmn_16k.yaml
+feat_type=fbank80_stmn
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+nnet_type=resnet
+nnet_name=${feat_type}_idrnd_resnet100.v3.1
+
+nnet_s1_base_cfg=conf/train_idrnd_resnet100_xvec_stage1_v3.1.yaml
+nnet_s1_name=$nnet_name.s1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0029.pth
+
+nnet_s2_base_cfg=conf/train_idrnd_resnet100_xvec_stage2_v3.2.yaml
+nnet_name=${feat_type}_idrnd_resnet100.v3.2
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/model_ep0011.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+  plda_data=voxceleb2cat_train
+else
+  plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_lresnet34.v3.1.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_lresnet34.v3.1.sh
new file mode 100644
index 00000000..019ac827
--- /dev/null
+++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_lresnet34.v3.1.sh
@@ -0,0 +1,44 @@
+# Light ResNet34
+
+# acoustic features
+feat_config=conf/fbank80_stmn_16k.yaml
+feat_type=fbank80_stmn
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+nnet_type=resnet
+nnet_name=${feat_type}_lresnet34.v3.1
+
+nnet_s1_base_cfg=conf/train_lresnet34_xvec_stage1_v3.1.yaml
+nnet_s1_name=$nnet_name.s1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0035.pth
+
+nnet_s2_base_cfg=conf/train_lresnet34_xvec_stage2_v3.1.yaml
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+  plda_data=voxceleb2cat_train
+else
+  plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh
new file mode 100644
index 00000000..cb1a172d
--- /dev/null
+++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh
@@ -0,0 +1,44 @@
+# ResNet34
+
+# acoustic features
+feat_config=conf/fbank80_stmn_16k.yaml
+feat_type=fbank80_stmn
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+nnet_type=resnet
+nnet_name=${feat_type}_resnet34.v3.0
+
+nnet_s1_base_cfg=conf/train_resnet34_xvec_stage1_v3.0.yaml
+nnet_s1_name=$nnet_name.s1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0035.pth
+
+nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.0.yaml
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+  plda_data=voxceleb2cat_train
+else
+  plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.1.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.1.sh
new file mode 100644
index 00000000..e954b63d
--- /dev/null
+++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.1.sh
@@ -0,0 +1,44 @@
+# ResNet34
+
+# acoustic features
+feat_config=conf/fbank80_stmn_16k.yaml
+feat_type=fbank80_stmn
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+nnet_type=resnet
+nnet_name=${feat_type}_resnet34.v3.1
+
+nnet_s1_base_cfg=conf/train_resnet34_xvec_stage1_v3.1.yaml
+nnet_s1_name=$nnet_name.s1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0035.pth
+
+nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.1.yaml
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+  plda_data=voxceleb2cat_train
+else
+  plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
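In all of these configs, `nnet_s1` points at the last stage-1 checkpoint and `nnet_s2` at the stage-2 (large-margin fine-tuning) checkpoint; filenames of the form `swa_model_ep*.pth` appear to be the stochastic-weight-averaging checkpoints written when SWA is enabled in the stage-2 trainer YAML. If SWA is turned off, only `model_ep*.pth` files exist, so a fallback like this hypothetical snippet (not part of the recipe) keeps a config usable:

```bash
# prefer SWA-averaged weights, fall back to the plain epoch checkpoint
nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth
[ -f "$nnet_s2" ] || nnet_s2=$nnet_s2_dir/model_ep0016.pth
```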
diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh
new file mode 100644
index 00000000..2528d13f
--- /dev/null
+++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh
@@ -0,0 +1,44 @@
+# TSE-ResNet34
+
+# acoustic features
+feat_config=conf/fbank80_stmn_16k.yaml
+feat_type=fbank80_stmn
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+nnet_type=resnet
+nnet_name=${feat_type}_tseresnet34.v3.0
+
+nnet_s1_base_cfg=conf/train_tseresnet34_xvec_stage1_v3.0.yaml
+nnet_s1_name=$nnet_name.s1
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0025.pth
+
+nnet_s2_base_cfg=conf/train_tseresnet34_xvec_stage2_v3.0.yaml
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth
+
+# back-end
+do_plda=false
+do_snorm=false #true
+do_qmf=false #true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+  plda_data=voxceleb2cat_train
+else
+  plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v1.2/hyp_utils b/egs/voxceleb/v1.2/hyp_utils
new file mode 120000
index 00000000..f6d1eb7a
--- /dev/null
+++ b/egs/voxceleb/v1.2/hyp_utils
@@ -0,0 +1 @@
+../../../hyp_utils
\ No newline at end of file
diff --git a/egs/voxceleb/v1.2/path.sh b/egs/voxceleb/v1.2/path.sh
new file mode 100755
index 00000000..6994fdab
--- /dev/null
+++ b/egs/voxceleb/v1.2/path.sh
@@ -0,0 +1,5 @@
+
+export HYP_ROOT=$(readlink -f `pwd -P`/../../..)
+export TOOLS_ROOT=$HYP_ROOT/tools
+
+. $TOOLS_ROOT/path.sh
diff --git a/egs/voxceleb/v1.2/run_001_prepare_data.sh b/egs/voxceleb/v1.2/run_001_prepare_data.sh
new file mode 100755
index 00000000..563d3c2d
--- /dev/null
+++ b/egs/voxceleb/v1.2/run_001_prepare_data.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# Copyright
+#              2018   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+config_file=default_config.sh
+
+. parse_options.sh || exit 1;
+. datapath.sh
+. $config_file
+
+if [ $stage -le 1 ];then
+  # Prepare the VoxCeleb2 dataset for training.
+  hyperion-prepare-data voxceleb2 --subset dev --corpus-dir $voxceleb2_root \
+    --cat-videos --use-kaldi-ids \
+    --output-dir data/voxceleb2cat_train
+fi
+
+if [ $stage -le 2 ];then
+  # prepare voxceleb1 for test
+  hyperion-prepare-data voxceleb1 --task test --corpus-dir $voxceleb1_root \
+    --use-kaldi-ids \
+    --output-dir data/voxceleb1_test
+fi
+
+if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then
+  hyperion-prepare-data voxsrc22 --subset dev --corpus-dir $voxsrc22_root \
+    --vox1-corpus-dir $voxceleb1_root \
+    --output-dir data/voxsrc22_dev
+fi
+
+# if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then
+#   hyperion-prepare-data voxsrc22 --subset test --corpus-dir $voxsrc22_root \
+#     --vox1-corpus-dir $voxceleb1_root \
+#     --output-dir data/voxsrc22_test
+# fi
+
+if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then
+  # split vox2 into 2 parts, for cohort and qmf training
+  hyperion-split-dataset-into-trials-and-cohort --data-dir data/voxceleb2cat_train
+fi
diff --git a/egs/voxceleb/v1.2/run_002_compute_evad.sh b/egs/voxceleb/v1.2/run_002_compute_evad.sh
new file mode 100755
index 00000000..acccace3
--- /dev/null
+++ b/egs/voxceleb/v1.2/run_002_compute_evad.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Copyright
+#              2018   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+nodes=fs01
+vad_dir=`pwd`/exp/vad_e
+vad_config=conf/vad_16k.yaml
+nj=40
+
+stage=1
+config_file=default_config.sh
+
+. parse_options.sh || exit 1;
+. $config_file
+
+if [ -z "$vad_config" ];then
+  echo "We are not using VAD in this configuration"
+  exit 0
+fi
+
+if [ "$do_voxsrc22" == "true" ];then
+  extra_data="voxsrc22_dev"
+fi
+
+
+if [ $stage -le 1 ]; then
+  # Prepare to distribute data over multiple machines
+  # This only does something at CLSP grid
+  for name in voxceleb2cat_train voxceleb1_test $extra_data
+  do
+    hyp_utils/create_data_split_dirs.sh \
+      $vad_dir/$name \
+      $USER/hyp-data/voxceleb/v1.2/vad $nodes
+  done
+fi
+
+#Train datasets
+if [ $stage -le 2 ];then
+  for name in voxceleb2cat_train voxceleb1_test $extra_data
+  do
+    # This creates links to distribute data in CLSP grid
+    # If you are not at CLSP grid, it does nothing and can be deleted
+    hyp_utils/create_data_split_links.sh $vad_dir/$name/vad.JOB.ark $nj
+    echo "compute vad for $name"
+    $train_cmd JOB=1:$nj $vad_dir/$name/log/vad.JOB.log \
+      hyp_utils/conda_env.sh \
+      hyperion-compute-energy-vad --cfg $vad_config \
+      --recordings-file data/$name/recordings.csv \
+      --output-spec ark,csv:$vad_dir/$name/vad.JOB.ark,$vad_dir/$name/vad.JOB.csv \
+      --part-idx JOB --num-parts $nj || exit 1
+
+    hyperion-tables cat \
+      --table-type features \
+      --output-file $vad_dir/$name/vad.csv --num-tables $nj
+    hyperion-dataset add_features \
+      --dataset data/$name \
+      --features-name vad \
+      --features-file $vad_dir/$name/vad.csv
+  done
+fi
+
+
diff --git a/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh
new file mode 100755
index 00000000..73c7ed82
--- /dev/null
+++ b/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+# Copyright
+#              2020   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+nj=10
+config_file=default_config.sh
+. parse_options.sh || exit 1;
+. $config_file
+. datapath.sh
+
+# We prepare the noise files and RIR for online speech augmentation
+if [ $stage -le 1 ]; then
+  for name in noise music speech
+  do
+    hyperion-prepare-data musan \
+      --corpus-dir $musan_root \
+      --subset $name \
+      --output-dir data/musan_$name
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  # # Prepare to distribute data over multiple machines
+  # # This only does something at CLSP grid
+  # hyp_utils/create_data_split_dirs.sh $vad_dir $USER/hyp-data/voxceleb/v1.2/vad $nodes
+
+  for name in musan_noise musan_music
+  do
+    input_data_dir=data/$name
+    output_data_dir=data/${name}_proc_audio
+    output_dir=exp/proc_audio/$name
+    $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \
+      hyp_utils/conda_env.sh \
+      hyperion-preprocess-audio-files \
+      --audio-format flac \
+      --part-idx JOB --num-parts $nj \
+      --recordings-file $input_data_dir/recordings.csv \
+      --output-path $output_dir \
+      --output-recordings-file $output_dir/recordings.JOB.csv
+
+    hyperion-tables cat \
+      --table-type recordings \
+      --output-file $output_dir/recordings.csv --num-tables $nj
+    hyperion-dataset set_recordings \
+      --dataset $input_data_dir \
+      --recordings-file $output_dir/recordings.csv \
+      --output-dataset $output_data_dir
+
+
+  done
+fi
+
+if [ $stage -le 3 ]; then
+  # Create Babble noise from MUSAN speech files
+  for name in musan_speech
+  do
+    input_data_dir=data/$name
+    output_data_dir=data/${name}_babble
+    output_dir=exp/proc_audio/${name}_babble
+    $train_cmd $output_dir/log/make_babble_noise_${name}.log \
+      hyp_utils/conda_env.sh \
+      hyperion-make-babble-noise-audio-files \
+      --audio-format flac \
+      --min-spks 3 --max-spks 10 --num-reuses 5 \
+      --recordings-file $input_data_dir/recordings.csv \
+      --output-path $output_dir \
+      --output-recordings-file $output_data_dir/recordings.csv
+    hyperion-dataset make_from_recordings \
+      --dataset $output_data_dir \
+      --recordings-file $output_data_dir/recordings.csv
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  if [ ! -d "RIRS_NOISES" ]; then
+    # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
+    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
+    unzip rirs_noises.zip
+  fi
+  hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom
+  hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom
+  hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real
+  for rirs in rirs_smallroom rirs_mediumroom rirs_real
+  do
+    output_dir=exp/rirs/$rirs
+    data_dir=data/$rirs
+    $train_cmd $output_dir/log/pack_rirs_${rirs}.log \
+      hyp_utils/conda_env.sh \
+      hyperion-pack-wav-rirs --input $data_dir/recordings.csv \
+      --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1;
+    hyperion-dataset add_features --dataset $data_dir \
+      --features-name rirs --features-file $output_dir/rirs.csv
+
+  done
+fi
+
diff --git a/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh b/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh
new file mode 100755
index 00000000..4e0c5b19
--- /dev/null
+++ b/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# Copyright
+#              2020   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+nodes=b1
+nj=40
+stage=1
+config_file=default_config.sh
+
+. parse_options.sh || exit 1;
+. $config_file
+
+if [ $stage -le 1 ]; then
+  # Prepare to distribute data over multiple machines
+  # This only does something at CLSP grid
+  hyp_utils/create_data_split_dirs.sh \
+    exp/xvector_audios/$nnet_data \
+    $USER/hyp-data/voxceleb/v1.2/xvector_audios/$nnet_data $nodes
+fi
+
+if [ $stage -le 2 ];then
+  output_dir=exp/proc_audio/$nnet_data
+  # This creates links to distribute data in CLSP grid
+  # If you are not at CLSP grid, it does nothing and can be deleted
+  hyp_utils/create_audios_split_links.sh $output_dir data/$nnet_data/recordings.csv flac
+  if [ -n "$vad_config" ];then
+    vad_args="--vad csv:data/$nnet_data/vad.csv"
+    update_durs="--update-seg-durs"
+  fi
+
+  $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${nnet_data}.JOB.log \
+    hyp_utils/conda_env.sh \
+    hyperion-preprocess-audio-files \
+    --audio-format flac --remove-dc-offset $vad_args \
+    --part-idx JOB --num-parts $nj \
+    --recordings-file data/$nnet_data/recordings.csv \
+    --output-path $output_dir \
+    --output-recordings-file $output_dir/recordings.JOB.csv
+
+  hyperion-tables cat \
+    --table-type recordings \
+    --output-file $output_dir/recordings.csv --num-tables $nj
+
+  hyperion-dataset set_recordings $update_durs \
+    --dataset data/$nnet_data \
+    --recordings-file $output_dir/recordings.csv \
+    --output-dataset data/${nnet_data}_proc_audio \
+    --remove-features vad
+fi
+
+if [ $stage -le 3 ];then
+  hyperion-dataset remove_short_segments \
+    --dataset data/${nnet_data}_proc_audio \
+    --output-dataset data/${nnet_data}_filtered \
+    --length-name duration --min-length 2.0
+
+  hyperion-dataset remove_classes_few_segments \
+    --dataset data/${nnet_data}_filtered \
+    --class-name speaker --min-segs 4
+fi
+
+if [ $stage -le 4 ];then
+  hyperion-dataset split_train_val \
+    --dataset data/${nnet_data}_filtered \
+    --val-prob 0.03 \
+    --joint-classes speaker --min-train-samples 1 \
+    --seed 1123581321 \
+    --train-dataset data/${nnet_data}_xvector_train \
+    --val-dataset data/${nnet_data}_xvector_val
+fi
+
diff --git a/egs/voxceleb/v1.2/run_005_train_xvector.sh b/egs/voxceleb/v1.2/run_005_train_xvector.sh
new file mode 100755
index 00000000..2479d565
--- /dev/null
+++ b/egs/voxceleb/v1.2/run_005_train_xvector.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+# Copyright
+#              2019   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+ngpu=4
+config_file=default_config.sh
+interactive=false
+num_workers=""
+use_tb=false
+use_wandb=false
+
+. parse_options.sh || exit 1;
+. $config_file
+. datapath.sh
+
+train_data_dir=data/${nnet_data}_xvector_train
+val_data_dir=data/${nnet_data}_xvector_val
+
+#add extra args from the command line arguments
+if [ -n "$num_workers" ];then
+  extra_args="--data.train.data_loader.num-workers $num_workers"
+fi
+if [ "$use_tb" == "true" ];then
+  extra_args="$extra_args --trainer.use-tensorboard"
+fi
+if [ "$use_wandb" == "true" ];then
+  extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)"
+fi
+
+if [ "$interactive" == "true" ];then
+  export cuda_cmd=run.pl
+fi
+
+# Network Training
+if [ $stage -le 1 ]; then
+
+  mkdir -p $nnet_s1_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s1_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    hyperion-train-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \
+    --data.train.dataset.recordings-file $train_data_dir/recordings.csv \
+    --data.train.dataset.segments-file $train_data_dir/segments.csv \
+    --data.train.dataset.class-files $train_data_dir/speaker.csv \
+    --data.val.dataset.recordings-file $val_data_dir/recordings.csv \
+    --data.val.dataset.segments-file $val_data_dir/segments.csv \
+    --trainer.exp-path $nnet_s1_dir \
+    --num-gpus $ngpu \
+
+fi
+
+
+# Large Margin Fine-tuning
+if [ $stage -le 2 ]; then
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)"
+  fi
+  mkdir -p $nnet_s2_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s2_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    hyperion-finetune-wav2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \
+    --data.train.dataset.recordings-file $train_data_dir/recordings.csv \
+    --data.train.dataset.segments-file $train_data_dir/segments.csv \
+    --data.train.dataset.class-files $train_data_dir/speaker.csv \
+    --data.val.dataset.recordings-file $val_data_dir/recordings.csv \
+    --data.val.dataset.segments-file $val_data_dir/segments.csv \
+    --in-model-file $nnet_s1 \
+    --trainer.exp-path $nnet_s2_dir \
+    --num-gpus $ngpu \
+
+fi
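`run_005_train_xvector.sh` forwards a few convenience flags to the trainer CLI; a typical launch, sketched under the assumption that `cmd.sh` defines a working `cuda_cmd` queue wrapper:

```bash
# 2-GPU training with tensorboard logging and 8 data-loader workers
./run_005_train_xvector.sh \
  --config-file global_conf/config_fbank80_stmn_resnet34.v3.1.sh \
  --ngpu 2 --use-tb true --num-workers 8

# or run only stage 2 (large-margin fine-tuning) on the local machine
./run_005_train_xvector.sh --stage 2 --interactive true --ngpu 1
```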
diff --git a/egs/voxceleb/v1.2/run_006_extract_xvectors.sh b/egs/voxceleb/v1.2/run_006_extract_xvectors.sh
new file mode 100755
index 00000000..0dc58048
--- /dev/null
+++ b/egs/voxceleb/v1.2/run_006_extract_xvectors.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+# Copyright
+#              2020   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+nnet_stage=2
+config_file=default_config.sh
+use_gpu=false
+xvec_chunk_length=120.0
+. parse_options.sh || exit 1;
+. $config_file
+
+if [ "$use_gpu" == "true" ];then
+  xvec_args="--use-gpu --chunk-length $xvec_chunk_length"
+  xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G"
+  num_gpus=1
+else
+  xvec_cmd="$train_cmd --mem 12G"
+  num_gpus=0
+fi
+
+if [ $nnet_stage -eq 1 ];then
+  nnet=$nnet_s1
+  nnet_name=$nnet_s1_name
+elif [ $nnet_stage -eq 2 ];then
+  nnet=$nnet_s2
+  nnet_name=$nnet_s2_name
+elif [ $nnet_stage -eq 3 ];then
+  nnet=$nnet_s3
+  nnet_name=$nnet_s3_name
+elif [ $nnet_stage -eq 4 ];then
+  nnet=$nnet_s4
+  nnet_name=$nnet_s4_name
+elif [ $nnet_stage -eq 5 ];then
+  nnet=$nnet_s5
+  nnet_name=$nnet_s5_name
+elif [ $nnet_stage -eq 6 ];then
+  nnet=$nnet_s6
+  nnet_name=$nnet_s6_name
+fi
+
+xvector_dir=exp/xvectors/$nnet_name
+
+if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then
+  # Extract xvectors for training LDA/PLDA
+  nj=100
+  for name in voxceleb2cat_train
+  do
+    if [ -n "$vad_config" ];then
+      vad_args="--vad csv:data/$name/vad.csv"
+    fi
+    output_dir=$xvector_dir/$name
+    echo "Extracting x-vectors for $name"
+    $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \
+      hyp_utils/conda_env.sh --num-gpus $num_gpus \
+      hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \
+      --part-idx JOB --num-parts $nj \
+      --recordings-file data/$name/recordings.csv \
+      --random-utt-length --min-utt-length 2 --max-utt-length 30 \
+      --model-path $nnet \
+      --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv
+    hyperion-tables cat \
+      --table-type features \
+      --output-file $output_dir/xvector.csv --num-tables $nj
+
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  # Extracts x-vectors for evaluation
+  nj=100
+  if [ "$do_voxsrc22" == "true" ];then
+    extra_data="voxsrc22_dev"
+  fi
+  for name in voxceleb1_test $extra_data
+  do
+    num_segs=$(wc -l data/$name/segments.csv | awk '{ print $1-1}')
+    nj=$(($num_segs < 100 ? $num_segs:100))
+    if [ -n "$vad_config" ];then
+      vad_args="--vad csv:data/$name/vad.csv"
+    fi
+    output_dir=$xvector_dir/$name
+    echo "Extracting x-vectors for $name"
+    $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \
+      hyp_utils/conda_env.sh --num-gpus $num_gpus \
+      hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \
+      --part-idx JOB --num-parts $nj \
+      --recordings-file data/$name/recordings.csv \
+      --model-path $nnet \
+      --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv
+    hyperion-tables cat \
+      --table-type features \
+      --output-file $output_dir/xvector.csv --num-tables $nj
+
+  done
+fi
+
+
diff --git a/egs/voxceleb/v1.2/run_007_eval_be.sh b/egs/voxceleb/v1.2/run_007_eval_be.sh
new file mode 100755
index 00000000..bd436644
--- /dev/null
+++ b/egs/voxceleb/v1.2/run_007_eval_be.sh
@@ -0,0 +1,321 @@
+#!/bin/bash
+# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+#
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+nnet_stage=2
+config_file=default_config.sh
+
+. parse_options.sh || exit 1;
+. $config_file
+. datapath.sh
+
+if [ $nnet_stage -eq 1 ];then
+  nnet=$nnet_s1
+  nnet_name=$nnet_s1_name
+elif [ $nnet_stage -eq 2 ];then
+  nnet=$nnet_s2
+  nnet_name=$nnet_s2_name
+elif [ $nnet_stage -eq 3 ];then
+  nnet=$nnet_s3
+  nnet_name=$nnet_s3_name
+elif [ $nnet_stage -eq 4 ];then
+  nnet=$nnet_s4
+  nnet_name=$nnet_s4_name
+elif [ $nnet_stage -eq 5 ];then
+  nnet=$nnet_s5
+  nnet_name=$nnet_s5_name
+elif [ $nnet_stage -eq 6 ];then
+  nnet=$nnet_s6
+  nnet_name=$nnet_s6_name
+fi
+
+plda_label=${plda_type}y${plda_y_dim}_v1
+be_name=lda${lda_dim}_${plda_label}_${plda_data}
+
+xvector_dir=exp/xvectors/$nnet_name
+be_dir=exp/be/$nnet_name/$be_name
+score_dir=exp/scores/$nnet_name
+score_plda_dir=$score_dir/${be_name}/plda
+score_cosine_dir=$score_dir/cosine
+score_cosine_snorm_dir=$score_dir/cosine_snorm
+score_cosine_qmf_dir=$score_dir/cosine_qmf
+
+if [ $stage -le 3 ];then
+
+  echo "Eval Voxceleb 1 with Cosine scoring"
+  num_parts=8
+  for((i=1;i<=$num_parts;i++));
+  do
+    for((j=1;j<=$num_parts;j++));
+    do
+      $train_cmd $score_cosine_dir/log/voxceleb1_${i}_${j}.log \
+        hyp_utils/conda_env.sh \
+        hyperion-eval-cosine-scoring-backend \
+        --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \
+        --ndx-file data/voxceleb1_test/trials.csv \
+        --enroll-map-file data/voxceleb1_test/enrollment.csv \
+        --score-file $score_cosine_dir/voxceleb1_scores.csv \
+        --enroll-part-idx $i --num-enroll-parts $num_parts \
+        --test-part-idx $j --num-test-parts $num_parts &
+    done
+  done
+  wait
+  hyperion-merge-scores --output-file $score_cosine_dir/voxceleb1_scores.csv \
+    --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+  $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \
+    hyperion-eval-verification-metrics \
+    --score-files $score_cosine_dir/voxceleb1_scores.csv \
+    --key-files data/voxceleb1_test/trials_{o,e,h}.csv \
+    --score-names voxceleb1 \
+    --key-names O E H \
+    --sparse \
+    --output-file $score_cosine_dir/voxceleb1_results.csv
+
+  cat $score_cosine_dir/voxceleb1_results.csv
+fi
+
+if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then
+  echo "Eval voxsrc22 with Cosine scoring"
+  $train_cmd $score_cosine_dir/log/voxsrc22_dev.log \
+    hyp_utils/conda_env.sh \
+    hyperion-eval-cosine-scoring-backend \
+    --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \
+    --ndx-file data/voxsrc22_dev/trials.csv \
+    --enroll-map-file data/voxsrc22_dev/enrollment.csv \
+    --score-file $score_cosine_dir/voxsrc22_dev_scores.csv
+
+  # $train_cmd $score_cosine_dir/log/voxsrc22_eval.log \
+  #   hyp_utils/conda_env.sh \
+  #   hyperion-eval-cosine-scoring-backend \
+  #   --feats-file csv:$xvector_dir/voxsrc22_eval/xvector.csv \
+  #   --ndx-file data/voxsrc22_eval/trials.csv \
+  #   --enroll-map-file data/voxsrc22_eval/enrollment.csv \
+  #   --score-file $score_cosine_dir/voxsrc22_eval_scores.csv
+
+  $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxsrc22_dev.log \
+    hyperion-eval-verification-metrics \
+    --score-files $score_cosine_dir/voxsrc22_dev_scores.csv \
+    --key-files data/voxsrc22_dev/trials.csv \
+    --score-names voxsrc22_dev \
+    --key-names all \
+    --output-file $score_cosine_dir/voxsrc22_dev_results.csv
+
+  cat $score_cosine_dir/voxsrc22_dev_results.csv
+
+fi
+
+if [ "$do_snorm" == "true" ];then
+  if [ $stage -le 5 ];then
+    echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm"
+    num_parts=16
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        $train_cmd --mem 22G $score_cosine_snorm_dir/log/voxceleb1_${i}_${j}.log \
+          hyp_utils/conda_env.sh \
+          hyperion-eval-cosine-scoring-backend \
+          --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \
+          --ndx-file data/voxceleb1_test/trials.csv \
+          --enroll-map-file data/voxceleb1_test/enrollment.csv \
+          --score-file $score_cosine_snorm_dir/voxceleb1_scores.csv \
+          --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \
+          --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --cohort-nbest 1000 --avg-cohort-by speaker \
+          --enroll-part-idx $i --num-enroll-parts $num_parts \
+          --test-part-idx $j --num-test-parts $num_parts &
+      done
+      sleep 5s
+    done
+    wait
+    hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxceleb1_scores.csv \
+      --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+    $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \
+      hyperion-eval-verification-metrics \
+      --score-files $score_cosine_snorm_dir/voxceleb1_scores.csv \
+      --key-files data/voxceleb1_test/trials_{o,e,h}.csv \
+      --score-names voxceleb1 \
+      --key-names O E H \
+      --sparse \
+      --output-file $score_cosine_snorm_dir/voxceleb1_results.csv
+
+    cat $score_cosine_snorm_dir/voxceleb1_results.csv
+  fi
+
+  if [ $stage -le 6 ] && [ "$do_voxsrc22" == "true" ];then
+    echo "Eval voxsrc22 with Cosine scoring + AS-Norm"
+    num_parts=16
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        $train_cmd --mem 22G $score_cosine_snorm_dir/log/voxsrc22_dev_${i}_${j}.log \
+          hyp_utils/conda_env.sh \
+          hyperion-eval-cosine-scoring-backend \
+          --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \
+          --ndx-file data/voxsrc22_dev/trials.csv \
+          --enroll-map-file data/voxsrc22_dev/enrollment.csv \
+          --score-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \
+          --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \
+          --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --cohort-nbest 1000 --avg-cohort-by speaker \
+          --enroll-part-idx $i --num-enroll-parts $num_parts \
+          --test-part-idx $j --num-test-parts $num_parts &
+        sleep 5s
+      done
+      sleep 10s
+    done
+    wait
+    hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \
+      --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+    $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \
+      hyperion-eval-verification-metrics \
+      --score-files $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \
+      --key-files data/voxsrc22_dev/trials.csv \
+      --score-names voxsrc22_dev \
+      --key-names all \
+      --output-file $score_cosine_snorm_dir/voxsrc22_dev_results.csv
+
+    cat $score_cosine_snorm_dir/voxsrc22_dev_results.csv
+
+  fi
+
+fi
+
+if [ "$do_qmf" == "true" ];then
+  if [ $stage -le 7 ];then
+    echo "Train QMF in Vox2"
+    echo "...Calculating quality measures for Vox2"
+    num_parts=8
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        $train_cmd $score_cosine_qmf_dir/log/voxceleb2_trials_${i}_${j}.log \
+          hyp_utils/conda_env.sh \
+          hyperion-eval-cosine-scoring-backend-with-qmf \
+          --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --ndx-file data/voxceleb2cat_train_trials/trials.csv \
+          --enroll-map-file data/voxceleb2cat_train_trials/enrollments.csv \
+          --score-file $score_cosine_qmf_dir/voxceleb2_scores.csv \
+          --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \
+          --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --cohort-nbest 1000 --avg-cohort-by speaker \
+          --enroll-part-idx $i --num-enroll-parts $num_parts \
+          --test-part-idx $j --num-test-parts $num_parts &
+      done
+      sleep 5s
+    done
+    wait
+    hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \
+      --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+    hyperion-train-qmf --score-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \
+      --key-file data/voxceleb2cat_train_trials/trials.csv \
+      --model-file $score_cosine_qmf_dir/qmf.h5
+
+  fi
+
+  if [ $stage -le 8 ];then
+    echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm + QMF"
+    num_parts=16
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        $train_cmd --mem 22G $score_cosine_qmf_dir/log/voxceleb1_${i}_${j}.log \
+          hyp_utils/conda_env.sh \
+          hyperion-eval-cosine-scoring-backend-with-qmf \
+          --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \
+          --ndx-file data/voxceleb1_test/trials.csv \
+          --enroll-map-file data/voxceleb1_test/enrollment.csv \
+          --score-file $score_cosine_qmf_dir/voxceleb1_scores.csv \
+          --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \
+          --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --cohort-nbest 1000 --avg-cohort-by speaker \
+          --qmf-file $score_cosine_qmf_dir/qmf.h5 \
+          --enroll-part-idx $i --num-enroll-parts $num_parts \
+          --test-part-idx $j --num-test-parts $num_parts &
+      done
+      sleep 5s
+    done
+    wait
+    for suffix in "" .snorm .snorm.qmf
+    do
+      (
+      hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \
+        --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+      $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1$suffix.log \
+        hyperion-eval-verification-metrics \
+        --score-files $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \
+        --key-files data/voxceleb1_test/trials_{o,e,h}.csv \
+        --score-names voxceleb1 \
+        --key-names O E H \
+        --sparse \
+        --output-file $score_cosine_qmf_dir/voxceleb1_results$suffix.csv
+
+      echo "$score_cosine_qmf_dir/voxceleb1_results$suffix.csv:"
+      cat $score_cosine_qmf_dir/voxceleb1_results$suffix.csv
+      ) &
+    done
+    wait
+  fi
+
+  if [ $stage -le 9 ] && [ "$do_voxsrc22" == "true" ];then
+    echo "Eval voxsrc22 with Cosine scoring + QMF"
+    num_parts=16
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        $train_cmd --mem 22G $score_cosine_qmf_dir/log/voxsrc22_dev_${i}_${j}.log \
+          hyp_utils/conda_env.sh \
+          hyperion-eval-cosine-scoring-backend-with-qmf \
+          --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \
+          --ndx-file data/voxsrc22_dev/trials.csv \
+          --enroll-map-file data/voxsrc22_dev/enrollment.csv \
+          --score-file $score_cosine_qmf_dir/voxsrc22_dev_scores.csv \
+          --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \
+          --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --cohort-nbest 1000 --avg-cohort-by speaker \
+          --qmf-file $score_cosine_qmf_dir/qmf.h5 \
+          --enroll-part-idx $i --num-enroll-parts $num_parts \
+          --test-part-idx $j --num-test-parts $num_parts &
+        sleep 5s
+      done
+      sleep 10s
+    done
+    wait
+    for suffix in "" .snorm .snorm.qmf
+    do
+      (
+      hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \
+        --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+      $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxsrc22_dev$suffix.log \
+        hyperion-eval-verification-metrics \
+        --score-files $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \
+        --key-files data/voxsrc22_dev/trials.csv \
+        --score-names voxsrc22_dev \
+        --key-names all \
+        --output-file $score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv
+
+      echo "$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv:"
+      cat $score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv
+      ) &
+    done
+    wait
+  fi
+
+fi
+
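The QMF back-end evaluated above fuses the s-normalized cosine score with simple quality measures through the binary logistic regression trained by `hyperion-train-qmf` (the `eval_be_cos_qmf.py` script further down implements the same idea for the v1 recipe). Schematically, for an enrollment/test pair (e, t):

```latex
s_{\mathrm{qmf}}(e,t) = w_0 + w_1\, s_{\mathrm{snorm}}(e,t)
                      + w_2 \max(d_e, d_t) + w_3 \min(d_e, d_t)
                      + w_4 \max(\mu_e, \mu_t) + w_5 \min(\mu_e, \mu_t)
```

where d_e, d_t are min-max normalized log durations of the two segments and mu_e, mu_t are the scaled means of the n-best cohort scores from AS-Norm.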
diff --git a/egs/voxceleb/v1/local/attack_analysis.py b/egs/voxceleb/v1/local/attack_analysis.py
deleted file mode 100755
index 8c74c6e9..00000000
--- a/egs/voxceleb/v1/local/attack_analysis.py
+++ /dev/null
@@ -1,200 +0,0 @@
-#!/usr/bin/env python
-"""
-  Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
-  Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-"""
-
-
-import sys
-import os
-import argparse
-import time
-import logging
-
-import numpy as np
-import pandas as pd
-
-from hyperion.hyp_defs import float_cpu, config_logger
-from hyperion.metrics.verification_evaluator import (
-    VerificationAdvAttackEvaluator as Eval,
-)
-
-
-def evaluate_attacks(
-    key_file,
-    clean_score_file,
-    attack_score_files,
-    attack_stats_files,
-    output_path,
-    prior,
-):
-
-    output_dir = os.path.dirname(output_path)
-    if not os.path.isdir(output_dir):
-        os.makedirs(output_dir)
-
-    evaluator = Eval(
-        key_file, clean_score_file, attack_score_files, attack_stats_files, prior
-    )
-
-    # performance vs SNR
-    logging.info("compute perf vs snr for all trials")
-    df_clean = evaluator.compute_dcf_eer(return_df=True)
-    df_clean.insert(0, "snr", np.inf)
-
-    df = evaluator.compute_dcf_eer_vs_stats(
-        "snr",
-        [-10, 0, 10, 20, 30, 40, 50, 60],
-        "all",
-        higher_better=True,
-        return_df=True,
-    )
-    file_path = "%s_attack_all_snr_results.csv" % (output_path)
-    df = pd.concat([df_clean, df], ignore_index=True)
-    df.to_csv(file_path)
-    file_path = "%s_attack_all_snr" % (output_path)
-    evaluator.plot_dcf_eer_vs_stat_v1(
-        df, "snr", file_path, clean_ref=0, xlabel="SNR(dB)", higher_better=True
-    )
-
-    logging.info("compute perf vs snr for tar trials")
-    df = evaluator.compute_dcf_eer_vs_stats(
-        "snr",
-        [-10, 0, 10, 20, 30, 40, 50, 60],
-        "tar",
-        higher_better=True,
-        return_df=True,
-    )
-    file_path = "%s_attack_tar_snr_results.csv" % (output_path)
-    df = pd.concat([df_clean, df], ignore_index=True)
-    df.to_csv(file_path)
-    file_path = "%s_attack_tar_snr" % (output_path)
-    evaluator.plot_dcf_eer_vs_stat_v1(
-        df, "snr", file_path, clean_ref=0, xlabel="SNR(dB)", higher_better=True
-    )
-
-    logging.info("compute perf vs snr for non trials")
-    df = evaluator.compute_dcf_eer_vs_stats(
-        "snr",
-        [-10, 0, 10, 20, 30, 40, 50, 60],
-        "non",
-        higher_better=True,
-        return_df=True,
-    )
-    file_path = "%s_attack_non_snr_results.csv" % (output_path)
-    df = pd.concat([df_clean, df], ignore_index=True)
-    df.to_csv(file_path)
-    file_path = "%s_attack_non_snr" % (output_path)
-    evaluator.plot_dcf_eer_vs_stat_v1(
-        df, "snr", file_path, clean_ref=0, xlabel="SNR(dB)", higher_better=True
-    )
-
-    logging.info("find best attacks from snr point of view")
-    for i in range(len(attack_score_files)):
-        file_path = "%s_best_snr_tar_attacks_%d.csv" % (output_path, i)
-        evaluator.save_best_attacks(
-            file_path,
-            "snr",
-            "tar",
-            num_best=10,
-            min_delta=1,
-            attack_idx=i,
-            higher_better=True,
-        )
-
-        file_path = "%s_best_snr_non_attacks_%d.csv" % (output_path, i)
-        evaluator.save_best_attacks(
-            file_path,
-            "snr",
-            "non",
-            num_best=10,
-            min_delta=1,
-            attack_idx=i,
-            higher_better=True,
-        )
-
-    # performance vs Linf
-    logging.info("compute perf vs linf for all trials")
-    eps = np.ceil(np.asarray([0, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]) * 2 ** 15)
-    df = evaluator.compute_dcf_eer_vs_stats(
-        "n_linf", eps, "all", higher_better=False, return_df=True
-    )
-    file_path = "%s_attack_all_linf_results.csv" % (output_path)
-    df.to_csv(file_path)
-    file_path = "%s_attack_all_linf" % (output_path)
-    evaluator.plot_dcf_eer_vs_stat_v1(
-        df, "n_linf", file_path, clean_ref=0, xlabel=r"$L_{\infty}$", log_x=True
-    )
-
-    logging.info("compute perf vs linf for tar trials")
-    df = evaluator.compute_dcf_eer_vs_stats(
-        "n_linf", eps, "tar", higher_better=False, return_df=True
-    )
-    file_path = "%s_attack_tar_linf_results.csv" % (output_path)
-    df.to_csv(file_path)
-    file_path = "%s_attack_tar_linf" % (output_path)
-    evaluator.plot_dcf_eer_vs_stat_v1(
-        df, "n_linf", file_path, clean_ref=0, xlabel=r"$L_{\infty}$", log_x=True
-    )
-
-    logging.info("compute perf vs linf for non trials")
-    df = evaluator.compute_dcf_eer_vs_stats(
-        "n_linf", eps, "non", higher_better=False, return_df=True
-    )
-    file_path = "%s_attack_non_linf_results.csv" % (output_path)
-    df.to_csv(file_path)
-    file_path = "%s_attack_non_linf" % (output_path)
-    evaluator.plot_dcf_eer_vs_stat_v1(
-        df, "n_linf", file_path, clean_ref=0, xlabel=r"$L_{\infty}$", log_x=True
-    )
-
-    # find the best attacks in terms of linf
-    logging.info("find best attacks from linf point of view")
-    for i in range(len(attack_score_files)):
-        file_path = "%s_best_linf_tar_attacks_%d.csv" % (output_path, i)
-        evaluator.save_best_attacks(
-            file_path,
-            "n_linf",
-            "tar",
-            num_best=10,
-            min_delta=1,
-            attack_idx=i,
-            higher_better=False,
-        )
-
-        file_path = "%s_best_linf_non_attacks_%d.csv" % (output_path, i)
-        evaluator.save_best_attacks(
-            file_path,
-            "n_linf",
-            "non",
-            num_best=10,
-            min_delta=1,
-            attack_idx=i,
-            higher_better=False,
-        )
-
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        fromfile_prefix_chars="@",
-        description="Analyses performance of adversarial attacks for spk. verif.",
-    )
-
-    parser.add_argument("--key-file", required=True)
-    parser.add_argument("--clean-score-file", required=True)
-    parser.add_argument("--attack-score-files", required=True, nargs="+")
-    parser.add_argument("--attack-stats-files", required=True, nargs="+")
-    parser.add_argument("--output-path", required=True)
-    parser.add_argument("--prior", default=0.05, type=float)
-    parser.add_argument(
-        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
-    )
-
-    args = parser.parse_args()
-    config_logger(args.verbose)
-    del args.verbose
-    logging.debug(args)
-
-    evaluate_attacks(**vars(args))
diff --git a/egs/voxceleb/v1/local/score_dcf.py b/egs/voxceleb/v1/local/score_dcf.py
deleted file mode 100755
index 9858583d..00000000
--- a/egs/voxceleb/v1/local/score_dcf.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/env python
-"""
- Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
- Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-"""
-
-import sys
-import os
-import argparse
-import time
-import logging
-
-import numpy as np
-
-from hyperion.hyp_defs import float_cpu, config_logger
-from hyperion.utils import SparseTrialScores, SparseTrialKey
-from hyperion.metrics import fast_eval_dcf_eer as fast_eval
-
-
-def score_dcf(key_file, score_file, output_path):
-
-    logging.info("Load key: %s" % key_file)
-    key = SparseTrialKey.load_txt(key_file)
-    logging.info("Load scores: %s" % score_file)
-    scr = SparseTrialScores.load_txt(score_file)
-    logging.info("separating tar/non")
-    tar, non = scr.get_tar_non(key)
-    logging.info("computing EER/DCF")
-    priors = np.array([0.001, 0.005, 0.01, 0.05])
-    min_dcf, act_dcf, eer, _, min_pmiss, min_pfa, act_pmiss, act_pfa = fast_eval(
-        tar, non, priors, return_probs=True
-    )
-
-    output_dir = os.path.dirname(output_path)
-    if not os.path.isdir(output_dir):
-        os.makedirs(output_dir)
-
-    ntar = len(tar)
-    nnon = len(non)
-
-    output_file = output_path + "_results"
-    with open(output_file, "w") as f:
-        s = "EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f} ntar: {9:d} nnon: {10:d}\n".format(
-            eer * 100,
-            min_dcf[3],
-            act_dcf[3],
-            min_dcf[2],
-            act_dcf[2],
-            min_dcf[1],
-            act_dcf[1],
-            min_dcf[0],
-            act_dcf[0],
-            ntar,
-            nnon,
-        )
-        f.write(s)
-        logging.info(s)
-        s = "min-pmiss={} min-pfa={} act-pmiss={} act-pfa={}".format(
-            min_pmiss, min_pfa, act_pmiss, act_pfa
-        )
-        logging.info(s)
-        s = "min-Nmiss={} min-Nfa={} act-Nmiss={} act-Nfa={}".format(
-            min_pmiss * ntar, min_pfa * nnon, act_pmiss * ntar, act_pfa * nnon
-        )
-        logging.info(s)
-
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        fromfile_prefix_chars="@",
-        description="Computes EER and DCF",
-    )
-
-    parser.add_argument("--key-file", required=True)
-    parser.add_argument("--score-file", required=True)
-    parser.add_argument("--output-path", required=True)
-    parser.add_argument(
-        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
-    )
-
-    args = parser.parse_args()
-    config_logger(args.verbose)
-    del args.verbose
-    logging.debug(args)
-
-    score_dcf(**vars(args))
diff --git a/egs/voxceleb/v1/local/score_voxceleb1.sh b/egs/voxceleb/v1/local/score_voxceleb1.sh
deleted file mode 100755
index 5d11848d..00000000
--- a/egs/voxceleb/v1/local/score_voxceleb1.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-# Copyright 2020 Johns Hopkins University (Jesus Villalba)
-# Apache 2.0.
-#
-if [ $# -ne 2 ]; then
-  echo "Usage: $0 <data-dir> <score-dir>"
-  exit 1;
-fi
-
-set -e
-
-data_dir=$1
-score_dir=$2
-
-for cond in o o_clean e e_clean h h_clean
-do
-  echo "Voxceleb $cond"
-  key=$data_dir/trials_$cond
-  #Compute performance
-  python local/score_dcf.py --key-file $key --score-file $score_dir/voxceleb1_scores --output-path $score_dir/voxceleb1_${cond} &
-done
-wait
-
diff --git a/egs/voxceleb/v1/steps_be/eval-be-v1.py b/egs/voxceleb/v1/steps_be/eval-be-v1.py
deleted file mode 100755
index c88b05fc..00000000
--- a/egs/voxceleb/v1/steps_be/eval-be-v1.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/usr/bin/env python
-"""
- Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
- Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-
- Evals PLDA LLR
-"""
-
-import sys
-import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
-import logging
-
-import numpy as np
-
-from hyperion.hyp_defs import float_cpu, config_logger
-from hyperion.utils import TrialNdx, TrialScores
-from hyperion.helpers import TrialDataReader as TDR
-from hyperion.helpers import PLDAFactory as F
-from hyperion.transforms import TransformList
-
-
-def eval_plda(
-    iv_file,
-    ndx_file,
-    enroll_file,
-    test_file,
-    preproc_file,
-    model_file,
-    score_file,
-    plda_type,
-    model_part_idx,
-    num_model_parts,
-    seg_part_idx,
-    num_seg_parts,
-    **kwargs
-):
-
-    logging.info("loading data")
-    if preproc_file is not None:
-        preproc = TransformList.load(preproc_file)
-    else:
-        preproc = None
-
-    tdr = TDR(
-        iv_file,
-        ndx_file,
-        enroll_file,
-        test_file,
-        preproc,
-        model_part_idx,
-        num_model_parts,
-        seg_part_idx,
-        num_seg_parts,
-    )
-    x_e, x_t, enroll, ndx = tdr.read()
-
-    logging.info("loading plda model: %s" % (model_file))
-    model = F.load_plda(plda_type, model_file)
-
-    t1 = time.time()
-    logging.info("computing llr")
-    scores = model.llr_1vs1(x_e, x_t)
-
-    dt = time.time() - t1
-    num_trials = len(enroll) * x_t.shape[0]
-    logging.info(
-        "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms."
-        % (dt, dt / num_trials * 1000)
-    )
-
-    if num_model_parts > 1 or num_seg_parts > 1:
-        score_file = "%s-%03d-%03d" % (score_file, model_part_idx, seg_part_idx)
-    logging.info("saving scores to %s" % (score_file))
-    s = TrialScores(enroll, ndx.seg_set, scores, score_mask=ndx.trial_mask)
-    s.save_txt(score_file)
-
-
-if __name__ == "__main__":
-
-    parser = ArgumentParser(description="Eval PLDA")
-
-    parser.add_argument("--iv-file", dest="iv_file", required=True)
-    parser.add_argument("--ndx-file", dest="ndx_file", default=None)
-    parser.add_argument("--enroll-file", dest="enroll_file", required=True)
-    parser.add_argument("--test-file", dest="test_file", default=None)
-    parser.add_argument("--preproc-file", dest="preproc_file", default=None)
-
-    TDR.add_argparse_args(parser)
-    F.add_argparse_eval_args(parser)
-
-    parser.add_argument("--score-file", dest="score_file", required=True)
-    parser.add_argument(
-        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
-    )
-
-    args = parser.parse_args()
-    config_logger(args.verbose)
-    del args.verbose
-    logging.debug(args)
-
-    assert args.test_file is not None or args.ndx_file is not None
-    eval_plda(**namespace_to_dict(args))
diff --git a/egs/voxceleb/v1/steps_be/eval-be-v2.py b/egs/voxceleb/v1/steps_be/eval-be-v2.py
deleted file mode 100755
index 0438e373..00000000
--- a/egs/voxceleb/v1/steps_be/eval-be-v2.py
+++ /dev/null
@@ -1,102 +0,0 @@
-#!/usr/bin/env python
-"""
- Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
- Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-
-"""
-import sys
-import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
-import logging
-
-import numpy as np
-
-from hyperion.hyp_defs import float_cpu, config_logger
-from hyperion.utils import TrialNdx, TrialScores
-from hyperion.utils.math import cosine_scoring
-from hyperion.helpers import TrialDataReader as TDR
-from hyperion.helpers import PLDAFactory as F
-from hyperion.transforms import TransformList
-
-
-def eval_plda(
-    iv_file,
-    ndx_file,
-    enroll_file,
-    test_file,
-    preproc_file,
-    score_file,
-    model_part_idx,
-    num_model_parts,
-    seg_part_idx,
-    num_seg_parts,
-    **kwargs
-):
-
-    logging.info("loading data")
-    if preproc_file is not None:
-        preproc = TransformList.load(preproc_file)
-    else:
-        preproc = None
-
-    tdr = TDR(
-        iv_file,
-        ndx_file,
-        enroll_file,
-        test_file,
-        preproc,
-        model_part_idx,
-        num_model_parts,
-        seg_part_idx,
-        num_seg_parts,
-    )
-    x_e, x_t, enroll, ndx = tdr.read()
-
-    t1 = time.time()
-    logging.info("computing llr")
-    scores = cosine_scoring(x_e, x_t)
-
-    dt = time.time() - t1
-    num_trials = len(enroll) * x_t.shape[0]
-    logging.info(
-        "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms."
-        % (dt, dt / num_trials * 1000)
-    )
-
-    if num_model_parts > 1 or num_seg_parts > 1:
-        score_file = "%s-%03d-%03d" % (score_file, model_part_idx, seg_part_idx)
-    logging.info("saving scores to %s" % (score_file))
-    s = TrialScores(enroll, ndx.seg_set, scores, score_mask=ndx.trial_mask)
-    s.save_txt(score_file)
-
-
-if __name__ == "__main__":
-
-    parser = ArgumentParser(description="Eval cosine-scoring")
-
-    parser.add_argument("--iv-file", dest="iv_file", required=True)
-    parser.add_argument("--ndx-file", dest="ndx_file", default=None)
-    parser.add_argument("--enroll-file", dest="enroll_file", required=True)
-    parser.add_argument("--test-file", dest="test_file", default=None)
-    parser.add_argument("--preproc-file", dest="preproc_file", default=None)
-
-    TDR.add_argparse_args(parser)
-
-    parser.add_argument("--score-file", dest="score_file", required=True)
-    parser.add_argument(
-        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
-    )
-
-    args = parser.parse_args()
-    config_logger(args.verbose)
-    del args.verbose
-    logging.debug(args)
-
-    assert args.test_file is not None or args.ndx_file is not None
-    eval_plda(**namespace_to_dict(args))
diff --git a/egs/voxceleb/v1/steps_be/eval-calibration-v1.py b/egs/voxceleb/v1/steps_be/eval-calibration-v1.py
index bf252f60..fdd5516f 100755
--- a/egs/voxceleb/v1/steps_be/eval-calibration-v1.py
+++ b/egs/voxceleb/v1/steps_be/eval-calibration-v1.py
@@ -23,8 +23,8 @@
 from hyperion.utils.trial_scores import TrialScores
 from hyperion.utils.trial_key import TrialKey
 from hyperion.utils.trial_ndx import TrialNdx
-from hyperion.metrics import compute_act_dcf, compute_min_dcf
-from hyperion.classifiers import BinaryLogisticRegression as LR
+from hyperion.np.metrics import compute_act_dcf, compute_min_dcf
+from hyperion.np.classifiers import BinaryLogisticRegression as LR
 
 
 def eval_calibration(in_score_file, ndx_file, model_file, out_score_file):
diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos.py b/egs/voxceleb/v1/steps_be/eval_be_cos.py
new file mode 100755
index 00000000..a9bc03d1
--- /dev/null
+++ b/egs/voxceleb/v1/steps_be/eval_be_cos.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+"""
+ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""
+import sys
+import os
+from jsonargparse import (
+    ArgumentParser,
+    ActionConfigFile,
+    ActionParser,
+    namespace_to_dict,
+)
+import time
+import logging
+
+import numpy as np
+
+from hyperion.hyp_defs import float_cpu, config_logger
+from hyperion.utils.list_utils import ismember
+from hyperion.utils import TrialNdx, TrialScores
+from hyperion.utils.math_funcs import cosine_scoring
+from hyperion.helpers import TrialDataReader as TDR
+from hyperion.helpers import PLDAFactory as F
+from hyperion.np.transforms import TransformList
+
+
+def eval_cos(
+    v_file,
+    ndx_file,
+    enroll_file,
+    test_file,
+    preproc_file,
+    score_file,
+    model_part_idx,
+    num_model_parts,
+    seg_part_idx,
+    num_seg_parts,
+    **kwargs
+):
+
+    logging.info("loading data")
+    if preproc_file is not None:
+        preproc = TransformList.load(preproc_file)
+    else:
+        preproc = None
+
+    tdr = TDR(
+        v_file,
+        ndx_file,
+        enroll_file,
+        test_file,
+        preproc,
+        model_part_idx,
+        num_model_parts,
+        seg_part_idx,
+        num_seg_parts,
+    )
+    x_e, x_t, enroll, ndx = tdr.read()
+
+    t1 = time.time()
+    logging.info("computing llr %d", x_e.shape[1])
+    scores = cosine_scoring(x_e, x_t)
+
+    dt = time.time() - t1
+    num_trials = len(enroll) * x_t.shape[0]
+    logging.info(
+        "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms."
+        % (dt, dt / num_trials * 1000)
+    )
+
+    if num_model_parts > 1 or num_seg_parts > 1:
+        score_file = "%s-%03d-%03d" % (score_file, model_part_idx, seg_part_idx)
+    logging.info("saving scores to %s" % (score_file))
+    f, loc = ismember(enroll, ndx.model_set)
+    s = TrialScores(enroll, ndx.seg_set, scores, score_mask=ndx.trial_mask[loc])
+    s.save_txt(score_file)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description="Eval cosine-scoring")
+
+    parser.add_argument("--v-file", required=True)
+    parser.add_argument("--ndx-file", default=None)
+    parser.add_argument("--enroll-file", required=True)
+    parser.add_argument("--test-file", default=None)
+    parser.add_argument("--preproc-file", default=None)
+
+    TDR.add_argparse_args(parser)
+
+    parser.add_argument("--score-file", required=True)
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+
+    args = parser.parse_args()
+    config_logger(args.verbose)
+    del args.verbose
+    logging.debug(args)
+
+    assert args.test_file is not None or args.ndx_file is not None
+    eval_cos(**namespace_to_dict(args))
diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos.sh b/egs/voxceleb/v1/steps_be/eval_be_cos.sh
index 90f118af..434732d6 100755
--- a/egs/voxceleb/v1/steps_be/eval_be_cos.sh
+++ b/egs/voxceleb/v1/steps_be/eval_be_cos.sh
@@ -2,13 +2,13 @@
 # Copyright 2020 Johns Hopkins University (Jesus Villalba)
 # Apache 2.0.
 #
-
+set -e
 cmd=run.pl
 num_parts=8
+preproc_file=""
 
 if [ -f path.sh ]; then . ./path.sh; fi
 . parse_options.sh || exit 1;
-set -e
 
 if [ $# -ne 4 ]; then
   echo "Usage: $0 <ndx> <enroll-file> <vector-file> <output-scores>"
@@ -27,6 +27,9 @@ name=$(basename $output_file)
 
 echo "$0 score $ndx_file"
 
+if [ -n "$preproc_file" ];then
+  extra_args="--preproc-file $preproc_file"
+fi
 
 for((i=1;i<=$num_parts;i++));
 do
@@ -34,8 +37,8 @@ do
   do
     $cmd $output_dir/log/${name}_${i}_${j}.log \
       hyp_utils/conda_env.sh \
-      steps_be/eval-be-v2.py \
-      --iv-file scp:$vector_file \
+      steps_be/eval_be_cos.py $extra_args \
+      --v-file scp:$vector_file \
       --ndx-file $ndx_file \
       --enroll-file $enroll_file \
      --score-file $output_file \
diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py
new file mode 100755
index 00000000..bf66d72b
--- /dev/null
+++ b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py
@@ -0,0 +1,233 @@
+#!/usr/bin/env python
+"""
+ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""
+import sys
+import os
+from jsonargparse import (
+    ArgumentParser,
+    ActionConfigFile,
+    ActionParser,
+    namespace_to_dict,
+)
+import time
+import logging
+
+import numpy as np
+
+from hyperion.hyp_defs import float_cpu, config_logger
+from hyperion.utils import TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.math_funcs import cosine_scoring
+from hyperion.np.pdfs import PLDA
+from hyperion.utils.list_utils import ismember
+from hyperion.helpers import TrialDataReader as TDR
+from hyperion.helpers import VectorClassReader as VCR
+from hyperion.np.transforms import TransformList
+from hyperion.np.score_norm import AdaptSNorm as SNorm
+from hyperion.np.classifiers import BinaryLogisticRegression as LR
+
+
+def get_score_filename(score_file, q_name, i, j, p):
+    if q_name is not None:
+        score_file = "%s_%s" % (score_file, q_name)
+
+    if p:
+        score_file = "%s-%03d-%03d" % (score_file, i, j)
+
+    return score_file
+
+
+def save_empty(score_file, q_name, i, j, p):
+    score_file = get_score_filename(score_file, q_name, i, j, p)
+    logging.info("saving scores to %s", score_file)
+    with open(score_file, "w") as f:
+        pass
+
+
+def save_scores(s, score_file, q_name, i, j, p):
+    score_file = get_score_filename(score_file, q_name, i, j, p)
+    logging.info("saving scores to %s", score_file)
+    s.save_txt(score_file)
+
+
+def print_q_stats(scores, name):
+    s = f"{name} stats mean={np.mean(scores)} min={np.min(scores)} max={np.max(scores)} median={np.median(scores)}"
+    logging.info(s)
+
+
+def eval_plda(
+    v_file,
+    ndx_file,
+    enroll_file,
+    num_frames_file,
+    coh_file,
+    coh_v_file,
+    score_file,
+    qmf_file,
+    preproc_file,
+    model_part_idx,
+    num_model_parts,
+    seg_part_idx,
+    num_seg_parts,
+    coh_nbest,
+    **kwargs,
+):
+
+    if preproc_file is not None:
+        preproc = TransformList.load(preproc_file)
+    else:
+        preproc = None
+
+    logging.info("loading data")
+    tdr = TDR(
+        v_file,
+        ndx_file,
+        enroll_file,
+        None,
+        preproc,
+        model_part_idx,
+        num_model_parts,
+        seg_part_idx,
+        num_seg_parts,
+    )
+    logging.info("read x-vectors and ndx")
+    x_e, x_t, enroll, ndx = tdr.read()
+    enroll_segs = tdr.enroll.key
+
+    parallel = num_model_parts > 1 or num_seg_parts > 1
+
+    if not np.any(ndx.trial_mask):
+        save_empty(score_file, None, model_part_idx, seg_part_idx, parallel)
+        save_empty(score_file, "snorm", model_part_idx, seg_part_idx, parallel)
+        if qmf_file is None:
+            for q_name in ["snorm", "maxnf", "minnf", "maxcohmu", "mincohmu"]:
+                save_empty(score_file, q_name, model_part_idx, seg_part_idx, parallel)
+        else:
+            save_empty(score_file, "qmf", model_part_idx, seg_part_idx, parallel)
+        return
+
+    logging.info("read num_frames")
+    u2nf = Utt2Info.load(num_frames_file)
+    min_dur = 0.1
+    max_dur = 30.0
+
+    enroll_nf = np.log(
+        np.clip(
+            u2nf.filter(enroll_segs).info.astype(float) / 100,
+            a_min=min_dur,
+            a_max=max_dur,
+        )
+    )
+    test_nf = np.log(
+        np.clip(
+            u2nf.filter(ndx.seg_set).info.astype(float) / 100,
+            a_min=min_dur,
+            a_max=max_dur,
+        )
+    )
+    log_min_dur = np.log(min_dur)
+    log_max_dur = np.log(max_dur)
+    enroll_nf = (enroll_nf - log_min_dur) / (log_max_dur - log_min_dur)
+    test_nf = (test_nf - log_min_dur) / (log_max_dur - log_min_dur)
+
+    t1 = time.time()
+    logging.info("computing llr")
+    scores = cosine_scoring(x_e, x_t)
+
+    logging.info("read cohort x-vectors")
+    vcr = VCR(coh_v_file, coh_file, preproc=preproc)
+    x_coh, ids_coh = vcr.read()
+    D_coh = PLDA.compute_stats_hard(x_coh, class_ids=ids_coh)
+    x_coh = D_coh[1] / np.expand_dims(D_coh[0], axis=-1)
+
+    t2 = time.time()
+    logging.info("score cohort vs test")
+    scores_coh_test = cosine_scoring(x_coh, x_t)
+    logging.info("score enroll vs cohort")
+    scores_enr_coh = cosine_scoring(x_e, x_coh)
+
+    dt = time.time() - t2
+    logging.info("cohort-scoring elapsed time: %.2f s.", dt)
+
+    t2 = time.time()
+    logging.info("apply s-norm")
+    snorm = SNorm(nbest=coh_nbest, nbest_sel_method="highest-other-side")
+    scores_norm, mu_z, s_z, mu_t, s_t = snorm(
+        scores, scores_coh_test, scores_enr_coh, return_stats=True
+    )
+    mu_z = mu_z / s_z
+    mu_t = mu_t / s_t
+
+    dt = time.time() - t1
+    num_trials = len(enroll) * x_t.shape[0]
+    logging.info(
+        "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.",
+        dt,
+        dt / num_trials * 1000,
+    )
+
+    q_measures = {
+        "maxnf": np.maximum(enroll_nf[:, None], test_nf[None, :]),
+        "minnf": np.minimum(enroll_nf[:, None], test_nf[None, :]),
+        "maxcohmu": np.maximum(mu_z, mu_t),
+        "mincohmu": np.minimum(mu_z, mu_t),
+    }
+    for k, v in q_measures.items():
+        print_q_stats(v, k)
+
+    f, loc = ismember(enroll, ndx.model_set)
+    trial_mask = ndx.trial_mask[loc]
+    s = TrialScores(enroll, ndx.seg_set, scores, score_mask=trial_mask)
+    save_scores(s, score_file, None, model_part_idx, seg_part_idx, parallel)
+    s.scores = scores_norm
+    save_scores(s, score_file, "snorm", model_part_idx, seg_part_idx, parallel)
+    if qmf_file is None:
+        for q_name in ["maxnf", "minnf", "maxcohmu", "mincohmu"]:
+            s.scores = q_measures[q_name]
+            save_scores(s, score_file, q_name, model_part_idx, seg_part_idx, parallel)
+
+        return
+
+    logging.info("applying qmf")
+    # scores_fus = [scores.ravel()]
+    scores_fus = [scores_norm.ravel()]
+    for q_name in ["maxnf", "minnf", "maxcohmu", "mincohmu"]:
+        scores_fus.append(q_measures[q_name].ravel())
+
+    scores_fus = np.vstack(scores_fus).T
+    lr = LR.load(qmf_file)
+    scores_fus = lr.predict(scores_fus)
+    scores_fus = np.reshape(scores_fus, (s.num_models, s.num_tests))
+    s.scores = scores_fus
+    save_scores(s, score_file, "qmf", model_part_idx, seg_part_idx, parallel)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description="Eval cosine-scoring with QMF")
+
+    parser.add_argument("--v-file", required=True)
+    parser.add_argument("--ndx-file", default=None)
+    parser.add_argument("--enroll-file", required=True)
+    parser.add_argument("--num-frames-file", required=True)
+    parser.add_argument("--coh-v-file", required=True)
+    parser.add_argument("--coh-file", required=True)
+    parser.add_argument("--coh-nbest", type=int, default=400)
+    parser.add_argument("--qmf-file", default=None)
+    parser.add_argument("--preproc-file", default=None)
+
+    TDR.add_argparse_args(parser)
+
+    parser.add_argument("--score-file", required=True)
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+
+    args = parser.parse_args()
+    config_logger(args.verbose)
+    del args.verbose
+    logging.debug(args)
+
+    eval_plda(**namespace_to_dict(args))
diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh
new file mode 100755
index 00000000..a0712304
--- /dev/null
+++ b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+# Copyright 2020 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0.
+#
+set -e
+cmd=run.pl
+stage=1
+num_parts=16
+coh_nbest=1000
+preproc_file=""
+
+if [ -f path.sh ]; then . ./path.sh; fi
parse_options.sh || exit 1; +set -e + +if [ $# -ne 8 ]; then + echo "Usage: $0 " + exit 1; +fi + +ndx_file=$1 +enroll_file=$2 +vector_file=$3 +nf_file=$4 +coh_file=$5 +coh_v_file=$6 +qmf_file=$7 +output_file=$8 + +output_dir=$(dirname $output_file) + +mkdir -p $output_dir/log +name=$(basename $output_file) + +echo "$0 score $ndx_file" + +if [ -n "$preproc_file" ];then + extra_args="--preproc-file $preproc_file" +fi + +if [ $stage -le 1 ];then + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $cmd $output_dir/log/${name}_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_cos_qmf.py $extra_args \ + --v-file scp:$vector_file \ + --ndx-file $ndx_file \ + --enroll-file $enroll_file \ + --score-file $output_file \ + --num-frames-file $nf_file \ + --coh-v-file scp:$coh_v_file \ + --coh-file $coh_file \ + --coh-nbest $coh_nbest \ + --qmf-file $qmf_file \ + --model-part-idx $i --num-model-parts $num_parts \ + --seg-part-idx $j --num-seg-parts $num_parts & + done + done + wait +fi + + +if [ $stage -le 2 ];then + for suffix in "" _snorm _qmf + do + output_file_k=${output_file}${suffix} + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + cat $output_file_k-$(printf "%03d" $i)-$(printf "%03d" $j) + done + done | sort -u > $output_file_k + done +fi + + diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py new file mode 100755 index 00000000..0eca769d --- /dev/null +++ b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np + +from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.utils.list_utils import ismember +from hyperion.utils import TrialNdx, TrialScores +from hyperion.utils.math_funcs import cosine_scoring +from hyperion.helpers import TrialDataReader as TDR +from hyperion.helpers import PLDAFactory as F +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm +from hyperion.utils import SegmentSet +from hyperion.io import RandomAccessDataReaderFactory as DRF + + +def eval_plda( + v_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + score_file, + coh_v_file, + coh_file, + coh_nbest, + model_part_idx, + num_model_parts, + seg_part_idx, + num_seg_parts, + **kwargs +): + + logging.info("loading data") + if preproc_file is not None: + preproc = TransformList.load(preproc_file) + else: + preproc = None + + tdr = TDR( + v_file, + ndx_file, + enroll_file, + test_file, + preproc, + model_part_idx, + num_model_parts, + seg_part_idx, + num_seg_parts, + ) + x_e, x_t, enroll, ndx = tdr.read() + + coh_segs = SegmentSet.load(coh_file) + r = DRF.create(coh_v_file) + x_coh = r.read(coh_segs["id"], squeeze=True) + if preproc is not None: + x_coh = preproc(x_coh) + _, spk_ids = np.unique(coh_segs["class_id"], return_inverse=True) + num_coh_spks = np.max(spk_ids) + 1 + x_coh_spk = np.zeros((num_coh_spks, x_coh.shape[1])) + for i in range(num_coh_spks): + idx = spk_ids == i + x_coh_spk[i] = np.mean(x_coh[idx], axis=0) + + t1 = time.time() + logging.info("computing llr") + scores = cosine_scoring(x_e, x_t) + + logging.info("computing enroll vs cohort") + scores_enr_coh = cosine_scoring(x_e, 
x_coh_spk) + logging.info("computing cohort vs test") + scores_coh_test = cosine_scoring(x_coh_spk, x_t) + + snorm = AdaptSNorm(coh_nbest) + scores = snorm(scores, scores_coh_test, scores_enr_coh) + + dt = time.time() - t1 + num_trials = len(enroll) * x_t.shape[0] + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + if num_model_parts > 1 or num_seg_parts > 1: + score_file = "%s-%03d-%03d" % (score_file, model_part_idx, seg_part_idx) + logging.info("saving scores to %s" % (score_file)) + f, loc = ismember(enroll, ndx.model_set) + s = TrialScores(enroll, ndx.seg_set, scores, score_mask=ndx.trial_mask[loc]) + s.save_txt(score_file) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Eval cosine-scoring with adaptive s-norm") + + parser.add_argument("--v-file", required=True) + parser.add_argument("--ndx-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--test-file", default=None) + parser.add_argument("--preproc-file", default=None) + + TDR.add_argparse_args(parser) + + parser.add_argument("--coh-v-file", required=True) + parser.add_argument("--coh-file", required=True) + parser.add_argument("--coh-nbest", type=int, default=1000) + + parser.add_argument("--score-file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + assert args.test_file is not None or args.ndx_file is not None + eval_plda(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.sh b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.sh new file mode 100755 index 00000000..b64d80a3 --- /dev/null +++ b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# Copyright 2020 Johns Hopkins University (Jesus Villalba) +# Apache 2.0. +# + +cmd=run.pl +num_parts=16 +coh_nbest=1000 +preproc_file="" + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; +set -e + +if [ $# -ne 6 ]; then + echo "Usage: $0 " + exit 1; +fi + +ndx_file=$1 +enroll_file=$2 +vector_file=$3 +coh_file=$4 +coh_vector_file=$5 +output_file=$6 + +output_dir=$(dirname $output_file) + +mkdir -p $output_dir/log +name=$(basename $output_file) + +echo "$0 score $ndx_file" + +if [ -n "$preproc_file" ];then + extra_args="--preproc-file $preproc_file" +fi + +for((i=1;i<=$num_parts;i++)); +do + for((j=1;j<=$num_parts;j++)); + do + $cmd $output_dir/log/${name}_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_cos_snorm.py $extra_args \ + --v-file scp:$vector_file \ + --ndx-file $ndx_file \ + --enroll-file $enroll_file \ + --coh-file $coh_file \ + --coh-v-file scp:$coh_vector_file \ + --score-file $output_file \ + --coh-nbest $coh_nbest \ + --model-part-idx $i --num-model-parts $num_parts \ + --seg-part-idx $j --num-seg-parts $num_parts & + sleep 1s + done +done +wait + + +for((i=1;i<=$num_parts;i++)); +do + for((j=1;j<=$num_parts;j++)); + do + cat $output_file-$(printf "%03d" $i)-$(printf "%03d" $j) + done +done | sort -u > $output_file + + + diff --git a/egs/voxceleb/v1/steps_be/eval_be_plda_v1.py b/egs/voxceleb/v1/steps_be/eval_be_plda_v1.py new file mode 100755 index 00000000..da77f8f3 --- /dev/null +++ b/egs/voxceleb/v1/steps_be/eval_be_plda_v1.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + + Evals PLDA LLR +""" + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np + +from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.utils.list_utils import ismember +from hyperion.utils import TrialNdx, TrialScores +from hyperion.helpers import TrialDataReader as TDR +from hyperion.helpers import PLDAFactory as F +from hyperion.np.transforms import TransformList + + +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + model_file, + score_file, + plda_type, + model_part_idx, + num_model_parts, + seg_part_idx, + num_seg_parts, + **kwargs +): + + logging.info("loading data") + if preproc_file is not None: + preproc = TransformList.load(preproc_file) + else: + preproc = None + + tdr = TDR( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc, + model_part_idx, + num_model_parts, + seg_part_idx, + num_seg_parts, + ) + x_e, x_t, enroll, ndx = tdr.read() + + logging.info("loading plda model: %s" % (model_file)) + model = F.load_plda(plda_type, model_file) + + t1 = time.time() + logging.info("computing llr") + scores = model.llr_1vs1(x_e, x_t) + + dt = time.time() - t1 + num_trials = len(enroll) * x_t.shape[0] + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." 
+        % (dt, dt / num_trials * 1000)
+    )
+
+    if num_model_parts > 1 or num_seg_parts > 1:
+        score_file = "%s-%03d-%03d" % (score_file, model_part_idx, seg_part_idx)
+    logging.info("saving scores to %s" % (score_file))
+    f, loc = ismember(enroll, ndx.model_set)
+    s = TrialScores(enroll, ndx.seg_set, scores, score_mask=ndx.trial_mask[loc])
+    s.save_txt(score_file)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description="Eval PLDA")
+
+    parser.add_argument("--v-file", dest="iv_file", required=True)
+    parser.add_argument("--ndx-file", dest="ndx_file", default=None)
+    parser.add_argument("--enroll-file", dest="enroll_file", required=True)
+    parser.add_argument("--test-file", dest="test_file", default=None)
+    parser.add_argument("--preproc-file", dest="preproc_file", default=None)
+
+    TDR.add_argparse_args(parser)
+    F.add_argparse_eval_args(parser)
+
+    parser.add_argument("--score-file", dest="score_file", required=True)
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+
+    args = parser.parse_args()
+    config_logger(args.verbose)
+    del args.verbose
+    logging.debug(args)
+
+    assert args.test_file is not None or args.ndx_file is not None
+    eval_plda(**namespace_to_dict(args))
diff --git a/egs/voxceleb/v1/steps_be/eval_be_plda_v1.sh b/egs/voxceleb/v1/steps_be/eval_be_plda_v1.sh
new file mode 100755
index 00000000..69d6ace1
--- /dev/null
+++ b/egs/voxceleb/v1/steps_be/eval_be_plda_v1.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Copyright 2018 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0.
+#
+
+cmd=run.pl
+plda_type=frplda
+num_parts=8
+stage=1
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+set -e
+
+if [ $# -ne 6 ]; then
+  echo "Usage: $0 <ndx-file> <enroll-file> <vector-file> <preproc-file> <plda-file> <output-scores>"
+  exit 1;
+fi
+
+ndx_file=$1
+enroll_file=$2
+vector_file=$3
+preproc_file=$4
+plda_file=$5
+output_file=$6
+
+output_dir=$(dirname $output_file)
+
+mkdir -p $output_dir/log
+name=$(basename $output_file)
+
+echo "$0 score $ndx_file"
+
+for((i=1;i<=$num_parts;i++));
+do
+  for((j=1;j<=$num_parts;j++));
+  do
+    $cmd $output_dir/log/${name}_${i}_${j}.log \
+      hyp_utils/conda_env.sh \
+      steps_be/eval_be_plda_v1.py \
+      --v-file scp:$vector_file \
+      --ndx-file $ndx_file \
+      --enroll-file $enroll_file \
+      --preproc-file $preproc_file \
+      --model-file $plda_file \
+      --plda-type $plda_type \
+      --score-file $output_file \
+      --model-part-idx $i --num-model-parts $num_parts \
+      --seg-part-idx $j --num-seg-parts $num_parts &
+  done
+done
+wait


+for((i=1;i<=$num_parts;i++));
+do
+  for((j=1;j<=$num_parts;j++));
+  do
+    cat $output_file-$(printf "%03d" $i)-$(printf "%03d" $j)
+  done
+done | sort -u > $output_file
+
+
+
diff --git a/egs/voxceleb/v1/steps_be/eval_be_v1.sh b/egs/voxceleb/v1/steps_be/eval_be_v1.sh
deleted file mode 100755
index eefc989f..00000000
--- a/egs/voxceleb/v1/steps_be/eval_be_v1.sh
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/bin/bash
-# Copyright 2018 Johns Hopkins University (Jesus Villalba)
-# Apache 2.0.
-#
-
-cmd=run.pl
-plda_type=frplda
-num_parts=8
-stage=1
-if [ -f path.sh ]; then . ./path.sh; fi
-. 
parse_options.sh || exit 1; -set -e - -if [ $# -ne 6 ]; then - echo "Usage: $0 " - exit 1; -fi - -ndx_file=$1 -enroll_file=$2 -vector_file=$3 -preproc_file=$4 -plda_file=$5 -output_file=$6 - -output_dir=$(dirname $output_file) - -mkdir -p $output_dir/log -name=$(basename $output_file) - -echo "$0 score $ndx_file" - -for((i=1;i<=$num_parts;i++)); -do - for((j=1;j<=$num_parts;j++)); - do - $cmd $output_dir/log/${name}_${i}_${j}.log \ - hyp_utils/conda_env.sh \ - steps_be/eval-be-v1.py \ - --iv-file scp:$vector_file \ - --ndx-file $ndx_file \ - --enroll-file $enroll_file \ - --preproc-file $preproc_file \ - --model-file $plda_file \ - --plda-type $plda_type \ - --score-file $output_file \ - --model-part-idx $i --num-model-parts $num_parts \ - --seg-part-idx $j --num-seg-parts $num_parts & - done -done -wait - - -for((i=1;i<=$num_parts;i++)); -do - for((j=1;j<=$num_parts;j++)); - do - cat $output_file-$(printf "%03d" $i)-$(printf "%03d" $j) - done -done | sort -u > $output_file - - - diff --git a/egs/voxceleb/v1/steps_be/eval_be_v2.sh b/egs/voxceleb/v1/steps_be/eval_be_v2.sh index 7389bf2c..bb58872e 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_v2.sh +++ b/egs/voxceleb/v1/steps_be/eval_be_v2.sh @@ -36,7 +36,7 @@ do $cmd $output_dir/log/${name}_${i}_${j}.log \ hyp_utils/conda_env.sh \ steps_be/eval-be-v2.py \ - --iv-file scp:$vector_file \ + --v-file scp:$vector_file \ --ndx-file $ndx_file \ --enroll-file $enroll_file \ --preproc-file $preproc_file \ diff --git a/egs/voxceleb/v1/steps_be/train-be-v1.py b/egs/voxceleb/v1/steps_be/train-be-v1.py deleted file mode 100755 index a1e6fa7e..00000000 --- a/egs/voxceleb/v1/steps_be/train-be-v1.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python -""" Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import logging -import sys -import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time - -import numpy as np - -from hyperion.hyp_defs import config_logger -from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList, LDA, LNorm, PCA -from hyperion.helpers import PLDAFactory as F - - -def remove_outlayers(x, class_ids): - bound = 100 - idx = np.all(np.abs(x) < bound, axis=1) - x = x[idx] - class_ids = class_ids[idx] - return x, class_idx - - -def train_be( - iv_file, - train_list, - lda_dim, - plda_type, - y_dim, - z_dim, - epochs, - ml_md, - md_epochs, - output_path, - **kwargs -): - - # Read data - vcr_args = VCR.filter_args(**kwargs) - vcr_train = VCR(iv_file, train_list, None, **vcr_args) - x, class_ids = vcr_train.read() - del vcr_train - - t1 = time.time() - # x, class_ids = remove_outlayers(x, class_ids) - rank = PCA.get_pca_dim_for_var_ratio(x, var_r=1) - pca = None - if rank < x.shape[1]: - # do PCA if rank of x is smaller than its dimension - pca = PCA(pca_dim=rank, name="pca") - pca.fit(x) - x = pca.predict(x) - if lda_dim > rank: - lda_dim = rank - if y_dim > rank: - y_dim = rank - logging.info("PCA rank=%d" % (rank)) - - # Train LDA - lda = LDA(lda_dim=lda_dim, name="lda") - lda.fit(x, class_ids) - - x_lda = lda.predict(x) - logging.info("PCA-LDA Elapsed time: %.2f s." % (time.time() - t1)) - - # Train centering and whitening - t1 = time.time() - lnorm = LNorm(name="lnorm") - lnorm.fit(x_lda) - - x_ln = lnorm.predict(x_lda) - logging.info("LNorm Elapsed time: %.2f s." 
% (time.time() - t1)) - - # Train PLDA - t1 = time.time() - - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") - elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) - - logging.info("PLDA Elapsed time: %.2f s." % (time.time() - t1)) - - # Save models - if pca is None: - preproc = TransformList([lda, lnorm]) - else: - preproc = TransformList([pca, lda, lnorm]) - - if not os.path.exists(output_path): - os.makedirs(output_path) - - preproc.save(output_path + "/lda_lnorm.h5") - plda.save(output_path + "/plda.h5") - - num = np.arange(epochs) - elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") - - -if __name__ == "__main__": - - parser = ArgumentParser(description="Train Back-end") - - parser.add_argument("--iv-file", dest="iv_file", required=True) - parser.add_argument("--train-list", dest="train_list", required=True) - - VCR.add_argparse_args(parser) - F.add_argparse_train_args(parser) - - parser.add_argument("--output-path", dest="output_path", required=True) - parser.add_argument("--lda-dim", dest="lda_dim", type=int, default=None) - - parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) - - args = parser.parse_args() - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - train_be(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1/steps_be/train-be-v2.py b/egs/voxceleb/v1/steps_be/train-be-v2.py deleted file mode 100755 index 1d72df93..00000000 --- a/egs/voxceleb/v1/steps_be/train-be-v2.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import logging -import sys -import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time - -import numpy as np - -from hyperion.hyp_defs import config_logger -from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, CentWhiten, PCA - -from numpy.linalg import matrix_rank - - -def train_be(iv_file, train_list, output_path, **kwargs): - - # Read data - vr_args = VR.filter_args(**kwargs) - vr_train = VR(iv_file, train_list, None, **vr_args) - x = vr_train.read() - del vr_train - - t1 = time.time() - rank = matrix_rank(x) - pca = None - if rank < x.shape[1]: - # do PCA if rank of x is smaller than its dimension - pca = PCA(pca_dim=rank, name="pca") - pca.fit(x) - x = pca.predict(x) - logging.info("PCA rank=%d" % (rank)) - - # Train centering and whitening - t1 = time.time() - cw = CentWhiten(name="cw") - cw.fit(x) - - logging.info("PCA-CW Elapsed time: %.2f s." 
% (time.time() - t1)) - - # Save models - if pca is None: - preproc = TransformList([cw]) - else: - preproc = TransformList([pca, cw]) - - if not os.path.exists(output_path): - os.makedirs(ouput_path) - - preproc.save(output_path + "/cw.h5") - - -if __name__ == "__main__": - - parser = ArgumentParser(description="Train Back-end") - - parser.add_argument("--iv-file", dest="iv_file", required=True) - parser.add_argument("--train-list", dest="train_list", required=True) - - VR.add_argparse_args(parser) - - parser.add_argument("--output-path", dest="output_path", required=True) - parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) - - args = parser.parse_args() - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - train_be(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1/steps_be/train-calibration-v1.py b/egs/voxceleb/v1/steps_be/train-calibration-v1.py index 7408fd1d..489ceed9 100755 --- a/egs/voxceleb/v1/steps_be/train-calibration-v1.py +++ b/egs/voxceleb/v1/steps_be/train-calibration-v1.py @@ -22,8 +22,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): diff --git a/egs/voxceleb/v1/steps_be/train_be_cos_qmf.sh b/egs/voxceleb/v1/steps_be/train_be_cos_qmf.sh new file mode 100755 index 00000000..267466ae --- /dev/null +++ b/egs/voxceleb/v1/steps_be/train_be_cos_qmf.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# Copyright 2020 Johns Hopkins University (Jesus Villalba) +# Apache 2.0. +# +set -e +cmd=run.pl +stage=1 +num_parts=8 +coh_nbest=400 + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; +set -e + +if [ $# -ne 7 ]; then + echo "Usage: $0 " + exit 1; +fi + +ndx_file=$1 +enroll_file=$2 +vector_file=$3 +nf_file=$4 +coh_file=$5 +coh_v_file=$6 +output_file=$7 + +output_dir=$(dirname $output_file) + +mkdir -p $output_dir/log +name=$(basename $output_file) + +echo "$0 score $ndx_file" + +if [ $stage -le 1 ];then + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $cmd $output_dir/log/${name}_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_cos_qmf.py \ + --v-file scp:$vector_file \ + --ndx-file $ndx_file \ + --enroll-file $enroll_file \ + --score-file $output_file \ + --num-frames-file $nf_file \ + --coh-v-file scp:$coh_v_file \ + --coh-file $coh_file \ + --coh-nbest $coh_nbest \ + --model-part-idx $i --num-model-parts $num_parts \ + --seg-part-idx $j --num-seg-parts $num_parts & + done + done + wait +fi + +if [ $stage -le 2 ];then + for suffix in "" _maxnf _minnf _maxcohmu _mincohmu _snorm + do + output_file_k=${output_file}${suffix} + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + cat $output_file_k-$(printf "%03d" $i)-$(printf "%03d" $j) + done + done | sort -u > $output_file_k + done +fi + +if [ $stage -le 3 ];then + $cmd $output_dir/log/train_qmf_${name}.log \ + hyp_utils/conda_env.sh \ + steps_be/train_qmf.py \ + --score-file $output_file \ + --key-file $ndx_file \ + --model-file $output_dir/qmf.h5 +fi + + diff --git a/egs/voxceleb/v1/steps_be/train_be_plda_v1.py b/egs/voxceleb/v1/steps_be/train_be_plda_v1.py new file mode 100755 index 00000000..ea8cf867 --- /dev/null +++ b/egs/voxceleb/v1/steps_be/train_be_plda_v1.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +""" Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.helpers import VectorClassReader as VCR +from hyperion.np.transforms import TransformList, LDA, LNorm, PCA +from hyperion.helpers import PLDAFactory as F + + +def remove_outlayers(x, class_ids): + bound = 100 + idx = np.all(np.abs(x) < bound, axis=1) + x = x[idx] + class_ids = class_ids[idx] + return x, class_idx + + +def train_be( + iv_file, + train_list, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + output_path, + **kwargs +): + + # Read data + vcr_args = VCR.filter_args(**kwargs) + vcr_train = VCR(iv_file, train_list, None, **vcr_args) + x, class_ids = vcr_train.read() + del vcr_train + + t1 = time.time() + # x, class_ids = remove_outlayers(x, class_ids) + rank = PCA.get_pca_dim_for_var_ratio(x, var_r=1) + pca = None + if rank < x.shape[1]: + # do PCA if rank of x is smaller than its dimension + pca = PCA(pca_dim=rank, name="pca") + pca.fit(x) + x = pca.predict(x) + if lda_dim > rank: + lda_dim = rank + if y_dim > rank: + y_dim = rank + logging.info("PCA rank=%d" % (rank)) + + # Train LDA + lda = LDA(lda_dim=lda_dim, name="lda") + lda.fit(x, class_ids) + + x_lda = lda.predict(x) + logging.info("PCA-LDA Elapsed time: %.2f s." % (time.time() - t1)) + + # Train centering and whitening + t1 = time.time() + lnorm = LNorm(name="lnorm") + lnorm.fit(x_lda) + + x_ln = lnorm.predict(x_lda) + logging.info("LNorm Elapsed time: %.2f s." 
% (time.time() - t1)) + + # Train PLDA + t1 = time.time() + + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + + logging.info("PLDA Elapsed time: %.2f s." % (time.time() - t1)) + + # Save models + if pca is None: + preproc = TransformList([lda, lnorm]) + else: + preproc = TransformList([pca, lda, lnorm]) + + if not os.path.exists(output_path): + os.makedirs(output_path) + + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") + + num = np.arange(epochs) + elbo = np.vstack((num, elbo)).T + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Train Back-end") + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + + VCR.add_argparse_args(parser) + F.add_argparse_train_args(parser) + + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--lda-dim", dest="lda_dim", type=int, default=None) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + train_be(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1/steps_be/train_be_plda_v1.sh b/egs/voxceleb/v1/steps_be/train_be_plda_v1.sh new file mode 100755 index 00000000..ee5f8163 --- /dev/null +++ b/egs/voxceleb/v1/steps_be/train_be_plda_v1.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Jesus Villalba) +# Apache 2.0. +# +cmd=run.pl +lda_dim=150 +plda_type=frplda +y_dim=100 +z_dim=150 + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +set -e + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + exit 1; +fi + +vector_file=$1 +data_dir=$2 +output_dir=$3 + +mkdir -p $output_dir/log + +for f in utt2spk; do + if [ ! 
-f $data_dir/$f ]; then
+    echo "$0: no such file $data_dir/$f"
+    exit 1;
+  fi
+done
+
+train_list=$output_dir/train_utt2spk
+
+#filter out the utterances that didn't get an x-vector (empty utts)
+awk -v fv=$vector_file 'BEGIN{
+while(getline < fv)
+{
+  files[$1]=1
+}
+}
+{ if ($1 in files) {print $1,$2}}' $data_dir/utt2spk > $train_list
+
+
+$cmd $output_dir/log/train_be.log \
+    hyp_utils/conda_env.sh \
+    steps_be/train_be_plda_v1.py \
+    --iv-file scp:$vector_file \
+    --train-list $train_list \
+    --lda-dim $lda_dim \
+    --plda-type $plda_type \
+    --y-dim $y_dim --z-dim $z_dim \
+    --output-path $output_dir
+
+
+
diff --git a/egs/voxceleb/v1/steps_be/train_be_proj_v1.py b/egs/voxceleb/v1/steps_be/train_be_proj_v1.py
new file mode 100755
index 00000000..24a2a33b
--- /dev/null
+++ b/egs/voxceleb/v1/steps_be/train_be_proj_v1.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+"""
+ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+import sys
+import os
+from jsonargparse import (
+    ArgumentParser,
+    ActionConfigFile,
+    ActionParser,
+    namespace_to_dict,
+)
+import time
+
+import numpy as np
+
+from hyperion.hyp_defs import config_logger
+from hyperion.helpers import VectorReader as VR
+from hyperion.np.transforms import TransformList, CentWhiten, PCA, LNorm
+
+# from numpy.linalg import matrix_rank
+
+
+def train_be_lda(v_file, train_list, output_path, pca, **kwargs):
+    from hyperion.helpers import VectorClassReader as VCR
+    from hyperion.np.transforms import LDA, LNorm
+    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+
+    # Read data
+    vr_args = VCR.filter_args(**kwargs)
+    vr_train = VCR(v_file, train_list, None, **vr_args)
+    x, ids = vr_train.read()
+    del vr_train
+
+    t1 = time.time()
+    lnorm = LNorm()
+    x = lnorm(x)
+    _, ids = np.unique(ids, return_inverse=True)
+    pca = LDA(lda_dim=90)
+    pca.fit(x, ids)
+    logging.info("LDA elapsed time: %.2f s." % (time.time() - t1))
+
+    # Save models
+    preproc = TransformList([lnorm, pca])
+
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+
+    preproc.save(output_path + "/preproc.h5")
+
+
+def train_be(v_file, train_list, output_path, pca, **kwargs):
+
+    # Read data
+    vr_args = VR.filter_args(**kwargs)
+    vr_train = VR(v_file, train_list, None, **vr_args)
+    x = vr_train.read()
+    del vr_train
+
+    t1 = time.time()
+    pca = PCA(**pca)
+    pca.fit(x)
+    logging.info("PCA dimension=%d", pca.pca_dim)
+    logging.info("PCA elapsed time: %.2f s." % (time.time() - t1))
+
+    # Save models
+    preproc = TransformList([pca])
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+
+    preproc.save(output_path + "/preproc.h5")
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description="Train Back-end")
+
+    parser.add_argument("--v-file", required=True)
+    parser.add_argument("--train-list", required=True)
+
+    VR.add_argparse_args(parser)
+    PCA.add_class_args(parser, prefix="pca")
+    parser.add_argument("--output-path", required=True)
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+
+    args = parser.parse_args()
+    config_logger(args.verbose)
+    del args.verbose
+    logging.debug(args)
+
+    train_be(**namespace_to_dict(args))
diff --git a/egs/voxceleb/v1/steps_be/train_be_proj_v1.sh b/egs/voxceleb/v1/steps_be/train_be_proj_v1.sh
new file mode 100755
index 00000000..7d1be89d
--- /dev/null
+++ b/egs/voxceleb/v1/steps_be/train_be_proj_v1.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Copyright 2018 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0.
+#
+cmd=run.pl
+pca_var_r=0.90
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+set -e
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 <vector-file> <data-dir> <output-dir>"
+  exit 1;
+fi
+
+vector_file=$1
+data_dir=$2
+output_dir=$3
+
+mkdir -p $output_dir/log
+
+for f in utt2spk; do
+  if [ ! -f $data_dir/$f ]; then
+    echo "$0: no such file $data_dir/$f"
+    exit 1;
+  fi
+done
+
+train_list=$output_dir/train_utt2spk
+
+#filter out the utterances that didn't get an x-vector (empty utts)
+awk -v fv=$vector_file 'BEGIN{
+while(getline < fv)
+{
+  files[$1]=1
+}
+}
+{ if ($1 in files) {print $1,$2}}' $data_dir/utt2spk > $train_list
+
+
+$cmd $output_dir/log/train_be.log \
+    hyp_utils/conda_env.sh \
+    steps_be/train_be_proj_v1.py \
+    --v-file scp:$vector_file \
+    --train-list $train_list \
+    --output-path $output_dir
+
+
+
+
diff --git a/egs/voxceleb/v1/steps_be/train_be_v1.sh b/egs/voxceleb/v1/steps_be/train_be_v1.sh
deleted file mode 100755
index 68e470ff..00000000
--- a/egs/voxceleb/v1/steps_be/train_be_v1.sh
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/bin/bash
-# Copyright 2018 Johns Hopkins University (Jesus Villalba)
-# Apache 2.0.
-#
-cmd=run.pl
-lda_dim=150
-plda_type=frplda
-y_dim=100
-z_dim=150
-
-if [ -f path.sh ]; then . ./path.sh; fi
-. parse_options.sh || exit 1;
-set -e
-
-if [ $# -ne 3 ]; then
-  echo "Usage: $0 "
-  exit 1;
-fi
-
-vector_file=$1
-data_dir=$2
-output_dir=$3
-
-mkdir -p $output_dir/log
-
-for f in utt2spk; do
-  if [ ! -f $data_dir/$f ]; then
-    echo "$0: no such file $data_dir/$f"
-    exit 1;
-  fi
-done
-
-train_list=$output_dir/train_utt2spk
-
-#filter out the utterances that didn't got an x-vector (empty utts)
-awk -v fv=$vector_file 'BEGIN{
-while(getline < fv)
-{
-  files[$1]=1
-}
-}
-{ if ($1 in files) {print $1,$2}}' $data_dir/utt2spk > $train_list
-
-
-$cmd $output_dir/log/train_be.log \
-    hyp_utils/conda_env.sh \
-    steps_be/train-be-v1.py \
-    --iv-file scp:$vector_file \
-    --train-list $train_list \
-    --lda-dim $lda_dim \
-    --plda-type $plda_type \
-    --y-dim $y_dim --z-dim $z_dim \
-    --output-path $output_dir
-
-
-
-
diff --git a/egs/voxceleb/v1/steps_be/train_be_v2.sh b/egs/voxceleb/v1/steps_be/train_be_v2.sh
deleted file mode 100755
index 487c9b1b..00000000
--- a/egs/voxceleb/v1/steps_be/train_be_v2.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/bash
-# Copyright 2018 Johns Hopkins University (Jesus Villalba)
-# Apache 2.0.
-#
-cmd=run.pl
-
-if [ -f path.sh ]; then . ./path.sh; fi
-. 
parse_options.sh || exit 1; -set -e - -if [ $# -ne 3 ]; then - echo "Usage: $0 " - exit 1; -fi - -vector_file=$1 -data_dir=$2 -output_dir=$3 - -mkdir -p $output_dir/log - -for f in utt2spk; do - if [ ! -f $data_dir/$f ]; then - echo "$0: no such file $data_dir/$f" - exit 1; - fi -done - -train_list=$output_dir/train_utt2spk - -#filter out the utterances that didn't got an x-vector (empty utts) -awk -v fv=$vector_file 'BEGIN{ -while(getline < fv) -{ - files[$1]=1 -} -} -{ if ($1 in files) {print $1,$2}}' $data_dir/utt2spk > $train_list - - -$cmd $output_dir/log/train_be.log \ - hyp_utils/conda_env.sh \ - steps_be/train-be-v2.py \ - --iv-file scp:$vector_file \ - --train-list $train_list \ - --output-path $output_dir - - - - diff --git a/egs/voxceleb/v1/steps_be/train_qmf.py b/egs/voxceleb/v1/steps_be/train_qmf.py new file mode 100755 index 00000000..ee9733d8 --- /dev/null +++ b/egs/voxceleb/v1/steps_be/train_qmf.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + + Trains calibration for SRE18 tel condition +""" + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np + +from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.utils.trial_scores import TrialScores +from hyperion.utils.trial_key import TrialKey +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR + + +def print_q_stats(q, name): + scores = q.scores[q.score_mask] + s = f"{name} stats mean={np.mean(scores)} min={np.min(scores)} max={np.max(scores)} median={np.median(scores)}" + logging.info(s) + + +def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): + + logging.info("load key: %s", key_file) + key = TrialKey.load_txt(key_file) + score_snorm_file = f"{score_file}_snorm" + logging.info("load scores: %s", score_snorm_file) + scr = TrialScores.load_txt(score_snorm_file) + tar, non = scr.get_tar_non(key) + ntar = len(tar) + nnon = len(non) + + q_file = f"{score_file}_maxnf" + logging.info("load max num-frames: %s", q_file) + q = TrialScores.load_txt(q_file) + print_q_stats(q, "max-nf") + maxnf_tar, maxnf_non = q.get_tar_non(key) + + q_file = f"{score_file}_minnf" + logging.info("load min num-frames: %s", q_file) + q = TrialScores.load_txt(q_file) + print_q_stats(q, "min-nf") + minnf_tar, minnf_non = q.get_tar_non(key) + + q_file = f"{score_file}_maxcohmu" + logging.info("load max cohort mean: %s", q_file) + q = TrialScores.load_txt(q_file) + print_q_stats(q, "max-cohmu") + maxcohmu_tar, maxcohmu_non = q.get_tar_non(key) + + q_file = f"{score_file}_mincohmu" + logging.info("load min cohort mean: %s", q_file) + q = TrialScores.load_txt(q_file) + print_q_stats(q, "min-cohmu") + mincohmu_tar, mincohmu_non = q.get_tar_non(key) + + min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f", + min_dcf, + p_miss * 100, + p_fa * 100, + n_miss, + n_fa, + ) + + logging.info("train calibration") + tar = np.vstack((tar, maxnf_tar, minnf_tar, maxcohmu_tar, mincohmu_tar)).T + non = np.vstack((non, maxnf_non, minnf_non, maxcohmu_non, mincohmu_non)).T + + x = np.vstack((tar, non)) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), 
dtype="int32")) + ) + lr = LR( + prior=prior, + lambda_reg=lambda_reg, + bias_scaling=1, + solver="liblinear", + verbose=verbose, + ) + lr.fit(x, y) + logging.info(f"A={lr.A} b={lr.b}") + logging.info("save calibration at %s", model_file) + lr.save(model_file) + + logging.info("calibrate scores") + tar_cal = lr.predict(tar) + non_cal = lr.predict(non) + act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f", + act_dcf, + p_miss * 100, + p_fa * 100, + n_miss, + n_fa, + ) + + output_file = f"{score_file}_qmf" + scr_out = TrialScores(key.model_set, key.seg_set) + scr_out.scores[key.tar] = tar_cal + scr_out.scores[key.non] = non_cal + scr_out.score_mask = np.logical_or(key.tar, key.non) + scr_out.save(output_file) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Trains QMF calibration") + + parser.add_argument("--score-file", required=True) + parser.add_argument("--key-file", required=True) + parser.add_argument("--model-file", required=True) + parser.add_argument("--prior", type=float, default=0.01) + parser.add_argument("--lambda-reg", type=float, default=1e-5) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + logging.debug(args) + + train_calibration(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v2.1/README.md b/egs/voxceleb/v2.1/README.md new file mode 100644 index 00000000..cb5b5368 --- /dev/null +++ b/egs/voxceleb/v2.1/README.md @@ -0,0 +1,182 @@ +# VoxCeleb V2.1 + +Recipe for the VoxCeleb Speaker Verification Task using Wav2Vec2, WavLM or Hubert models from HuggingFace as feature extractors + +## Differences w.r.t VoxCeleb V2 recipe + + - Kaldi format is replaced by new format based on pandas tables + - Kaldi style bash scripts are removed and replaced by python scripts + - Most python scripts are called using Hyperion entry points + +## Citing + +## Training Data + + - x-Vector network is trained on Voxceleb2 dev + test with augmentations + - MUSAN noise + - RIR reverberation + +## Test data + + - Test data is VoxCeleb 1 + - We evaluate the 3 conditions (with cleaned lists): + - VoxCeleb-O (Original): Original Voxceleb test set with 40 speakers + - VoxCeleb-E (Entire): List using all utterances of VoxCeleb1 + - VoxCeleb-H (Hard): List of hard trials between all utterances of VoxCeleb1, same gender and nationality trials. + +## Usage + + - Run the run_0*.sh scripts in sequence + - By default it will use config global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh + - To use other configs: +```bash +run_005_train_xvector.sh --config-file global_conf/other_config.sh +run_006_extract_xvectors.sh --config-file global_conf/other_config.sh --use-gpu true +run_007_eval_be.sh --config-file global_conf/other_config.sh +``` + + +## Recipe Steps: + + - `run_001_prepare_data.sh` + - Data preparation script to generate Kaldi style data directories for + - VoxCeleb2 train+test + - VoxCeleb1 O/E/H eval sets + + - `run_002_compute_evad.sh` + - Computes Energy VAD for all datasets + + - `run_003_prepare_noises_rirs.sh` + - Prepares MUSAN noises, music to be used by SpeechAugment class. + - Creates Babble noise from MUSAN speech to be used by SpeechAugment class. + - Prepares RIRs by compacting then into HDF5 files, to be used by SpeechAugment class. 
+ + - `run_004_prepare_xvec_train_data.sh` + - Transforms all the audios that we are going to use to train the x-vector into a common format, e.g., .flac. + - Removes silence from the audios + - Removes utterances shorter than 4secs and speakers with less than 8 utterances. + - Creates training and validation lists for x-vector training + + - `run_005_train_xvector.sh` + - Trains the x-vector model on frozen wav2vec features + - Finetunes wav2vec+x-vector model + - Large margin finetuning of wav2vec+x-vector model + + - `run_006_extract_xvectors.sh` + - Extracts x-vectors for VoxCeleb2 or VoxCeleb2+augmentation for PLDA training + - Exctracts x-vectors for VoxCeleb1 test sets + + - `run_007_eval_be.sh` + - Trains PLDA and evals PLDA and cosine scoring back-ends + + +## Results + + + + + +### VoxCeleb 1 Original-Clean trial list + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.060 | 0.116 | +| | | | Cosine + AS-Norm | 0.81 | 0.058 | 0.108 | +| | | | Cosine + QMF | 0.75 | 0.054 | 0.086 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.89 | 0.069 | 0.108 | +| | | | Cosine + AS-Norm | 0.86 | 0.067 | 0.108 | +| | | | Cosine + QMF | 0.77 | 0.066 | 0.105 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.74 | 0.057 | 0.085 | +| | | | Cosine + AS-Norm | 0.73 | 0.055 | 0.093 | +| | | | Cosine + QMF | 0.66 | 0.051 | 0.094 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.74 | 0.053 | 0.080 | +| | | | Cosine + AS-Norm | 0.71 | 0.050 | 0.087 | +| | | | Cosine + QMF | 0.64 | 0.045 | 0.087 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.063 | 0.111 | +| | | | Cosine + AS-Norm | 0.68 | 0.053 | 0.090 | +| | | | Cosine + QMF | 0.63 | 0.048 | 0.071 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.14 | 0.074 | 0.107 | +| | | | Cosine + AS-Norm | 0.94 | 0.060 | 0.089 | +| | | | Cosine + QMF | 0.89 | 0.054 | 0.076 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.1.sh | Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.69 | 0.048 | 0.094 | +| | | | Cosine + AS-Norm | 0.63 | 0.046 | 0.082 | +| | | | Cosine + QMF | 0.57 | 0.041 | 0.076 | + +### VoxCeleb 1 Entire-Clean trial list + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.81 | 0.051 | 0.087 | +| | | | Cosine + AS-Norm | 0.78 | 0.047 | 0.083 | +| | | | Cosine + QMF | 0.75 | 0.046 | 0.076 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.89 | 0.056 | 0.099 | +| | | | Cosine + AS-Norm | 0.86 | 0.053 | 0.090 | +| | | | Cosine + QMF | 0.82 
| 0.050 | 0.085 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.80 | 0.049 | 0.088 | +| | | | Cosine + AS-Norm | 0.76 | 0.045 | 0.080 | +| | | | Cosine + QMF | 0.73 | 0.043 | 0.078 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.91 | 0.056 | 0.094 | +| | | | Cosine + AS-Norm | 0.87 | 0.053 | 0.090 | +| | | | Cosine + QMF | 0.83 | 0.050 | 0.086 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.80 | 0.050 | 0.086 | +| | | | Cosine + AS-Norm | 0.73 | 0.045 | 0.074 | +| | | | Cosine + QMF | 0.69 | 0.042 | 0.069 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.99 | 0.058 | 0.103 | +| | | | Cosine + AS-Norm | 0.87 | 0.052 | 0.090 | +| | | | Cosine + QMF | 0.83 | 0.050 | 0.085 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.1.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.72 | 0.044 | 0.079 | +| | | | Cosine + AS-Norm | 0.68 | 0.040 | 0.068 | +| | | | Cosine + QMF | 0.64 | 0.037 | 0.065 | + +### VoxCeleb 1 Hard-Clean trial list + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.73 | 0.113 | 0.182 | +| | | | Cosine + AS-Norm | 1.63 | 0.100 | 0.160 | +| | | | Cosine + QMF | 1.56 | 0.096 | 0.155 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.88 | 0.122 | 0.200 | +| | | | Cosine + AS-Norm | 1.77 | 0.110 | 0.175 | +| | | | Cosine + QMF | 1.66 | 0.104 | 0.168 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.67 | 0.103 | 0.165 | +| | | | Cosine + AS-Norm | 1.54 | 0.093 | 0.152 | +| | | | Cosine + QMF | 1.45 | 0.089 | 0.145 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.78 | 0.106 | 0.174 | +| | | | Cosine + AS-Norm | 1.70 | 0.099 | 0.162 | +| | | | Cosine + QMF | 1.61 | 0.094 | 0.153 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.49 | 0.087 | 0.137 | +| | | | Cosine + AS-Norm | 1.29 | 0.074 | 0.117 | +| | | | Cosine + QMF | 1.22 | 0.069 | 0.111 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.84 | 0.107 | 0.172 | +| | | | Cosine + AS-Norm | 1.47 | 0.083 | 0.128 | +| | | | Cosine + QMF | 1.39 | 0.079 | 0.123 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.24 | 0.076 | 0.121 | +| | | | Cosine + AS-Norm | 1.15 | 0.068 | 0.109 | +| | | | Cosine + QMF | 1.09 | 0.065 | 0.107 | + +### VoxSRC2022 dev + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | 
:------------: | +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.60 | 0.163 | 0.257 | +| | | | Cosine + AS-Norm | 2.43 | 0.150 | 0.244 | +| | | | Cosine + QMF | 2.31 | 0.143 | 0.232 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.82 | 0.183 | 0.286 | +| | | | Cosine + AS-Norm | 2.69 | 0.168 | 0.265 | +| | | | Cosine + QMF | 2.52 | 0.158 | 0.252 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.65 | 0.176 | 0.289 | +| | | | Cosine + AS-Norm | 2.55 | 0.171 | 0.292 | +| | | | Cosine + QMF | 2.38 | 0.159 | 0.266 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.62 | 0.153 | 0.251 | +| | | | Cosine + AS-Norm | 2.53 | 0.149 | 0.247 | +| | | | Cosine + QMF | 2.42 | 0.144 | 0.231 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.225 | +| | | | Cosine + AS-Norm | 2.01 | 0.125 | 0.209 | +| | | | Cosine + QMF | 1.92 | 0.117 | 0.200 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.83 | 0.175 | 0.276 | +| | | | Cosine + AS-Norm | 2.31 | 0.149 | 0.244 | +| | | | Cosine + QMF | 2.22 | 0.137 | 0.229 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.1.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.06 | 0.124 | 0.206 | +| | | | Cosine + AS-Norm | 1.97 | 0.125 | 0.212 | +| | | | Cosine + QMF | 1.87 | 0.120 | 0.204 | + diff --git a/egs/voxceleb/v2.1/cmd.sh b/egs/voxceleb/v2.1/cmd.sh new file mode 100755 index 00000000..c95884ec --- /dev/null +++ b/egs/voxceleb/v2.1/cmd.sh @@ -0,0 +1,29 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
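+#
+# As a minimal local-machine sketch (an assumption, not a setup used at any
+# of the sites below), pointing all three commands at run.pl bypasses the
+# grid entirely, e.g.:
+#
+#   export train_cmd="run.pl"
+#   export cuda_cmd="run.pl"
+#   export cuda_eval_cmd="run.pl"
+#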
+ +if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_a100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 30G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/voxceleb/v2.1/conf/clsp.conf b/egs/voxceleb/v2.1/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf b/egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_long.conf b/egs/voxceleb/v2.1/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf b/egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_short.conf b/egs/voxceleb/v2.1/conf/coe_gpu_short.conf new file 
mode 100644 index 00000000..81de5cb7 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_v100.conf b/egs/voxceleb/v2.1/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml b/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..86f55073 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml @@ -0,0 +1,34 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: csv:data/rirs_smallroom/rirs.csv + rir_norm: max + mediumroom: + weight: 1 + rir_path: csv:data/rirs_mediumroom/rirs.csv + rir_norm: max + realroom: + weight: 1 + rir_path: csv:data/rirs_real/rirs.csv + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/recordings.csv + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/recordings.csv + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/recordings.csv + min_snr: 3 + max_snr: 18 diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..ffd2f374 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,60 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + target_key: speaker + train_mode: 
hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml new file mode 100644 index 00000000..7dcc56ef --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml @@ -0,0 +1,60 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + target_key: speaker + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..3f5c46bc --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,60 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + target_key: speaker + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..9e1d0928 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,60 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 
seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + target_key: speaker + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..0d0dc398 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,60 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: wavlmbaseplus_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + target_key: speaker + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..07bf8e5a --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + 
optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..c58797cf --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..46ee7d18 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,60 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: wavlmlarge12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + target_key: speaker + train_mode: 
hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..5703104e --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..c58797cf --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml new file mode 100644 index 00000000..07bf8e5a 
--- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml new file mode 100644 index 00000000..e9638704 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..40341a27 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,60 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + 
wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: wavlmlarge_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + target_key: speaker + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..d1af05d8 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,64 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.1.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.1.yaml new file mode 100644 index 00000000..99002b45 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.1.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: + hf_feats: + encoder_lr: 1e-2 + feat_extract_lr: 1e-2 + xvector: + override_output: true + cos_scale: 32.0 + margin: 
0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + #lr: 5e-2 + lr: 1e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 20000 + #min_lr: 5e-4 + min_lr: 1e-6 + warmup_steps: 10000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 14 + eff_batch_size: 512 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..4a8c53d7 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.1.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.1.yaml new file mode 100644 index 00000000..9c7652ce --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.1.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + 
override_output: true + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 1e-4 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 15000 + hold_steps: 10000 + min_lr: 1e-6 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 14 + eff_batch_size: 256 + target_key: speaker + #train_mode: full + train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..d1ed9300 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: + hf_feats: + override_lora: true + use_lora: true + lora_rank: 4 + lora_components: + - q_proj + - v_proj + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + target_key: speaker + train_mode: hf-lora diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..fbea3f0f --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.4 + 
margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + target_key: speaker + train_mode: hf-lora diff --git a/egs/voxceleb/v2.1/conf/vad_16k.yaml b/egs/voxceleb/v2.1/conf/vad_16k.yaml new file mode 100644 index 00000000..5fb0111c --- /dev/null +++ b/egs/voxceleb/v2.1/conf/vad_16k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 16000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..5d27b093 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,61 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + drop_layers_gt: 12 +feat_fuser: + feat_fuser: + fuser_type: weighted-avg + mvn: + norm_mean: false + spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. + freq_mask_min_width: 0 + freq_mask_max_width: 10 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml new file mode 100644 index 00000000..fe89d2fc --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml @@ -0,0 +1,60 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +feat_fuser: + feat_fuser: + fuser_type: weighted-avg + mvn: + norm_mean: false + spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. 
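+ # NOTE (added comment, our reading of this block, not in the original diff): with both
+ # mask probabilities at 1. a mask is always applied; each chunk gets one time mask of
+ # up to 5 frames and one frequency mask of up to 10 bins, filled with the feature mean.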
+ freq_mask_min_width: 0 + freq_mask_max_width: 10 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +xvector: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..63afdb58 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,60 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +feat_fuser: + feat_fuser: + fuser_type: weighted-avg + mvn: + norm_mean: false + spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. + freq_mask_min_width: 0 + freq_mask_max_width: 10 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..4de306e4 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,61 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 9 +feat_fuser: + feat_fuser: + fuser_type: weighted-avg + mvn: + norm_mean: false + spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. 
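+ # NOTE (added comment, assumption): this SpecAugment block is shared verbatim across the
+ # v2.0 front-end yamls; it masks the fused transformer features, not the raw waveform.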
+ freq_mask_min_width: 0 + freq_mask_max_width: 10 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..2c2c6db3 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,60 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus +feat_fuser: + feat_fuser: + fuser_type: weighted-avg + mvn: + norm_mean: false + spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. + freq_mask_min_width: 0 + freq_mask_max_width: 10 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..52246639 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,61 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large + drop_layers_gt: 12 +feat_fuser: + feat_fuser: + fuser_type: weighted-avg + mvn: + norm_mean: false + spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. 
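+ # NOTE (added comment, assumption): combined with drop_layers_gt: 12 above, the masking
+ # and weighted-avg fusion operate only on the retained lower WavLM-large layers.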
+ freq_mask_min_width: 0 + freq_mask_max_width: 10 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..a05e82e1 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,60 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large +feat_fuser: + feat_fuser: + fuser_type: weighted-avg + mvn: + norm_mean: false + spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. + freq_mask_min_width: 0 + freq_mask_max_width: 10 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/datapath.sh b/egs/voxceleb/v2.1/datapath.sh new file mode 100644 index 00000000..a7eb575c --- /dev/null +++ b/egs/voxceleb/v2.1/datapath.sh @@ -0,0 +1,23 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + # voxceleb1_root=/export/corpora5/VoxCeleb1_v1 #voxceleb1 v1 + voxceleb1_root=/export/corpora5/VoxCeleb1_v2 #voxceleb1 v2 + voxceleb2_root=/export/corpora5/VoxCeleb2 + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 + voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 + voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + voxsrc22_root=/exp/jvillalba/corpora/voxsrc22 + musan_root=/expscratch/dgromero/corpora-open/musan +else + echo "Put your database paths here" + exit 1 +fi + + diff --git a/egs/voxceleb/v2.1/default_config.sh b/egs/voxceleb/v2.1/default_config.sh new file mode 120000 index 00000000..f2d8812d --- /dev/null +++ b/egs/voxceleb/v2.1/default_config.sh @@ -0,0 +1 @@ +global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh \ No newline at end of file diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh 
b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..67a4665e --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params layers 2-12 + +# hugging face model +hf_model_name=wav2vec2xlsr300m12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh new file mode 100644 index 00000000..b4130fad --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn1024x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..80ee785b --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training 
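+# NOTE (added comment): nnet_data names the dataset dir under data/ built by
+# run_001_prepare_data.sh from VoxCeleb2 dev (--cat-videos --use-kaldi-ids).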
+nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..1985b8e6 --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,55 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_name=${hf_model_name}_loraqv_ecapatdnn512x3_v2.0 +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +#do_snorm=true +#do_qmf=true +#do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..c2b30f68 --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus9l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + 
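+# NOTE (added comment): the checkpoints picked below follow the epochs set in the stage
+# yamls (35/8/4 epochs -> model_ep0035/0008/0004.pth); the exp/xvector_nnets paths are a
+# convention of this recipe.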
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0035.pth
+
+nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml
+nnet_s2_args=""
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/model_ep0008.pth
+
+nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml
+nnet_s3_args=""
+nnet_s3_name=${nnet_name}.s3
+nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name
+nnet_s3=$nnet_s3_dir/model_ep0004.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+ plda_data=voxceleb2cat_train
+else
+ plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh
new file mode 100644
index 00000000..373535c2
--- /dev/null
+++ b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh
@@ -0,0 +1,54 @@
+# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3
+
+# hugging face model
+hf_model_name=wavlmbaseplus
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+
+nnet_type=hf_wavlm2resnet1d
+
+nnet_s1_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml
+nnet_s1_args=""
+
+nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0
+nnet_s1_name=$nnet_name.s1
+
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0035.pth
+
+nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml
+nnet_s2_args=""
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/model_ep0008.pth
+
+nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml
+nnet_s3_args=""
+nnet_s3_name=${nnet_name}.s3
+nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name
+nnet_s3=$nnet_s3_dir/model_ep0004.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+ plda_data=voxceleb2cat_train
+else
+ plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh
new file mode 100644
index 00000000..530096cc
--- /dev/null
+++ b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh
@@ -0,0 +1,54 @@
+# WavLM large (first 12 layers) trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3
+
+# hugging face model
+hf_model_name=wavlmlarge12l
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+
+nnet_type=hf_wavlm2resnet1d
+
+nnet_s1_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml
+nnet_s1_args=""
+
+nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0
+nnet_s1_name=$nnet_name.s1
+
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0035.pth
+
+nnet_s2_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml
+nnet_s2_args=""
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/model_ep0008.pth
+
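+# NOTE (added comment): stage 3 is the large-margin fine-tuning pass; its yaml raises the
+# margin to 0.4, trains on 6 s chunks, and enables the class-weighted sampler with hard
+# prototypes.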
+nnet_s3_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml
+nnet_s3_args=""
+nnet_s3_name=${nnet_name}.s3
+nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name
+nnet_s3=$nnet_s3_dir/model_ep0004.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+ plda_data=voxceleb2cat_train
+else
+ plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh
new file mode 100644
index 00000000..1b276bcd
--- /dev/null
+++ b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh
@@ -0,0 +1,54 @@
+# WavLM large trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3
+
+# hugging face model
+hf_model_name=wavlmlarge
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+
+nnet_type=hf_wavlm2resnet1d
+
+nnet_s1_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml
+nnet_s1_args=""
+
+nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0
+nnet_s1_name=$nnet_name.s1
+
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0035.pth
+
+nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml
+nnet_s2_args=""
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/model_ep0008.pth
+
+nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml
+nnet_s3_args=""
+nnet_s3_name=${nnet_name}.s3
+nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name
+nnet_s3=$nnet_s3_dir/model_ep0004.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+ plda_data=voxceleb2cat_train
+else
+ plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v2.1/hyp_utils b/egs/voxceleb/v2.1/hyp_utils
new file mode 120000
index 00000000..f6d1eb7a
--- /dev/null
+++ b/egs/voxceleb/v2.1/hyp_utils
@@ -0,0 +1 @@
+../../../hyp_utils
\ No newline at end of file
diff --git a/egs/voxceleb/v2.1/path.sh b/egs/voxceleb/v2.1/path.sh
new file mode 100755
index 00000000..6994fdab
--- /dev/null
+++ b/egs/voxceleb/v2.1/path.sh
@@ -0,0 +1,5 @@
+
+export HYP_ROOT=$(readlink -f `pwd -P`/../../..)
+export TOOLS_ROOT=$HYP_ROOT/tools
+
+. $TOOLS_ROOT/path.sh
diff --git a/egs/voxceleb/v2.1/run_001_prepare_data.sh b/egs/voxceleb/v2.1/run_001_prepare_data.sh
new file mode 100755
index 00000000..563d3c2d
--- /dev/null
+++ b/egs/voxceleb/v2.1/run_001_prepare_data.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# Copyright
+# 2018 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+config_file=default_config.sh
+
+. parse_options.sh || exit 1;
+. datapath.sh
+. $config_file
+
+if [ $stage -le 1 ];then
+ # Prepare the VoxCeleb2 dataset for training.
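+ # NOTE (added comment): with the CLSP paths in datapath.sh the command below resolves
+ # to, e.g.:
+ #   hyperion-prepare-data voxceleb2 --subset dev --corpus-dir /export/corpora5/VoxCeleb2 \
+ #     --cat-videos --use-kaldi-ids --output-dir data/voxceleb2cat_train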
+ hyperion-prepare-data voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ + --cat-videos --use-kaldi-ids \ + --output-dir data/voxceleb2cat_train +fi + +if [ $stage -le 2 ];then + # prepare voxceleb1 for test + hyperion-prepare-data voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test +fi + +if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then + hyperion-prepare-data voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_dev +fi + +# if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + # hyperion-prepare-data voxsrc22 --subset test --corpus-dir $voxsrc22_root \ + # --vox1-corpus-dir $voxceleb1_root \ + # --output-dir data/voxsrc22_test +# fi + +if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then + # split vox2 into 2 parts, for cohort and qmf training + hyperion-split-dataset-into-trials-and-cohort --data-dir data/voxceleb2cat_train +fi diff --git a/egs/voxceleb/v2.1/run_002_compute_evad.sh b/egs/voxceleb/v2.1/run_002_compute_evad.sh new file mode 100755 index 00000000..acccace3 --- /dev/null +++ b/egs/voxceleb/v2.1/run_002_compute_evad.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=fs01 +vad_dir=`pwd`/exp/vad_e +vad_config=conf/vad_16k.yaml +nj=40 + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file + +if [ -z "$vad_config" ];then + echo "We are not using VAD in this configuration" + exit 0 +fi + +if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" +fi + + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + hyp_utils/create_data_split_dirs.sh \ + $vad_dir/$name \ + $USER/hyp-data/voxceleb/v1.2/vad $nodes + done +fi + +#Train datasets +if [ $stage -le 2 ];then + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_data_split_links.sh $vad_dir/$name/vad.JOB.ark $nj + echo "compute vad for $name" + $train_cmd JOB=1:$nj $vad_dir/$name/log/vad.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-compute-energy-vad --cfg $vad_config \ + --recordings-file data/$name/recordings.csv \ + --output-spec ark,csv:$vad_dir/$name/vad.JOB.ark,$vad_dir/$name/vad.JOB.csv \ + --part-idx JOB --num-parts $nj || exit 1 + + hyperion-tables cat \ + --table-type features \ + --output-file $vad_dir/$name/vad.csv --num-tables $nj + hyperion-dataset add_features \ + --dataset data/$name \ + --features-name vad \ + --features-file $vad_dir/$name/vad.csv + done +fi + + diff --git a/egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..73c7ed82 --- /dev/null +++ b/egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nj=10 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh
+
+# We prepare the noise files and RIR for online speech augmentation
+if [ $stage -le 1 ]; then
+  for name in noise music speech
+  do
+    hyperion-prepare-data musan \
+      --corpus-dir $musan_root \
+      --subset $name \
+      --output-dir data/musan_$name
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  # # Prepare to distribute data over multiple machines
+  # # This only does something at CLSP grid
+  # hyp_utils/create_data_split_dirs.sh $vad_dir $USER/hyp-data/voxceleb/v1.2/vad $nodes
+
+  for name in musan_noise musan_music
+  do
+    input_data_dir=data/$name
+    output_data_dir=data/${name}_proc_audio
+    output_dir=exp/proc_audio/$name
+    $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \
+      hyp_utils/conda_env.sh \
+      hyperion-preprocess-audio-files \
+      --audio-format flac \
+      --part-idx JOB --num-parts $nj \
+      --recordings-file $input_data_dir/recordings.csv \
+      --output-path $output_dir \
+      --output-recordings-file $output_dir/recordings.JOB.csv
+
+    hyperion-tables cat \
+      --table-type recordings \
+      --output-file $output_dir/recordings.csv --num-tables $nj
+    hyperion-dataset set_recordings \
+      --dataset $input_data_dir \
+      --recordings-file $output_dir/recordings.csv \
+      --output-dataset $output_data_dir
+
+  done
+fi
+
+if [ $stage -le 3 ]; then
+  # Create Babble noise from MUSAN speech files
+  for name in musan_speech
+  do
+    input_data_dir=data/$name
+    output_data_dir=data/${name}_babble
+    output_dir=exp/proc_audio/${name}_babble
+    $train_cmd $output_dir/log/make_babble_noise_${name}.log \
+      hyp_utils/conda_env.sh \
+      hyperion-make-babble-noise-audio-files \
+      --audio-format flac \
+      --min-spks 3 --max-spks 10 --num-reuses 5 \
+      --recordings-file $input_data_dir/recordings.csv \
+      --output-path $output_dir \
+      --output-recordings-file $output_data_dir/recordings.csv
+    hyperion-dataset make_from_recordings \
+      --dataset $output_data_dir \
+      --recordings-file $output_data_dir/recordings.csv
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  if [ ! -d "RIRS_NOISES" ]; then
+    # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
+    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
+    unzip rirs_noises.zip
+  fi
+  hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom
+  hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom
+  hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real
+  for rirs in rirs_smallroom rirs_mediumroom rirs_real
+  do
+    output_dir=exp/rirs/$rirs
+    data_dir=data/$rirs
+    # Note: log file named after $rirs; the original used stale ${name} from the
+    # previous loop, which made all three runs write the same log file.
+    $train_cmd $output_dir/log/pack_rirs_${rirs}.log \
+      hyp_utils/conda_env.sh \
+      hyperion-pack-wav-rirs ${args} --input $data_dir/recordings.csv \
+      --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1;
+    hyperion-dataset add_features --dataset $data_dir \
+      --features-name rirs --features-file $output_dir/rirs.csv
+
+  done
+fi
+
diff --git a/egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh b/egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh
new file mode 100755
index 00000000..4e0c5b19
--- /dev/null
+++ b/egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# Copyright
+#            2020   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+nodes=b1
+nj=40
+stage=1
+config_file=default_config.sh
+
+. parse_options.sh || exit 1;
+. 
$config_file + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + hyp_utils/create_data_split_dirs.sh \ + exp/xvector_audios/$nnet_data \ + $USER/hyp-data/voxceleb/v1.2/xvector_audios/$nnet_data $nodes +fi + +if [ $stage -le 2 ];then + output_dir=exp/proc_audio/$nnet_data + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_audios_split_links.sh $output_dir data/$nnet_data/recordings.csv flac + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$nnet_data/vad.csv" + update_durs="--update-seg-durs" + fi + + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${nnet_data}.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-preprocess-audio-files \ + --audio-format flac --remove-dc-offset $vad_args \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$nnet_data/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + + hyperion-dataset set_recordings $update_durs \ + --dataset data/$nnet_data \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset data/${nnet_data}_proc_audio \ + --remove-features vad +fi + +if [ $stage -le 3 ];then + hyperion-dataset remove_short_segments \ + --dataset data/${nnet_data}_proc_audio \ + --output-dataset data/${nnet_data}_filtered \ + --length-name duration --min-length 2.0 + + hyperion-dataset remove_classes_few_segments \ + --dataset data/${nnet_data}_filtered \ + --class-name speaker --min-segs 4 +fi + +if [ $stage -le 4 ];then + hyperion-dataset split_train_val \ + --dataset data/${nnet_data}_filtered \ + --val-prob 0.03 \ + --joint-classes speaker --min-train-samples 1 \ + --seed 1123581321 \ + --train-dataset data/${nnet_data}_xvector_train \ + --val-dataset data/${nnet_data}_xvector_val +fi + diff --git a/egs/voxceleb/v2.1/run_005_train_xvector.sh b/egs/voxceleb/v2.1/run_005_train_xvector.sh new file mode 100755 index 00000000..eb1c591e --- /dev/null +++ b/egs/voxceleb/v2.1/run_005_train_xvector.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_data_dir=data/${nnet_data}_xvector_train +val_data_dir=data/${nnet_data}_xvector_val + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-train-wav2vec2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu \ + +fi + + +# Finetune full model +if [ $stage -le 2 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir \ + --num-gpus $ngpu \ + +fi + +# Finetune full model +if [ $stage -le 3 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --in-model-file $nnet_s2 \ + --trainer.exp-path $nnet_s3_dir \ + --num-gpus $ngpu \ + +fi diff --git a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh new file mode 100755 index 00000000..72b019cd --- /dev/null +++ b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh @@ -0,0 +1,104 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=3 +config_file=default_config.sh +use_gpu=false +hf_chunk_length=120.0 #seconds +xvec_chunk_length=120.0 +. parse_options.sh || exit 1; +. 
$config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" + xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" + num_gpus=1 +else + xvec_cmd="$train_cmd --mem 12G" + num_gpus=0 +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +xvector_dir=exp/xvectors/$nnet_name + +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then + # Extract xvectors for training LDA/PLDA + nj=100 + for name in voxceleb2cat_train + do + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2vec2xvectors ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --random-utt-length --min-utt-length 2 --max-utt-length 30 \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + +if [ $stage -le 2 ]; then + # Extracts x-vectors for evaluation + nj=100 + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data + do + num_segs=$(wc -l data/$name/segments.csv | awk '{ print $1-1}') + nj=$(($num_segs < 100 ? $num_segs:100)) + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2vec2xvectors ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + + diff --git a/egs/voxceleb/v2.1/run_007_eval_be.sh b/egs/voxceleb/v2.1/run_007_eval_be.sh new file mode 100755 index 00000000..a686b237 --- /dev/null +++ b/egs/voxceleb/v2.1/run_007_eval_be.sh @@ -0,0 +1,321 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=3 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh
+
+if [ $nnet_stage -eq 1 ];then
+  nnet=$nnet_s1
+  nnet_name=$nnet_s1_name
+elif [ $nnet_stage -eq 2 ];then
+  nnet=$nnet_s2
+  nnet_name=$nnet_s2_name
+elif [ $nnet_stage -eq 3 ];then
+  nnet=$nnet_s3
+  nnet_name=$nnet_s3_name
+elif [ $nnet_stage -eq 4 ];then
+  nnet=$nnet_s4
+  nnet_name=$nnet_s4_name
+elif [ $nnet_stage -eq 5 ];then
+  nnet=$nnet_s5
+  nnet_name=$nnet_s5_name
+elif [ $nnet_stage -eq 6 ];then
+  nnet=$nnet_s6
+  nnet_name=$nnet_s6_name
+fi
+
+plda_label=${plda_type}y${plda_y_dim}_v1
+be_name=lda${lda_dim}_${plda_label}_${plda_data}
+
+xvector_dir=exp/xvectors/$nnet_name
+be_dir=exp/be/$nnet_name/$be_name
+score_dir=exp/scores/$nnet_name
+score_plda_dir=$score_dir/${be_name}/plda
+score_cosine_dir=$score_dir/cosine
+score_cosine_snorm_dir=$score_dir/cosine_snorm
+score_cosine_qmf_dir=$score_dir/cosine_qmf
+
+if [ $stage -le 3 ];then
+
+  echo "Eval Voxceleb 1 with Cosine scoring"
+  num_parts=8
+  for((i=1;i<=$num_parts;i++));
+  do
+    for((j=1;j<=$num_parts;j++));
+    do
+      $train_cmd $score_cosine_dir/log/voxceleb1_${i}_${j}.log \
+        hyp_utils/conda_env.sh \
+        hyperion-eval-cosine-scoring-backend \
+        --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \
+        --ndx-file data/voxceleb1_test/trials.csv \
+        --enroll-map-file data/voxceleb1_test/enrollment.csv \
+        --score-file $score_cosine_dir/voxceleb1_scores.csv \
+        --enroll-part-idx $i --num-enroll-parts $num_parts \
+        --test-part-idx $j --num-test-parts $num_parts &
+    done
+  done
+  wait
+  hyperion-merge-scores --output-file $score_cosine_dir/voxceleb1_scores.csv \
+    --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+  $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \
+    hyperion-eval-verification-metrics \
+    --score-files $score_cosine_dir/voxceleb1_scores.csv \
+    --key-files data/voxceleb1_test/trials_{o,e,h}.csv \
+    --score-names voxceleb1 \
+    --key-names O E H \
+    --sparse \
+    --output-file $score_cosine_dir/voxceleb1_results.csv
+
+  cat $score_cosine_dir/voxceleb1_results.csv
+fi
+
+if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then
+  echo "Eval VoxSRC22 dev with Cosine scoring"
+  $train_cmd $score_cosine_dir/log/voxsrc22_dev.log \
+    hyp_utils/conda_env.sh \
+    hyperion-eval-cosine-scoring-backend \
+    --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \
+    --ndx-file data/voxsrc22_dev/trials.csv \
+    --enroll-map-file data/voxsrc22_dev/enrollment.csv \
+    --score-file $score_cosine_dir/voxsrc22_dev_scores.csv
+
+  # $train_cmd $score_cosine_dir/log/voxsrc22_eval.log \
+  #   hyp_utils/conda_env.sh \
+  #   hyperion-eval-cosine-scoring-backend \
+  #   --feats-file csv:$xvector_dir/voxsrc22_eval/xvector.csv \
+  #   --ndx-file data/voxsrc22_eval/trials.csv \
+  #   --enroll-map-file data/voxsrc22_eval/enrollment.csv \
+  #   --score-file $score_cosine_dir/voxsrc22_eval_scores.csv
+
+  $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxsrc22_dev.log \
+    hyperion-eval-verification-metrics \
+    --score-files $score_cosine_dir/voxsrc22_dev_scores.csv \
+    --key-files data/voxsrc22_dev/trials.csv \
+    --score-names voxsrc22_dev \
+    --key-names all \
+    --output-file $score_cosine_dir/voxsrc22_dev_results.csv
+
+  cat $score_cosine_dir/voxsrc22_dev_results.csv
+
+fi
+
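+# The AS-Norm used below normalizes each raw cosine score s(e,t) with the mean
+# and standard deviation of the top-N (--cohort-nbest) cohort scores of the
+# enrollment and test sides; in its usual formulation:
+#   s'(e,t) = 0.5*( (s(e,t) - mu_e)/sigma_e + (s(e,t) - mu_t)/sigma_t )
+# The exact variant implemented by hyperion-eval-cosine-scoring-backend may
+# differ in detail.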
+if [ "$do_snorm" == "true" ];then
+  if [ $stage -le 5 ];then
+    echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm"
+    num_parts=16
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        $train_cmd --mem 22G $score_cosine_snorm_dir/log/voxceleb1_${i}_${j}.log \
+          hyp_utils/conda_env.sh \
+          hyperion-eval-cosine-scoring-backend \
+          --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \
+          --ndx-file data/voxceleb1_test/trials.csv \
+          --enroll-map-file data/voxceleb1_test/enrollment.csv \
+          --score-file $score_cosine_snorm_dir/voxceleb1_scores.csv \
+          --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \
+          --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --cohort-nbest 1000 --avg-cohort-by speaker \
+          --enroll-part-idx $i --num-enroll-parts $num_parts \
+          --test-part-idx $j --num-test-parts $num_parts &
+      done
+      sleep 5s
+    done
+    wait
+    hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxceleb1_scores.csv \
+      --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+    $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \
+      hyperion-eval-verification-metrics \
+      --score-files $score_cosine_snorm_dir/voxceleb1_scores.csv \
+      --key-files data/voxceleb1_test/trials_{o,e,h}.csv \
+      --score-names voxceleb1 \
+      --key-names O E H \
+      --sparse \
+      --output-file $score_cosine_snorm_dir/voxceleb1_results.csv
+
+    cat $score_cosine_snorm_dir/voxceleb1_results.csv
+  fi
+
+  if [ $stage -le 6 ] && [ "$do_voxsrc22" == "true" ];then
+    echo "Eval VoxSRC22 dev with Cosine scoring + AS-Norm"
+    num_parts=16
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        $train_cmd $score_cosine_snorm_dir/log/voxsrc22_dev_${i}_${j}.log \
+          hyp_utils/conda_env.sh \
+          hyperion-eval-cosine-scoring-backend \
+          --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \
+          --ndx-file data/voxsrc22_dev/trials.csv \
+          --enroll-map-file data/voxsrc22_dev/enrollment.csv \
+          --score-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \
+          --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \
+          --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --cohort-nbest 1000 --avg-cohort-by speaker \
+          --enroll-part-idx $i --num-enroll-parts $num_parts \
+          --test-part-idx $j --num-test-parts $num_parts &
+        sleep 5s
+      done
+      sleep 10s
+    done
+    wait
+    hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \
+      --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+    $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \
+      hyperion-eval-verification-metrics \
+      --score-files $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \
+      --key-files data/voxsrc22_dev/trials.csv \
+      --score-names voxsrc22_dev \
+      --key-names all \
+      --output-file $score_cosine_snorm_dir/voxsrc22_dev_results.csv
+
+    cat $score_cosine_snorm_dir/voxsrc22_dev_results.csv
+
+  fi
+
+fi
+
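+# QMF (Quality Measure Fusion) recalibrates the AS-normalized scores using
+# quality measures of the enrollment/test sides (typically utterance duration
+# and similar statistics; the exact set is defined by the toolkit). The QMF
+# model is trained on VoxCeleb2 trials built by
+# hyperion-split-dataset-into-trials-and-cohort in run_001_prepare_data.sh.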
+if [ "$do_qmf" == "true" ];then
+  if [ $stage -le 7 ];then
+    echo "Train QMF in Vox2"
+    echo "...Calculating quality measures for Vox2"
+    num_parts=8
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        $train_cmd $score_cosine_qmf_dir/log/voxceleb2_trials_${i}_${j}.log \
+          hyp_utils/conda_env.sh \
+          hyperion-eval-cosine-scoring-backend-with-qmf \
+          --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --ndx-file data/voxceleb2cat_train_trials/trials.csv \
+          --enroll-map-file data/voxceleb2cat_train_trials/enrollments.csv \
+          --score-file $score_cosine_qmf_dir/voxceleb2_scores.csv \
+          --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \
+          --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --cohort-nbest 1000 --avg-cohort-by speaker \
+          --enroll-part-idx $i --num-enroll-parts $num_parts \
+          --test-part-idx $j --num-test-parts $num_parts &
+      done
+      sleep 5s
+    done
+    wait
+    hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \
+      --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+    hyperion-train-qmf --score-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \
+      --key-file data/voxceleb2cat_train_trials/trials.csv \
+      --model-file $score_cosine_qmf_dir/qmf.h5
+
+  fi
+
+  if [ $stage -le 8 ];then
+    echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm + QMF"
+    num_parts=16
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        $train_cmd --mem 22G $score_cosine_qmf_dir/log/voxceleb1_${i}_${j}.log \
+          hyp_utils/conda_env.sh \
+          hyperion-eval-cosine-scoring-backend-with-qmf \
+          --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \
+          --ndx-file data/voxceleb1_test/trials.csv \
+          --enroll-map-file data/voxceleb1_test/enrollment.csv \
+          --score-file $score_cosine_qmf_dir/voxceleb1_scores.csv \
+          --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \
+          --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --cohort-nbest 1000 --avg-cohort-by speaker \
+          --qmf-file $score_cosine_qmf_dir/qmf.h5 \
+          --enroll-part-idx $i --num-enroll-parts $num_parts \
+          --test-part-idx $j --num-test-parts $num_parts &
+      done
+      sleep 5s
+    done
+    wait
+    for suffix in "" .snorm .snorm.qmf
+    do
+      (
+        hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \
+          --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+        $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1$suffix.log \
+          hyperion-eval-verification-metrics \
+          --score-files $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \
+          --key-files data/voxceleb1_test/trials_{o,e,h}.csv \
+          --score-names voxceleb1 \
+          --key-names O E H \
+          --sparse \
+          --output-file $score_cosine_qmf_dir/voxceleb1_results$suffix.csv
+
+        echo "$score_cosine_qmf_dir/voxceleb1_results$suffix.csv:"
+        cat $score_cosine_qmf_dir/voxceleb1_results$suffix.csv
+      ) &
+    done
+    wait
+  fi
+
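+  # The QMF back-end writes three score variants per trial list: raw cosine
+  # scores (no suffix), AS-normalized scores (.snorm) and AS-normalized +
+  # QMF-calibrated scores (.snorm.qmf); the suffix loops above and below
+  # score each variant separately.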
+  if [ $stage -le 9 ] && [ "$do_voxsrc22" == "true" ];then
+    echo "Eval VoxSRC22 dev with Cosine scoring + Adaptive SNorm + QMF"
+    num_parts=16
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        $train_cmd $score_cosine_qmf_dir/log/voxsrc22_dev_${i}_${j}.log \
+          hyp_utils/conda_env.sh \
+          hyperion-eval-cosine-scoring-backend-with-qmf \
+          --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \
+          --ndx-file data/voxsrc22_dev/trials.csv \
+          --enroll-map-file data/voxsrc22_dev/enrollment.csv \
+          --score-file $score_cosine_qmf_dir/voxsrc22_dev_scores.csv \
+          --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \
+          --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --cohort-nbest 1000 --avg-cohort-by speaker \
+          --qmf-file $score_cosine_qmf_dir/qmf.h5 \
+          --enroll-part-idx $i --num-enroll-parts $num_parts \
+          --test-part-idx $j --num-test-parts $num_parts &
+        sleep 5s
+      done
+      sleep 10s
+    done
+    wait
+    for suffix in "" .snorm .snorm.qmf
+    do
+      (
+        hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \
+          --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+        $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxsrc22_dev$suffix.log \
+          hyperion-eval-verification-metrics \
+          --score-files $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \
+          --key-files data/voxsrc22_dev/trials.csv \
+          --score-names voxsrc22_dev \
+          --key-names all \
+          --output-file $score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv
+
+        echo "$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv:"
+        cat $score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv
+      ) &
+    done
+    wait
+  fi
+
+fi
+
diff --git a/egs/voxceleb/v2/README.md b/egs/voxceleb/v2/README.md
new file mode 100644
index 00000000..0bafe85e
--- /dev/null
+++ b/egs/voxceleb/v2/README.md
@@ -0,0 +1,164 @@
+# VoxCeleb V2
+
+Recipe for the VoxCeleb Speaker Verification Task using Wav2Vec2, WavLM or Hubert models from HuggingFace as feature extractors.
+
+## Differences w.r.t VoxCeleb V1 recipe
+
+## Citing
+
+## Training Data
+
+ - x-Vector network is trained on Voxceleb2 dev + test with augmentations
+   - MUSAN noise
+   - RIR reverberation
+
+## Test data
+
+ - Test data is VoxCeleb 1
+ - We evaluate 6 conditions:
+   - VoxCeleb-O (Original): Original VoxCeleb test set with 40 speakers
+   - VoxCeleb-O-cleaned: VoxCeleb-O cleaned-up of some errors
+   - VoxCeleb-E (Entire): List using all utterances of VoxCeleb1
+   - VoxCeleb-E-cleaned: VoxCeleb-E cleaned-up of some errors
+   - VoxCeleb-H (Hard): List of hard trials between all utterances of VoxCeleb1, restricted to same-gender and same-nationality trials.
+   - VoxCeleb-H-cleaned: VoxCeleb-H cleaned-up of some errors
+
+## Usage
+
+ - Run the run_0*.sh scripts in sequence
+ - By default they use the config global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh
+ - To use a different configuration, pass it to each script:
+```bash
+run_011_train_xvector.sh --config-file global_conf/other_config.sh
+run_030_extract_xvectors.sh --config-file global_conf/other_config.sh --use-gpu true
+run_040_eval_be.sh --config-file global_conf/other_config.sh
+```
+
+
+## Recipe Steps:
+
+ - `run_001_prepare_data.sh`
+   - Data preparation script to generate Kaldi style data directories for
+     - VoxCeleb2 train+test
+     - VoxCeleb1 O/E/H eval sets
+
+ - `run_002_compute_evad.sh`
+   - Computes Energy VAD for all datasets
+
+ - `run_003_prepare_noises_rirs.sh`
+   - Prepares MUSAN noise and music to be used by the SpeechAugment class.
+   - Creates Babble noise from MUSAN speech to be used by the SpeechAugment class.
+   - Prepares RIRs by compacting them into HDF5 files, to be used by the SpeechAugment class.
+
+ - `run_010_prepare_xvec_train_data.sh`
+   - Transforms all the audio files used to train the x-vector into a common format, e.g., .flac.
+   - Removes silence from the audio files
+   - Removes utterances shorter than 4 seconds and speakers with fewer than 8 utterances (see the sketch below).
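+     As a sketch, the equivalent filtering with the `hyperion-dataset` CLI
+     used by the v2.1 recipe would look like this (the actual commands in
+     this recipe's script may differ):
+     ```bash
+     hyperion-dataset remove_short_segments \
+         --dataset data/${nnet_data}_proc_audio \
+         --output-dataset data/${nnet_data}_filtered \
+         --length-name duration --min-length 4.0
+
+     hyperion-dataset remove_classes_few_segments \
+         --dataset data/${nnet_data}_filtered \
+         --class-name speaker --min-segs 8
+     ```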
+   - Creates training and validation lists for x-vector training
+
+ - `run_011_train_xvector.sh`
+   - Trains the x-vector model on frozen wav2vec features
+   - Finetunes wav2vec+x-vector model
+   - Large margin finetuning of wav2vec+x-vector model
+
+ - `run_030_extract_xvectors.sh`
+   - Extracts x-vectors for VoxCeleb2 or VoxCeleb2+augmentation for PLDA training
+   - Extracts x-vectors for VoxCeleb1 test sets
+
+ - `run_040_eval_be.sh`
+   - Trains PLDA and evaluates PLDA and cosine-scoring back-ends
+
+
+## Results
+
+### VoxCeleb 1 Original-Clean trial list
+
+| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) |
+| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: |
+| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.060 | 0.116 |
+| | | | Cosine + AS-Norm | 0.81 | 0.058 | 0.108 |
+| | | | Cosine + QMF | 0.75 | 0.054 | 0.086 |
+| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.89 | 0.069 | 0.108 |
+| | | | Cosine + AS-Norm | 0.86 | 0.067 | 0.108 |
+| | | | Cosine + QMF | 0.77 | 0.066 | 0.105 |
+| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.74 | 0.057 | 0.085 |
+| | | | Cosine + AS-Norm | 0.73 | 0.055 | 0.093 |
+| | | | Cosine + QMF | 0.66 | 0.051 | 0.094 |
+| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.74 | 0.053 | 0.080 |
+| | | | Cosine + AS-Norm | 0.71 | 0.050 | 0.087 |
+| | | | Cosine + QMF | 0.64 | 0.045 | 0.087 |
+| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.063 | 0.111 |
+| | | | Cosine + AS-Norm | 0.68 | 0.053 | 0.090 |
+| | | | Cosine + QMF | 0.63 | 0.048 | 0.071 |
+| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.14 | 0.074 | 0.107 |
+| | | | Cosine + AS-Norm | 0.94 | 0.060 | 0.089 |
+| | | | Cosine + QMF | 0.89 | 0.054 | 0.076 |
+
+### VoxCeleb 1 Entire-Clean trial list
+
+| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) |
+| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: |
+| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.81 | 0.051 | 0.087 |
+| | | | Cosine + AS-Norm | 0.78 | 0.047 | 0.083 |
+| | | | Cosine + QMF | 0.75 | 0.046 | 0.076 |
+| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.89 | 0.056 | 0.099 |
+| | | | Cosine + AS-Norm | 0.86 | 0.053 | 0.090 |
+| | | | Cosine + QMF | 0.82 | 0.050 | 0.085 |
+| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.80 | 0.049 | 0.088 |
+| | | | Cosine + AS-Norm | 0.76 | 0.045 | 0.080 |
+| | | | Cosine + QMF | 0.73 | 0.043 | 0.078 |
+| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.91 | 0.056 | 0.094 |
+| | | | Cosine + AS-Norm | 0.87 | 0.053 | 0.090 |
+| | | | Cosine + QMF | 0.83 | 0.050 | 0.086 |
+| 
config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.80 | 0.050 | 0.086 | +| | | | Cosine + AS-Norm | 0.73 | 0.045 | 0.074 | +| | | | Cosine + QMF | 0.69 | 0.042 | 0.069 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.99 | 0.058 | 0.103 | +| | | | Cosine + AS-Norm | 0.87 | 0.052 | 0.090 | +| | | | Cosine + QMF | 0.83 | 0.050 | 0.085 | + +### VoxCeleb 1 Hard-Clean trial list + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.73 | 0.113 | 0.182 | +| | | | Cosine + AS-Norm | 1.63 | 0.100 | 0.160 | +| | | | Cosine + QMF | 1.56 | 0.096 | 0.155 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.88 | 0.122 | 0.200 | +| | | | Cosine + AS-Norm | 1.77 | 0.110 | 0.175 | +| | | | Cosine + QMF | 1.66 | 0.104 | 0.168 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.67 | 0.103 | 0.165 | +| | | | Cosine + AS-Norm | 1.54 | 0.093 | 0.152 | +| | | | Cosine + QMF | 1.45 | 0.089 | 0.145 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.78 | 0.106 | 0.174 | +| | | | Cosine + AS-Norm | 1.70 | 0.099 | 0.162 | +| | | | Cosine + QMF | 1.61 | 0.094 | 0.153 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.49 | 0.087 | 0.137 | +| | | | Cosine + AS-Norm | 1.29 | 0.074 | 0.117 | +| | | | Cosine + QMF | 1.22 | 0.069 | 0.111 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.84 | 0.107 | 0.172 | +| | | | Cosine + AS-Norm | 1.47 | 0.083 | 0.128 | +| | | | Cosine + QMF | 1.39 | 0.079 | 0.123 | + +### VoxSRC2022 dev + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.60 | 0.163 | 0.257 | +| | | | Cosine + AS-Norm | 2.43 | 0.150 | 0.244 | +| | | | Cosine + QMF | 2.31 | 0.143 | 0.232 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.82 | 0.183 | 0.286 | +| | | | Cosine + AS-Norm | 2.69 | 0.168 | 0.265 | +| | | | Cosine + QMF | 2.52 | 0.158 | 0.252 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.65 | 0.176 | 0.289 | +| | | | Cosine + AS-Norm | 2.55 | 0.171 | 0.292 | +| | | | Cosine + QMF | 2.38 | 0.159 | 0.266 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.62 | 0.153 | 0.251 | +| | | | Cosine + AS-Norm | 2.53 | 0.149 | 0.247 | +| | | | Cosine + QMF | 2.42 | 0.144 | 0.231 | +| 
config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.225 | +| | | | Cosine + AS-Norm | 2.01 | 0.125 | 0.209 | +| | | | Cosine + QMF | 1.92 | 0.117 | 0.200 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.83 | 0.175 | 0.276 | +| | | | Cosine + AS-Norm | 2.31 | 0.149 | 0.244 | +| | | | Cosine + QMF | 2.22 | 0.137 | 0.229 | diff --git a/egs/voxceleb/v2/cmd.sh b/egs/voxceleb/v2/cmd.sh new file mode 100755 index 00000000..71f3bae0 --- /dev/null +++ b/egs/voxceleb/v2/cmd.sh @@ -0,0 +1,28 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +if [ "$(hostname -d)" == "cm.gemini" ];then + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 10G" + #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/voxceleb/v2/conf/clsp.conf b/egs/voxceleb/v2/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/voxceleb/v2/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/voxceleb/v2/conf/coe_gpu_bigmem.conf b/egs/voxceleb/v2/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/voxceleb/v2/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l 
hostname=r[237]n[01][0123456789]* diff --git a/egs/voxceleb/v2/conf/coe_gpu_long.conf b/egs/voxceleb/v2/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/voxceleb/v2/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/voxceleb/v2/conf/coe_gpu_rtx.conf b/egs/voxceleb/v2/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/voxceleb/v2/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/voxceleb/v2/conf/coe_gpu_short.conf b/egs/voxceleb/v2/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ b/egs/voxceleb/v2/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/voxceleb/v2/conf/coe_gpu_v100.conf b/egs/voxceleb/v2/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/voxceleb/v2/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/voxceleb/v2/conf/hubertbase_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/hubertbase_ecapatdnn512x2.yaml new file mode 100644 index 00000000..94bb31cc --- /dev/null +++ b/egs/voxceleb/v2/conf/hubertbase_ecapatdnn512x2.yaml @@ -0,0 +1,37 @@ +hf_feats: + pretrained_model_path: facebook/hubert-base-ls960 +xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +feat_fusion_start: 2 +feat_fusion_method: weighted-avg diff --git 
a/egs/voxceleb/v2/conf/lrsched_exp_default.yaml b/egs/voxceleb/v2/conf/lrsched_exp_default.yaml new file mode 100644 index 00000000..fe08b704 --- /dev/null +++ b/egs/voxceleb/v2/conf/lrsched_exp_default.yaml @@ -0,0 +1,7 @@ +lrsch_type: exp_lr +decay_rate: 0.5 +decay_steps: 8000 +hold_steps: 40000 +min_lr: 1.0e-05 +update_lr_on_opt_step: true +warmup_steps: 1000 diff --git a/egs/voxceleb/v2/conf/optim_adam_default.yaml b/egs/voxceleb/v2/conf/optim_adam_default.yaml new file mode 100644 index 00000000..b6620069 --- /dev/null +++ b/egs/voxceleb/v2/conf/optim_adam_default.yaml @@ -0,0 +1,6 @@ +opt_type: adam +lr: 0.05 +amsgrad: true +beta1: 0.9 +beta2: 0.95 +weight_decay: 1.0e-05 diff --git a/egs/voxceleb/v2/conf/reverb_noise_aug.yaml b/egs/voxceleb/v2/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/voxceleb/v2/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/voxceleb/v2/conf/train_data_default.yaml b/egs/voxceleb/v2/conf/train_data_default.yaml new file mode 100644 index 00000000..d41c1507 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_data_default.yaml @@ -0,0 +1,11 @@ +dataset: + max_chunk_length: 3.0 + min_chunk_length: 3.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 +sampler: + batch_size: 32 + iters_per_epoch: 6 +data_loader: + num_workers: 8 + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..ad991124 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git 
a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..01ad8897 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..254ff796 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml new file mode 100644 index 00000000..84ecfc04 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + 
target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5.5e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 4.4e-3 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 20 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml new file mode 100644 index 00000000..fdaff633 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2.3e-4 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 2e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 5 + eff_batch_size: 192 + train_mode: full + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..f424275d --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + 
min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: wav2vec2xlsr53_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml new file mode 100644 index 00000000..84ecfc04 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5.5e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 4.4e-3 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 20 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml new file mode 100644 index 00000000..58fe1d49 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - 
conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2.3e-4 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 2e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 2 + eff_batch_size: 192 + train_mode: full + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbase_ecapatdnn512x2_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbase_ecapatdnn512x2_default.yaml new file mode 100644 index 00000000..424c9bd6 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbase_ecapatdnn512x2_default.yaml @@ -0,0 +1,6 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +model: wavlmbase_ecapatdnn512x2.yaml +trainer: trainer_swa_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..86dec831 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: wavlmbaseplus6l_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_linfus_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_linfus_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..e22620ca --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_linfus_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + 
max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: wavlmbaseplus6l_linfus_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..9860abfa --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: wavlmbaseplus9l_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..52be6db5 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + 
min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_linfus_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_linfus_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..18b910d1 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_linfus_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: wavlmbaseplus9l_linfus_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_default.yaml new file mode 100644 index 00000000..8574a1cf --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_default.yaml @@ -0,0 +1,6 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +model: wavlmbaseplus_ecapatdnn512x3.yaml +trainer: trainer_phase1_sgd_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..d4db70a7 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,64 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - 
conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: wavlmbaseplus_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..bd3e7f86 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml new file mode 100644 index 00000000..ebeedde6 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4850 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-4 + warmup_steps: 2600 + update_lr_on_opt_step: true + 
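The exp_lr schedulers above (warmup_steps, hold_steps, decay_steps, decay_rate, min_lr) describe a warmup-hold-decay curve, applied per optimizer step since update_lr_on_opt_step is true. A sketch of one plausible reading of those fields, with the stage-1 values as defaults; the exact Hyperion semantics are assumed, not quoted from its source:

```python
def exp_lr(step: int, base_lr: float = 0.45, warmup_steps: int = 1500,
           hold_steps: int = 1500, decay_steps: int = 4200,
           decay_rate: float = 0.5, min_lr: float = 4e-4) -> float:
    """Linear warmup -> hold -> exponential decay, floored at min_lr."""
    if step < warmup_steps:               # ramp up from 0 to base_lr
        return base_lr * step / max(1, warmup_steps)
    if step < warmup_steps + hold_steps:  # hold at base_lr
        return base_lr
    t = step - warmup_steps - hold_steps  # halve every decay_steps (rate 0.5)
    return max(min_lr, base_lr * decay_rate ** (t / decay_steps))
```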
use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml new file mode 100644 index 00000000..84ecfc04 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5.5e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 4.4e-3 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 20 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..f8e620c1 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,64 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml new file mode 100644 index 00000000..58fe1d49 --- /dev/null +++ 
b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2.3e-4 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 2e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 2 + eff_batch_size: 192 + train_mode: full + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..5013e5af --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_linfus_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_linfus_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..8c00d0fa --- /dev/null +++ 
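The stage-3 samplers above set num_hard_prototypes: 8, which suggests batches are drawn from clusters of confusable speakers rather than uniformly. An illustrative sketch of picking the hardest classes from speaker prototypes; hard_prototype_classes and the cosine-similarity criterion are assumptions, not the toolkit's own code:

```python
import torch
import torch.nn.functional as F

def hard_prototype_classes(protos: torch.Tensor, class_idx: torch.Tensor,
                           k: int = 8) -> torch.Tensor:
    """For each sampled class, return the k most similar (hardest) classes
    by cosine similarity between class prototypes of shape (n_classes, dim)."""
    p = F.normalize(protos, dim=1)
    sim = p @ p.T
    sim.fill_diagonal_(-1.0)              # a class is not its own hard negative
    return sim[class_idx].topk(k, dim=1).indices

protos = torch.randn(100, 192)            # 100 speakers, 192-dim prototypes
print(hard_prototype_classes(protos, torch.tensor([0, 5])).shape)  # (2, 8)
```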
b/egs/voxceleb/v2/conf/train_wavlmbaseplus_linfus_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 + data_loader: + num_workers: 0 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: wavlmbaseplus_linfus_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..ad699556 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: wavlmlarge12l_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..abe5da6e --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: 
+ sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmlarge12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..9fec8986 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,64 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml new file mode 100644 index 00000000..58fe1d49 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: 
uniform + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2.3e-4 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 2e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 2 + eff_batch_size: 192 + train_mode: full + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..5013e5af --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge6l_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge6l_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..9602d562 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge6l_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: 
wavlmlarge6l_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..37b085f3 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: wavlmlarge_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..2addaa1e --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmlarge_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml 
b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml new file mode 100644 index 00000000..84ecfc04 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5.5e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 4.4e-3 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 20 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..f8e620c1 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,64 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml new file mode 100644 index 00000000..58fe1d49 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + 
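The xvector overrides above (cos_scale, margin, intertop_k, intertop_margin) configure an additive-angular-margin softmax with an extra penalty on the hardest non-target speakers. A self-contained sketch of that idea, following the usual AAM plus inter-top-k formulation rather than Hyperion's exact implementation:

```python
import torch
import torch.nn.functional as F

def aam_intertopk_logits(emb, weight, labels, scale=32.0, margin=0.2,
                         intertop_k=5, intertop_margin=0.1):
    """AAM-softmax logits with a negative margin on the k hardest negatives."""
    cos = F.normalize(emb) @ F.normalize(weight).T          # (batch, n_spks)
    theta = torch.acos(cos.clamp(-1 + 1e-7, 1 - 1e-7))
    rows = torch.arange(len(labels))
    logits = cos.clone()
    logits[rows, labels] = torch.cos(theta[rows, labels] + margin)
    masked = cos.clone()
    masked[rows, labels] = -2.0                             # exclude the target
    topk = masked.topk(intertop_k, dim=1).indices           # hardest negatives
    logits.scatter_(1, topk, torch.cos(theta.gather(1, topk) - intertop_margin))
    return scale * logits

emb, w = torch.randn(4, 192), torch.randn(1000, 192)
print(aam_intertopk_logits(emb, w, torch.tensor([1, 2, 3, 4])).shape)
```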
sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2.3e-4 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 2e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 2 + eff_batch_size: 192 + train_mode: full + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..2ea1589d --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2/conf/trainer_swa_default.yaml b/egs/voxceleb/v2/conf/trainer_swa_default.yaml new file mode 100644 index 00000000..c45e3eb5 --- /dev/null +++ b/egs/voxceleb/v2/conf/trainer_swa_default.yaml @@ -0,0 +1,9 @@ +optim: optim_adam_default.yaml +lrsched: lrsched_exp_default.yaml +use_amp: true +log_interval: 1000 +epochs: 63 +eff_batch_size: 512 +swa_start: 60 +swa_lr: 1e-4 +swa_anneal_epochs: 1 diff --git a/egs/voxceleb/v2/conf/vad_16k.yaml b/egs/voxceleb/v2/conf/vad_16k.yaml new file mode 100644 index 00000000..5fb0111c --- /dev/null +++ 
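trainer_swa_default.yaml above turns on stochastic weight averaging for the final epochs (swa_start: 60 of epochs: 63, swa_lr: 1e-4, swa_anneal_epochs: 1). A sketch of how those three fields map onto PyTorch's SWA utilities; the toy model and training-loop wiring are placeholders, not Hyperion's trainer:

```python
import torch
import torch.nn as nn
from torch.optim.swa_utils import AveragedModel, SWALR

model = nn.Linear(10, 2)                 # stand-in for the real x-vector net
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
swa_model = AveragedModel(model)
swa_sched = SWALR(opt, swa_lr=1e-4, anneal_epochs=1)  # swa_lr, swa_anneal_epochs

for epoch in range(63):                  # epochs: 63
    # ... one training epoch over the data would run here ...
    if epoch >= 60:                      # swa_start: 60
        swa_model.update_parameters(model)
        swa_sched.step()
```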
b/egs/voxceleb/v2/conf/vad_16k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 16000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/voxceleb/v2/conf/val_data_default.yaml b/egs/voxceleb/v2/conf/val_data_default.yaml new file mode 100644 index 00000000..72c77204 --- /dev/null +++ b/egs/voxceleb/v2/conf/val_data_default.yaml @@ -0,0 +1,11 @@ +dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 +sampler: + batch_size: 32 + iters_per_epoch: 6 +data_loader: + num_workers: 8 + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/wav2vec2base960h_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wav2vec2base960h_ecapatdnn512x2.yaml new file mode 100644 index 00000000..85964372 --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2base960h_ecapatdnn512x2.yaml @@ -0,0 +1,37 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h +xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +feat_fusion_start: 2 +feat_fusion_method: weighted-avg diff --git a/egs/voxceleb/v2/conf/wav2vec2base_do1_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wav2vec2base_do1_ecapatdnn512x2.yaml new file mode 100644 index 00000000..f616073c --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2base_do1_ecapatdnn512x2.yaml @@ -0,0 +1,42 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base + override_dropouts: true + activation_dropout: 0.1 + attention_dropout: 0.2 + hidden_dropout: 0.2 + feat_proj_dropout: 0.2 +xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +feat_fusion_start: 2 +feat_fusion_method: weighted-avg diff --git a/egs/voxceleb/v2/conf/wav2vec2base_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wav2vec2base_ecapatdnn512x2.yaml new file mode 100644 index 00000000..fb7c7cde --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2base_ecapatdnn512x2.yaml @@ -0,0 +1,37 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base +xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 
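vad_16k.yaml above parameterizes a Kaldi-style energy VAD: a frame counts as speech when enough frames in its context window exceed an energy threshold shifted by a fraction of the utterance's mean log-energy. A NumPy sketch under that (assumed) reading of the fields:

```python
import numpy as np

def energy_vad(log_e: np.ndarray, energy_threshold: float = 5.5,
               energy_mean_scale: float = 0.5,
               proportion_threshold: float = 0.12,
               frames_context: int = 2) -> np.ndarray:
    """Per-frame speech/non-speech decisions from frame log-energies."""
    t = energy_threshold + energy_mean_scale * log_e.mean()
    above = log_e > t
    out = np.zeros(len(log_e), dtype=bool)
    for i in range(len(log_e)):
        lo = max(0, i - frames_context)
        hi = min(len(log_e), i + frames_context + 1)
        out[i] = above[lo:hi].mean() >= proportion_threshold
    return out

quiet, loud = np.full(50, 3.0), np.full(50, 9.0)
print(energy_vad(np.concatenate([quiet, loud])).sum())  # roughly 50 voiced frames
```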
+feat_fusion_start: 2 +feat_fusion_method: weighted-avg diff --git a/egs/voxceleb/v2/conf/wav2vec2base_specaug1_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wav2vec2base_specaug1_ecapatdnn512x2.yaml new file mode 100644 index 00000000..921f21a6 --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2base_specaug1_ecapatdnn512x2.yaml @@ -0,0 +1,39 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base + override_spec_augment: true + mask_time_prob: 0.5 +xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +feat_fusion_start: 2 +feat_fusion_method: weighted-avg diff --git a/egs/voxceleb/v2/conf/wav2vec2base_specaug2_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wav2vec2base_specaug2_ecapatdnn512x2.yaml new file mode 100644 index 00000000..410fd521 --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2base_specaug2_ecapatdnn512x2.yaml @@ -0,0 +1,39 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base + override_spec_augment: true + mask_time_prob: 0.25 +xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +feat_fusion_start: 2 +feat_fusion_method: weighted-avg diff --git a/egs/voxceleb/v2/conf/wav2vec2base_specaug3_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wav2vec2base_specaug3_ecapatdnn512x2.yaml new file mode 100644 index 00000000..96c70f98 --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2base_specaug3_ecapatdnn512x2.yaml @@ -0,0 +1,39 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base + override_spec_augment: true + mask_time_prob: 0.125 +xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +feat_fusion_start: 2 +feat_fusion_method: weighted-avg diff --git a/egs/voxceleb/v2/conf/wav2vec2base_specaug4_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wav2vec2base_specaug4_ecapatdnn512x2.yaml new file mode 100644 index 00000000..bb4613da --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2base_specaug4_ecapatdnn512x2.yaml @@ -0,0 +1,39 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base + override_spec_augment: true + mask_time_prob: 0.0625 +xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + 
resb_type: seres2bn + resb_repeats: + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +feat_fusion_start: 2 +feat_fusion_method: weighted-avg diff --git a/egs/voxceleb/v2/conf/wav2vec2base_specaug5_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wav2vec2base_specaug5_ecapatdnn512x2.yaml new file mode 100644 index 00000000..bf67ce48 --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2base_specaug5_ecapatdnn512x2.yaml @@ -0,0 +1,40 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base + override_spec_augment: true + mask_time_prob: 0.150 + mask_feature_prob: 0.150 +xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +feat_fusion_start: 2 +feat_fusion_method: weighted-avg diff --git a/egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..c3466259 --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml new file mode 100644 index 00000000..1cc7df4c --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml @@ -0,0 +1,40 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 1024 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: 
weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..dc3737e3 --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wav2vec2xlsr53_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wav2vec2xlsr53_ecapatdnn512x3.yaml new file mode 100644 index 00000000..1975bada --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2xlsr53_ecapatdnn512x3.yaml @@ -0,0 +1,40 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-large-xlsr-53 +xvector: + resnet_enc: + in_feats: 1024 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus6l_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus6l_ecapatdnn512x3.yaml new file mode 100644 index 00000000..dbe4ff65 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus6l_ecapatdnn512x3.yaml @@ -0,0 +1,41 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 6 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus6l_linfus_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus6l_linfus_ecapatdnn512x3.yaml new file mode 100644 index 00000000..99a3778b --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus6l_linfus_ecapatdnn512x3.yaml @@ -0,0 +1,41 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 6 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + 
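The drop_layers_gt field above (6 in the wavlmbaseplus6l configs, 9 and 12 in the deeper variants) truncates the pretrained encoder so only the lowest transformer layers feed the x-vector head. An illustrative reading using the Hugging Face API; Hyperion's own truncation code may differ:

```python
from transformers import WavLMModel

# Keep only the first 6 transformer layers, as drop_layers_gt: 6 suggests
# (downloads pretrained weights on first use).
model = WavLMModel.from_pretrained("microsoft/wavlm-base-plus")
model.encoder.layers = model.encoder.layers[:6]
model.config.num_hidden_layers = 6
print(sum(p.numel() for p in model.parameters()))  # fewer params after truncation
```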
resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: linear +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3.yaml new file mode 100644 index 00000000..ddbf3ca4 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3.yaml @@ -0,0 +1,41 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 9 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..d7e3388f --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 9 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus9l_linfus_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus9l_linfus_ecapatdnn512x3.yaml new file mode 100644 index 00000000..90b0fbef --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus9l_linfus_ecapatdnn512x3.yaml @@ -0,0 +1,41 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 9 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: linear 
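feat_fusion_method selects how the hidden states of the speech encoder are combined before the ECAPA encoder: weighted-avg suggests a softmax-weighted average, linear an unconstrained learned combination, both starting at layer feat_fusion_start. A minimal sketch of the weighted-avg case (semantics assumed):

```python
import torch
import torch.nn as nn

class WeightedAvgFusion(nn.Module):
    """Softmax-weighted average over a list of hidden-state tensors."""
    def __init__(self, num_layers: int):
        super().__init__()
        self.w = nn.Parameter(torch.zeros(num_layers))

    def forward(self, hidden_states):     # list of (batch, time, feat) tensors
        w = torch.softmax(self.w, dim=0)
        return sum(wi * h for wi, h in zip(w, hidden_states))

layers = [torch.randn(2, 50, 768) for _ in range(11)]   # e.g. layers 2..12
print(WeightedAvgFusion(len(layers))(layers).shape)     # torch.Size([2, 50, 768])
```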
+feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3.yaml new file mode 100644 index 00000000..69b85d8d --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3.yaml @@ -0,0 +1,40 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..b2430d97 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus_linfus_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus_linfus_ecapatdnn512x3.yaml new file mode 100644 index 00000000..6f1e9f56 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus_linfus_ecapatdnn512x3.yaml @@ -0,0 +1,40 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: linear +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3.yaml new file mode 100644 index 00000000..0de43fd4 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3.yaml @@ -0,0 +1,41 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 1024 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + 
resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..5025f047 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmlarge6l_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmlarge6l_ecapatdnn512x3.yaml new file mode 100644 index 00000000..062137f3 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmlarge6l_ecapatdnn512x3.yaml @@ -0,0 +1,41 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large + drop_layers_gt: 6 +xvector: + resnet_enc: + in_feats: 1024 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3.yaml new file mode 100644 index 00000000..f36ac70c --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3.yaml @@ -0,0 +1,40 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large +xvector: + resnet_enc: + in_feats: 1024 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml new file mode 
100644 index 00000000..0a6303f5 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2
diff --git a/egs/voxceleb/v2/datapath.sh b/egs/voxceleb/v2/datapath.sh new file mode 100644 index 00000000..9a2f7529 --- /dev/null +++ b/egs/voxceleb/v2/datapath.sh @@ -0,0 +1,22 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + # voxceleb1_root=/export/corpora5/VoxCeleb1_v1 #voxceleb1 v1 + voxceleb1_root=/export/corpora5/VoxCeleb1_v2 #voxceleb1 v2 + voxceleb2_root=/export/corpora5/VoxCeleb2 + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 + voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 + voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + musan_root=/expscratch/dgromero/corpora-open/musan +else + echo "Put your database paths here" + exit 1 +fi + +
diff --git a/egs/voxceleb/v2/default_config.sh b/egs/voxceleb/v2/default_config.sh new file mode 120000 index 00000000..f2d8812d --- /dev/null +++ b/egs/voxceleb/v2/default_config.sh @@ -0,0 +1 @@ +global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh \ No newline at end of file
diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2xlr300m_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2xlr300m_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..8144f6eb --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2xlr300m_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,50 @@ +# Wav2Vec2 Multilingual 300M params (XLS-R) + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +# nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 +
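These `global_conf/*.sh` files are sourced by every run script right after `parse_options.sh`, so a different configuration can be selected per invocation without repointing the `default_config.sh` symlink. A minimal sketch, assuming the usual Kaldi-style option parsing that maps `--config-file` to `$config_file`:

```bash
# hypothetical invocation: train with the XLS-R 300M config instead of the default
./run_011_train_xvector.sh --config-file global_conf/config_wav2vec2xlr300m_ecapatdnn512x3_v1.0.sh
```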
diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2xlr53_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2xlr53_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..014a5d03 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2xlr53_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# Wav2Vec2 Multilingual XLSR-53 + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr53 + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 +
diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..67a4665e --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params layers 2-12 + +# hugging face model +hf_model_name=wav2vec2xlsr300m12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 +
diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..80ee785b --- /dev/null +++ 
b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus6l_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus6l_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..d02c11f7 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus6l_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus6l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus6l_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus6l_linfus_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus6l_linfus_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..639225c3 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus6l_linfus_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus6l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + 
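+# nnet_type selects the model class; from the naming, hf_wavlm2resnet1d presumably
+# couples a HuggingFace WavLM front-end with a ResNet1d (ECAPA-style) x-vector head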
+nnet_s1_base_cfg=conf/train_wavlmbaseplus6l_linfus_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_linfus_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..58bded52 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus9l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..c2b30f68 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus9l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name 
+nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_linfus_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_linfus_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..4553f40b --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_linfus_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus9l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus9l_linfus_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_linfus_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..7d39995d --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + 
plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..373535c2 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_linfus_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_linfus_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..c75280f0 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_linfus_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus_linfus_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_linfus_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..11425baa --- /dev/null 
+++ b/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM large trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 (first 12 layers) + +# hugging face model +hf_model_name=wavlmlarge12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 +
diff --git a/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..530096cc --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM large trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 (first 12 layers) + +# hugging face model +hf_model_name=wavlmlarge12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 +
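The `6l`/`9l`/`12l` variants appear to differ from the full models only in how much of the pretrained encoder they keep: the matching `conf/*.yaml` files set `drop_layers_gt`, as in `wavlmlarge12l_ecapatdnn512x3.yaml` above. A minimal sketch of the relevant stanza:

```yaml
hf_feats:
  pretrained_model_path: microsoft/wavlm-large
  drop_layers_gt: 12  # assumed semantics: discard transformer layers above the 12th
```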
diff --git a/egs/voxceleb/v2/global_conf/config_wavlmlarge6l_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmlarge6l_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..e3c9466b --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmlarge6l_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM large trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 (first 6 layers) + +# hugging face model +hf_model_name=wavlmlarge6l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge6l_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 +
diff --git a/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..8e870abe --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM large trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 +
diff --git a/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..1b276bcd --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM large trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth +
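+# stage s3 resumes from the s2 checkpoint above; run_011_train_xvector.sh passes
+# it via --in-model-file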
+nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/hyp_utils b/egs/voxceleb/v2/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/voxceleb/v2/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/voxceleb/v2/local b/egs/voxceleb/v2/local new file mode 120000 index 00000000..2ac14857 --- /dev/null +++ b/egs/voxceleb/v2/local @@ -0,0 +1 @@ +../v1.1/local \ No newline at end of file diff --git a/egs/voxceleb/v2/path.sh b/egs/voxceleb/v2/path.sh new file mode 100755 index 00000000..6994fdab --- /dev/null +++ b/egs/voxceleb/v2/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. $TOOLS_ROOT/path.sh diff --git a/egs/voxceleb/v2/run_001_prepare_data.sh b/egs/voxceleb/v2/run_001_prepare_data.sh new file mode 100755 index 00000000..44385610 --- /dev/null +++ b/egs/voxceleb/v2/run_001_prepare_data.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. datapath.sh +. $config_file + +if [ $stage -le 1 ];then + # Prepare the VoxCeleb2 dataset for training. + local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train +fi + +if [ $stage -le 2 ];then + # prepare voxceleb1 for test + # This script is for the old version of the dataset + # local/make_voxceleb1_oeh.pl $voxceleb1_root data + # Use this for the newer version of voxceleb1: + local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data +fi + +if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then + local/prepare_voxsrc22_dev.py \ + --vox1-corpus-dir $voxceleb1_root \ + --voxsrc22-corpus-dir $voxsrc22_root \ + --output-dir data/voxsrc22_dev +fi + +# if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then +# local/prepare_voxsrc22_test.py \ +# --corpus-dir $voxsrc22_root \ +# --output-dir data/voxsrc22_test +# fi + +if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then + # # split vox2 into 2 parts, for cohort and qmf training + local/make_vox2_trials.py --data-dir data/voxceleb2cat_train +fi diff --git a/egs/voxceleb/v2/run_002_compute_evad.sh b/egs/voxceleb/v2/run_002_compute_evad.sh new file mode 100755 index 00000000..1248ad39 --- /dev/null +++ b/egs/voxceleb/v2/run_002_compute_evad.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=fs01 +storage_name=$(date +'%m_%d_%H_%M') +vaddir=`pwd`/exp/vad_e +vad_config=conf/vad_16k.yaml + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file + + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $vaddir/storage ]; then + dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage + if [ "$nodes" == "b0" ];then + utils/create_split_dir.pl \ + /export/b{04,05,06,07}/$dir_name $vaddir/storage + elif [ "$nodes" == "b1" ];then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$dir_name $vaddir/storage + elif [ "$nodes" == "c0" ];then + utils/create_split_dir.pl \ + /export/c{06,07,08,09}/$dir_name $vaddir/storage + elif [ "$nodes" == "fs01" ];then + utils/create_split_dir.pl \ + /export/fs01/$dir_name $vaddir/storage + else + echo "we don't distribute data between multiple machines" + fi + fi +fi + +if [ $stage -le 2 ];then + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 40 ? $num_spk:40)) + hyp_utils/feats/make_evad.sh \ + --write-utt2num-frames true \ + --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ + data/${name} exp/make_vad/$name $vaddir + utils/fix_data_dir.sh data/${name} + done +fi + diff --git a/egs/voxceleb/v2/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v2/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..a448af9a --- /dev/null +++ b/egs/voxceleb/v2/run_003_prepare_noises_rirs.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +# We prepare the noise files and RIR for online speech augmentation + +if [ $stage -le 1 ]; then + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root 16 data + + for name in musan_noise musan_music + do + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_proc_audio exp/${name}_proc_audio + utils/fix_data_dir.sh data/${name}_proc_audio + done + +fi + +if [ $stage -le 2 ]; then + + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_babble exp/${name}_babble + # utils/fix_data_dir.sh data/${name}_babble + done +fi + +if [ $stage -le 3 ]; then + if [ ! 
-d "RIRS_NOISES" ]; then + if [ -d ../../sre19-cmn2/v1/RIRS_NOISES ];then + ln -s ../../sre19-cmn2/v1/RIRS_NOISES + else + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + fi + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 16 data/rirs_mediumroom + local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 16 data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + #pack all rirs in h5 files + steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs + done + +fi + + diff --git a/egs/voxceleb/v2/run_010_prepare_xvec_train_data.sh b/egs/voxceleb/v2/run_010_prepare_xvec_train_data.sh new file mode 100755 index 00000000..5936fbf4 --- /dev/null +++ b/egs/voxceleb/v2/run_010_prepare_xvec_train_data.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file + +if [ $stage -le 2 ]; then + # This script preprocess audio for x-vector training + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') --use-bin-vad true \ + data/${nnet_data} data/${nnet_data}_proc_audio_no_sil exp/${nnet_data}_proc_audio_no_sil + hyp_utils/kaldi/utils/fix_data_dir.sh data/${nnet_data}_proc_audio_no_sil + +fi + +if [ $stage -le 3 ]; then + # Now, we remove files with less than 4s + hyp_utils/remove_short_audios.sh --min-len 4 data/${nnet_data}_proc_audio_no_sil + + # We also want several utterances per speaker. Now we'll throw out speakers + # with fewer than 4 utterances. + hyp_utils/remove_spk_few_utts.sh --min-num-utts 4 data/${nnet_data}_proc_audio_no_sil + +fi + +if [ $stage -le 4 ]; then + # Prepare train and validation lists for x-vectors + local/make_train_lists_sup_embed_with_augm.sh \ + data/${nnet_data}_proc_audio_no_sil \ + data/${nnet_data}_proc_audio_no_sil/lists_xvec +fi + +exit diff --git a/egs/voxceleb/v2/run_011_train_xvector.sh b/egs/voxceleb/v2/run_011_train_xvector.sh new file mode 100755 index 00000000..bc3b5420 --- /dev/null +++ b/egs/voxceleb/v2/run_011_train_xvector.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +list_dir=data/${nnet_data}_proc_audio_no_sil + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_s1_dir $args \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir $args \ + --num-gpus $ngpu \ + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --in-model-file $nnet_s2 \ + --trainer.exp-path $nnet_s3_dir $args \ + --num-gpus $ngpu \ + +fi + diff --git a/egs/voxceleb/v2/run_030_extract_xvectors.sh b/egs/voxceleb/v2/run_030_extract_xvectors.sh new file mode 100755 index 00000000..16f29841 --- /dev/null +++ b/egs/voxceleb/v2/run_030_extract_xvectors.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. 
./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=3 +config_file=default_config.sh +use_gpu=false +hf_chunk_length=120 #seconds +xvec_chunk_length=120 #seconds +. parse_options.sh || exit 1; +. $config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu true --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" + xvec_cmd="$cuda_eval_cmd --mem 6G" +else + xvec_cmd="$train_cmd --mem 12G" +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +fi + +xvector_dir=exp/xvectors/$nnet_name + +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then + # Extract xvectors for training LDA/PLDA + for name in voxceleb2cat_train + do + if [ $plda_num_augs -eq 0 ]; then + steps_xvec/extract_wav2vec2xvectors.sh \ + --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ + --random-utt-length true --min-utt-length 2 --max-utt-length 30 \ + $nnet data/${name} \ + $xvector_dir/${name} + else + steps_xvec/extract_wav2vec2xvectors.sh \ + --cmd "$xvec_cmd" --nj 300 ${xvec_args} \ + --random-utt-length true --min-utt-length 2 --max-utt-length 30 \ + --aug-config $plda_aug_config --num-augs $plda_num_augs \ + $nnet data/${name} \ + $xvector_dir/${name}_augx${plda_num_augs} \ + data/${name}_augx${plda_num_augs} + fi + done +fi + +if [ $stage -le 2 ]; then + # Extracts x-vectors for evaluation + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 100 ? $num_spk:100)) + steps_xvec/extract_wav2vec2xvectors.sh \ + --cmd "$xvec_cmd" --nj $nj ${xvec_args} \ + $nnet data/$name \ + $xvector_dir/$name + done +fi + diff --git a/egs/voxceleb/v2/run_040_eval_be.sh b/egs/voxceleb/v2/run_040_eval_be.sh new file mode 100755 index 00000000..0982abeb --- /dev/null +++ b/egs/voxceleb/v2/run_040_eval_be.sh @@ -0,0 +1,349 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=3 +config_file=default_config.sh + + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +plda_label=${plda_type}y${plda_y_dim}_v1 +be_name=lda${lda_dim}_${plda_label}_${plda_data} + +xvector_dir=exp/xvectors/$nnet_name +be_dir=exp/be/$nnet_name/$be_name +score_dir=exp/scores/$nnet_name/${be_name} +score_plda_dir=$score_dir/plda +score_cosine_dir=exp/scores/$nnet_name/cosine +score_cosine_snorm_dir=exp/scores/$nnet_name/cosine_snorm +score_cosine_qmf_dir=exp/scores/$nnet_name/cosine_qmf + + +if [ "$do_plda" == "true" ];then + if [ $stage -le 1 ]; then + echo "Train PLDA on Voxceleb2" + steps_be/train_be_v1.sh \ + --cmd "$train_cmd" \ + --lda_dim $lda_dim \ + --plda_type $plda_type \ + --y_dim $plda_y_dim --z_dim $plda_z_dim \ + $xvector_dir/$plda_data/xvector.scp \ + data/$plda_data \ + $be_dir + + fi + + + if [ $stage -le 2 ];then + echo "Eval Voxceleb 1 with LDA+CentWhiten+LNorm+PLDA" + steps_be/eval_be_v1.sh \ + --cmd "$train_cmd" --plda_type $plda_type \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $be_dir/lda_lnorm.h5 \ + $be_dir/plda.h5 \ + $score_plda_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + fi +fi + + + +if [ $stage -le 3 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos.sh \ + --cmd "$train_cmd" \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $score_cosine_dir/voxceleb1_scores + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_dir + + for f in $(ls $score_cosine_dir/*_results); + do + echo $f + cat $f + echo "" + done + +fi + +if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + + echo "Eval voxsrc2 with Cosine scoring" + steps_be/eval_be_cos.sh --cmd "$train_cmd" \ + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + $score_cosine_dir/voxsrc22_dev_scores & + + # steps_be/eval_be_cos.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # $score_cosine_dir/voxsrc22_test_scores + + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_dir + + for f in $(ls $score_cosine_dir/voxsrc22_dev_results); + do + echo $f + cat $f + echo "" + done + +fi + + +if [ "$do_snorm" == "true" ];then + if [ $stage -le 5 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" + steps_be/eval_be_cos_snorm.sh \ + --cmd "$train_cmd --mem 22G" --coh-nbest 1000 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_snorm_dir/voxceleb1_scores + + $train_cmd --mem 10G 
--num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_snorm_dir + + for f in $(ls $score_cosine_snorm_dir/*_results); + do + echo $f + cat $f + echo "" + done + fi + + if [ $stage -le 6 ];then + echo "Eval voxsrc2 with Cosine scoring" + steps_be/eval_be_cos_snorm.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_snorm_dir/voxsrc22_dev_scores & + + # steps_be/eval_be_cos_snorm.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # data/voxceleb2cat_train/utt2spk \ + # $xvector_dir/voxceleb2cat_train/xvector.scp \ + # $score_cosine_snorm_dir/voxsrc22_test_scores + + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_snorm_dir + + for f in $(ls $score_cosine_snorm_dir/voxsrc22_dev_results); + do + echo $f + cat $f + echo "" + done + fi +fi + + +if [ "$do_qmf" == "true" ];then + if [ $stage -le 7 ];then + awk '{ print $1, $2*100}' \ + $xvector_dir/voxceleb2cat_train/utt2speech_dur \ + > $xvector_dir/voxceleb2cat_train/utt2num_frames + + echo "Train QMF in Vox2" + steps_be/train_be_cos_qmf.sh \ + --cmd "$train_cmd" --coh-nbest 1000 \ + data/voxceleb2cat_train/trials \ + data/voxceleb2cat_train/utt2model \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $xvector_dir/voxceleb2cat_train/utt2num_frames \ + data/voxceleb2cat_train/snorm_utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/voxceleb2_qmf_scores + + fi + + if [ $stage -le 8 ];then + awk '{ print $1, $2*100}' \ + $xvector_dir/voxceleb1_test/utt2speech_dur \ + > $xvector_dir/voxceleb1_test/utt2num_frames + + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos_qmf.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $xvector_dir/voxceleb1_test/utt2num_frames \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/qmf.h5 \ + $score_cosine_qmf_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_snorm.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _snorm + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_qmf.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _qmf + + for f in $(ls $score_cosine_qmf_dir/voxceleb1{,_snorm,_qmf}_[oeh]_clean_results); + do + echo $f + cat $f + echo "" + done + + fi + + if [ $stage -le 9 ];then + awk '{ print $1, $2*100}' \ + $xvector_dir/voxsrc22_dev/utt2speech_dur \ + > $xvector_dir/voxsrc22_dev/utt2num_frames + + echo "Eval voxsrc2 with Cosine scoring" + steps_be/eval_be_cos_qmf.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + $xvector_dir/voxsrc22_dev/utt2num_frames \ + data/voxceleb2cat_train/utt2spk \ + 
+      $xvector_dir/voxceleb2cat_train/xvector.scp \
+      $score_cosine_qmf_dir/qmf.h5 \
+      $score_cosine_qmf_dir/voxsrc22_dev_scores &
+
+    # awk '{ print $1, $2*100}' \
+    #   $xvector_dir/voxsrc22_test/utt2speech_dur \
+    #   > $xvector_dir/voxsrc22_test/utt2num_frames
+    # steps_be/eval_be_cos_qmf.sh --cmd "$train_cmd" \
+    #   data/voxsrc22_test/trials \
+    #   data/voxsrc22_test/utt2model \
+    #   $xvector_dir/voxsrc22_test/xvector.scp \
+    #   $xvector_dir/voxsrc22_test/utt2num_frames \
+    #   data/voxceleb2cat_train/utt2spk \
+    #   $xvector_dir/voxceleb2cat_train/xvector.scp \
+    #   $score_cosine_qmf_dir/qmf.h5 \
+    #   $score_cosine_qmf_dir/voxsrc22_test_scores
+
+    wait
+    $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev.log \
+      local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir
+    $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_snorm.log \
+      local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _snorm
+    $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_qmf.log \
+      local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _qmf
+
+    for f in $(ls $score_cosine_qmf_dir/voxsrc22_dev{,_snorm,_qmf}_results);
+    do
+      echo $f
+      cat $f
+      echo ""
+    done
+  fi
+
+fi
+
+if [ "$do_pca" != "true" ];then
+  exit 0
+fi
+
+
+be_name=pca_r${pca_var_r}
+
+xvector_dir=exp/xvectors/$nnet_name
+be_dir=exp/be/$nnet_name/$be_name
+score_dir=exp/scores/$nnet_name/${be_name}
+score_cosine_dir=exp/scores/$nnet_name/$be_name/cosine
+score_cosine_snorm_dir=exp/scores/$nnet_name/$be_name/cosine_snorm
+score_cosine_qmf_dir=exp/scores/$nnet_name/$be_name/cosine_qmf
+
+if [ $stage -le 10 ]; then
+  echo "Train projection on Voxceleb2"
+  $train_cmd $be_dir/log/train_be.log \
+    hyp_utils/conda_env.sh \
+    steps_be/train_be_proj_v1.py \
+    --v-file scp:$xvector_dir/$plda_data/xvector.scp \
+    --train-list data/$plda_data/utt2spk \
+    --output-path $be_dir \
+    --pca.pca-var-r $pca_var_r
+
+fi
+
+
+if [ $stage -le 11 ];then
+
+  echo "Eval Voxceleb 1 with PCA + Cosine scoring"
+  steps_be/eval_be_cos.sh \
+    --cmd "$train_cmd" \
+    --preproc-file $be_dir/preproc.h5 \
+    data/voxceleb1_test/trials \
+    data/voxceleb1_test/utt2model \
+    $xvector_dir/voxceleb1_test/xvector.scp \
+    $score_cosine_dir/voxceleb1_scores
+
+  $train_cmd --mem 10G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \
+    local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_dir
+
+  for f in $(ls $score_cosine_dir/*_results);
+  do
+    echo $f
+    cat $f
+    echo ""
+  done
+
+fi
diff --git a/egs/voxceleb/v2/steps b/egs/voxceleb/v2/steps
new file mode 120000
index 00000000..aede39fe
--- /dev/null
+++ b/egs/voxceleb/v2/steps
@@ -0,0 +1 @@
+hyp_utils/kaldi/steps
\ No newline at end of file
diff --git a/egs/voxceleb/v2/steps_be b/egs/voxceleb/v2/steps_be
new file mode 120000
index 00000000..b2098c2a
--- /dev/null
+++ b/egs/voxceleb/v2/steps_be
@@ -0,0 +1 @@
+../v1/steps_be
\ No newline at end of file
diff --git a/egs/voxceleb/v2/steps_pyfe b/egs/voxceleb/v2/steps_pyfe
new file mode 120000
index 00000000..7b9d122a
--- /dev/null
+++ b/egs/voxceleb/v2/steps_pyfe
@@ -0,0 +1 @@
+hyp_utils/feats
\ No newline at end of file
diff --git a/egs/voxceleb/v2/steps_xvec b/egs/voxceleb/v2/steps_xvec
new file mode 120000
index 00000000..af66a94d
--- /dev/null
+++ b/egs/voxceleb/v2/steps_xvec
@@ -0,0 +1 @@
+hyp_utils/xvectors
\ No newline at end of file
diff --git a/egs/voxceleb/v2/utils b/egs/voxceleb/v2/utils
new file mode 120000
index 00000000..3d590a1d
--- /dev/null
+++ b/egs/voxceleb/v2/utils
@@ -0,0 +1 @@
+hyp_utils/kaldi/utils
\ No newline at end of file
diff --git a/egs/voxceleb/vae.v1/README.md b/egs/voxceleb/vae.v1/README.md
deleted file mode 100644
index 1514fae4..00000000
--- a/egs/voxceleb/vae.v1/README.md
+++ /dev/null
@@ -1,89 +0,0 @@
-# VoxCeleb Version 3
-
-Last update 2020/07/09
-
-This recipe is work in progress
-
-Recipe to evaluate generative models on VoxCeleb
-We train models on VoxCeleb2 and evaluate on full VoxCeleb1.
-The goal is to evaluate the ability of generative models to
-reconstruct VoxCeleb1 data or to generate data from scratch.
-
-## Models included:
-
- The following models can be evaluated with this recipe:
- - Basic Autoencoders (AE)
- - Variational Autoencoders (VAE)
- - VQ-VAE
- - Denoising AE, VAE, VQ-VAE
-
-## Training Data
-
- - Autoencoders, VAE, VQ-VAE, GAN are trained on
-   - VoxCeleb2 dev+test
- - Denoising versions are trained on
-   - VoxCeleb2 dev+test + augmentation with
-     - MUSAN noise
-     - RIR reverberation
-
-## Test Data
-
- - Test data is the full VoxCeleb 1
-
-## Usage
-
- - Run the run_stepnumber_*.sh scripts in sequence
- - Depending on the model that you are testing you can skip some steps
-   - if not running denoising versions skip steps 3 and 4
-   - Run train/eval steps only corresponding to the model that you are using
-
-## Results
-
-We compute the average of the metrics across VoxCeleb1; values in parentheses are std.
-We report EER on the VoxCeleb1 Test Original Clean Task using reconstructed log-filter-banks and the LResNet34 x-vector trained in recipe v1.1.
-Baseline EER=1.94% when using original log-filter-banks.
-
-### Models trained without augmentation
-
-| Config | Model Type | Architecture | Latent-channels | Compression (bits x/bits z) | ELBO/dim (std) | MSE (std) | L1 (std) | codebook size | EER(%) |
-| ------ | ---------- | ------------ | :--------: | :-------: | :----: | :----: | :----: | :----: | :----: |
-| config_vae_dc1d_b4d256_z80_c8.opt.lr0.01.v1.sh | VAE | DC1d Enc-Dec<br>dc-blocks=4 / hid-channels=256 | 80 | 8 | -1.96 (0.62) | 1.57 (0.91) | 0.90 (0.24) | | 16.36 |
-| config_vae_dc1d_b9d256_z80_c8.opt.lr0.01.v1.sh | VAE | DC1d Enc-Dec<br>dc-blocks=9 / hid-channels=256 | 80 | 8 | -1.95 (0.62) | 1.56 (0.91) | 0.89 (0.24) |
-| config_vae_resnet1d_b4d256_z80_c8.opt.lr0.01.v1.sh | VAE | ResNet1d Enc-Dec<br>res-blocks=4 / hid-channels=256 | 80 | 8 | -1.97 (0.65) | 1.55 (0.93) | 0.89 (0.25) | | 15.05 |
-| config_vae_resnet1d_b8d256_z80_c8.opt.lr0.01.v1.sh | VAE | ResNet1d Enc-Dec<br>res-blocks=8 / hid-channels=256 | 80 | 8 | -1.98 (0.65) | 1.55 (0.93) | 0.88 (0.25) | | 13.45 |
-| config_vae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh | VAE | ResNet1d Enc-Dec<br>res-blocks=16 / hid-channels=256 | 80 | 8 | -1.98 (0.69) | 1.54 (0.94) | 0.88 (0.25) | | 13.45 |
-| config_vae_dc2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh | VAE | DC2d Enc-Dec<br>dc-blocks=4 / hid-channels=64 | 80 | 0.8 | -2.25 (1.00) | 1.49 (1.06) | 0.84 (0.29) | | 10.04 |
-| config_vae_dc2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh | VAE | DC2d Enc-Dec<br>dc-blocks=8 / hid-channels=64 | 80 | 0.8 | -2.23 (1.00) | 1.49 (1.06) | 0.84 (0.29) |
-| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512_c2275.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec<br>res-blocks=8 / hid-channels=256 | 256 | 2275 | -1.84 (0.21) | 2.20 (0.71) | 1.12 (0.16) | 512 | 28.42 |
-| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x2_c1138.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec<br>res-blocks=8 / hid-channels=256 | 256 | 1138 | -1.79 (0.32) | 1.86 (0.78) | 1.01 (0.19) | 512x2 | 22.08 |
-| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec<br>res-blocks=8 / hid-channels=256 | 256 | 569 | -1.40 (0.43) | 1.69 (0.83) | 0.95 (0.21) | 512x4 | 19.18 |
-| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569_predvar.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec<br>res-blocks=8 / hid-channels=256 | 256 | 569 | -1.78 (0.42) | 1.70 (0.83) | 0.95 (0.21) | 512x4 | 18.16 |
-| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec<br>res-blocks=8 / hid-channels=256 | 256 | 284 | -1.87 (0.59) | 1.56 (0.89) | 0.89 (0.23) | 512x8 | 15.48 |
-| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec<br>res-blocks=8 / hid-channels=256 | 256 | 142 | -2.04 (0.83) | 1.46 (0.96) | 0.84 (0.27) | 512x16 | 11.77 |
-| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec<br>res-blocks=8 / hid-channels=256 | 256 | 71 | -2.15 (1.4) | 1.43 (1.08) | 0.80 (0.32) | 512x32 | 8.13 |
-| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x64_c36.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec<br>res-blocks=8 / hid-channels=256 | 256 | 36 | -9.27 (8.31) | 1.49 (1.22) | 0.79 (0.36) | 512x64 | 6.41 |
-| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x128_c18.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec<br>res-blocks=8 / hid-channels=256 | 256 | 18 | -20.97 (20.62) | 1.46 (1.24) | 0.77 (0.38) | 512x128 | 5.67 |
-| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x256_c9.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec<br>res-blocks=8 / hid-channels=256 | 256 | 9 | -27.91 (26.00) | 1.49 (1.27) | 0.78 (0.39) | 512x256 | 5.41 |
-| config_vqvae_transformer_b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh | VQ-VAE | Xformer Enc<br>blocks=6 / d_model=512 / heads=8 / d_ff=2048 | 512 | 36 | -1.74 (0.31) | 0.48 (0.15) | 0.52 (0.08) | 512x8 | 10.49 |
-| config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh | VQ-VAE | Xformer Enc<br>blocks=6 / d_model=512 / heads=8 / att-context=25 / d_ff=2048 | 512 | 36 | -1.61 (0.15) | 0.42 (0.08) | 0.49 (0.05) | 512x8 | 4.26 |
-| config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh | VQ-VAE | Xformer Enc<br>blocks=6 / d_model=512 / heads=8 / att-context=25 / d_ff=2048<br>RAdam Opt. | 512 | 36 | -1.33 (0.15) | 0.28 (0.05) | 0.40 (0.03) | 512x8 | 4.06 |
-| config_vqvae_transformer_b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh | VQ-VAE | Xformer Enc<br>blocks=6 / d_model=512 / heads=8 / d_ff=2048<br>Rel. Pos. Enc.<br>RAdam Opt. | 512 | 36 | -1.29 (0.10) | 0.27 (0.05) | 0.39 (0.03) | 512x8 | 4.21 |
-| config_vqvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh | VQ-VAE | Xformer Enc<br>blocks=6 / d_model=512 / heads=8 / att-context=25 / d_ff=2048<br>Rel. Pos. Enc.<br>RAdam Opt. | 512 | 36 | -1.30 (0.09) | 0.27 (0.04) | 0.39 (0.03) | 512x8 | 4.02 |
-| config_vqvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh | VQ-VAE | Conformer Enc<br>blocks=6 / d_model=512 / heads=8 / att-context=25 / d_ff=2048<br>RAdam Opt. | 512 | 36 | -1.26 (0.10) | 0.28 (0.04) | 0.39 (0.03) | 512x8 | 4.06 |
-
-### Models trained with augmentation (Denoising versions)
-
-| Config | Model Type | Architecture | Latent-channels | Compression (bits x/bits z) | ELBO/dim (std) | MSE (std) | L1 (std) | codebook size | EER(%) |
-| ------ | ---------- | ------------ | :--------: | :-------: | :----: | :----: | :----: | :----: | :----: |
-| config_dvae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh | VAE | ResNet1d Enc-Dec<br>res-blocks=16 / hid-channels=256 | 80 | 8 | -1.77 (0.33) | 1.67 (0.87) | 0.94 (0.22) | | 16.70 |
-| config_dvae_resnet2d_b16c64_z80_c0.8.opt.lr0.01.v1.sh | VAE | ResNet2d Enc-Dec<br>res-blocks=16 / base-channels=64 | 80 | 0.8 | -1.77 (0.39) | 1.57 (0.92) | 0.89 (0.25) | | 12.40 |
-| config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec<br>res-blocks=8 / hid-channels=256 | 256 | 569 | -1.75 (0.29) | 1.78 (0.84) | 0.98 (0.21) | 512x4 | 18.37 |
-| config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec<br>res-blocks=8 / hid-channels=256 | 256 | 284 | -1.80 (0.42) | 1.69 (0.83) | 0.95 (0.21) | 512x8 | 15.19 |
-| config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec<br>res-blocks=8 / hid-channels=256 | 256 | 142 | -1.81 (0.42) | 1.55 (0.97) | 0.87 (0.26) | 512x16 | 11.37 |
-| config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec<br>res-blocks=8 / hid-channels=256 | 256 | 71 | -1.95 (0.49) | 1.47 (1.03) | 0.83 (0.30) | 512x32 | 8.75 |
-| config_vqdvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh | VQ-VAE | Xformer Enc<br>blocks=6 / d_model=512 / heads=8 / att-context=25 / d_ff=2048<br>RAdam Opt. | 512 | 36 | -1.85 (0.13) | 0.56 (0.31) | 0.57 (0.11) | 512x8 | 5.3 |
-| config_vqdvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.005.v6.sh | VQ-VAE | Xformer Enc<br>blocks=6 / d_model=512 / heads=8 / d_ff=2048<br>Rel. Pos. Enc.<br>RAdam Opt. | 512 | 36 | -1.77 (0.05) | 0.43 (0.10) | 0.51 (0.04) | 512x8 | 4.56 |
-| config_vqdvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.0025.v6.sh | VQ-VAE | Conformer Enc<br>blocks=6 / d_model=512 / heads=8 / d_ff=2048<br>Rel. Pos. Enc.<br>RAdam Opt. | 512 | 36 | -1.83 (0.05) | 0.59 (0.11) | 0.59 (0.04) | 512x8 | 6.56 |
-
-
diff --git a/egs/voxceleb/vae.v1/cmd.sh b/egs/voxceleb/vae.v1/cmd.sh
deleted file mode 100755
index fe9c55b0..00000000
--- a/egs/voxceleb/vae.v1/cmd.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-# you can change cmd.sh depending on what type of queue you are using.
-# If you have no queueing system and want to run on a local machine, you
-# can change all instances 'queue.pl' to run.pl (but be careful and run
-# commands one by one: most recipes will exhaust the memory on your
-# machine). queue.pl works with GridEngine (qsub). slurm.pl works
-# with slurm. Different queues are configured differently, with different
-# queue names and different ways of specifying things like memory;
-# to account for these differences you can create and edit the file
-# conf/queue.conf to match your queue's configuration. Search for
-# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
-# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
-
-if [ "$(hostname -d)" == "cm.gemini" ];then
-  #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G"
-  export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G"
-  export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G"
-  export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G"
-else
-  export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V"
-  export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V"
-  export cuda_eval_cmd="$train_cmd"
-fi
-
-
diff --git a/egs/voxceleb/vae.v1/conf b/egs/voxceleb/vae.v1/conf
deleted file mode 120000
index 7dfe9dce..00000000
--- a/egs/voxceleb/vae.v1/conf
+++ /dev/null
@@ -1 +0,0 @@
-../../sre19-cmn2/v1/conf
\ No newline at end of file
diff --git a/egs/voxceleb/vae.v1/datapath.sh b/egs/voxceleb/vae.v1/datapath.sh
deleted file mode 100644
index 632362a7..00000000
--- a/egs/voxceleb/vae.v1/datapath.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright
-# 2018 Johns Hopkins University (Author: Jesus Villalba)
-#
-# Paths to the databases used in the experiment
-
-#paths to databases
-
-if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then
-  voxceleb1_root=/export/corpora5/VoxCeleb1
-  voxceleb2_root=/export/corpora5/VoxCeleb2
-  musan_root=/export/corpora5/JHU/musan
-elif [ "$(hostname --domain)" == "cm.gemini" ];then
-  voxceleb1_root=/expscratch/dsnyder/VoxCeleb1
-  voxceleb2_root=/expscratch/dgromero/corpora-open/vox2
-  musan_root=/expscratch/dgromero/corpora-open/musan
-else
-  echo "Put your database paths here"
-  exit 1
-fi
-
-
diff --git a/egs/voxceleb/vae.v1/default_config.sh b/egs/voxceleb/vae.v1/default_config.sh
deleted file mode 120000
index 5755326d..00000000
--- a/egs/voxceleb/vae.v1/default_config.sh
+++ /dev/null
@@ -1 +0,0 @@
-global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh
\ No newline at end of file
diff --git a/egs/voxceleb/vae.v1/global_conf/config_dvae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_dvae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh
deleted file mode 100644
index 19b1cedf..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_dvae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-# Denoising VAE with symmetric ResNet1D encoder-decoder with
-# 16 residual blocks, 256 dim per block, latent_dim=80, compression factor=8
-
-nnet_data=voxceleb2cat_train_combined
-batch_size_1gpu=128
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=80
-model_type=dvae
-narch=resnet1d
-vae_opt="--in-feats 80"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 3 4 6 3 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 80 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 3 4 6 3 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b16d256_z${latent_dim}_c8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=90
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0090.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_dvae_resnet2d_b16c64_z80_c0.8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_dvae_resnet2d_b16c64_z80_c0.8.opt.lr0.01.v1.sh
deleted file mode 100644
index 68fbba13..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_dvae_resnet2d_b16c64_z80_c0.8.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-# Denoising VAE with symmetric ResNet2D encoder-decoder with
-# 16 residual blocks, 64 base channels, latent_dim=80, compression factor=0.8
-
-nnet_data=voxceleb2cat_train_combined
-batch_size_1gpu=16
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=80
-model_type=dvae
-narch=resnet2d
-enc_opt="--enc.in-conv-channels 64 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 2 2 2 2 --enc.resb-channels 64 128 256 512 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 80 --dec.in-conv-channels 512 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 2 2 2 2 --dec.resb-channels 512 256 128 64 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b16c64_z${latent_dim}_c0.8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=100
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0100.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_dc1d_b4d256_z80_c8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_dc1d_b4d256_z80_c8.opt.lr0.01.v1.sh
deleted file mode 100644
index 3dc324ae..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vae_dc1d_b4d256_z80_c8.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-# VAE with symmetric DC1d encoder-decoder with 4 layers, 256 dim per layer, latent_dim=80, compression factor=8
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=512
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=80
-model_type=vae
-narch=dc1d
-vae_opt="--in-feats 80"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.conv-repeats 1 1 1 1 --enc.conv-channels 256 --enc.conv-kernel-sizes 3 --enc.conv-strides 1 2 2 2"
-dec_opt="--dec.in-channels 80 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.conv-repeats 1 1 1 1 --dec.conv-channels 256 --dec.conv-kernel-sizes 3 --dec.conv-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b4d256_z${latent_dim}_c8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=540
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0540.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_dc1d_b9d256_z80_c8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_dc1d_b9d256_z80_c8.opt.lr0.01.v1.sh
deleted file mode 100644
index 11d79a6b..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vae_dc1d_b9d256_z80_c8.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-# VAE with symmetric DC1d encoder-decoder with 9 layers, 256 dim per layer, latent_dim=80, compression factor=8
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=512
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=80
-model_type=vae
-narch=dc1d
-vae_opt="--in-feats 80"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.conv-repeats 2 2 3 2 --enc.conv-channels 256 --enc.conv-kernel-sizes 3 --enc.conv-strides 1 2 2 2"
-dec_opt="--dec.in-channels 80 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.conv-repeats 2 2 3 2 --dec.conv-channels 256 --dec.conv-kernel-sizes 3 --dec.conv-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-
-nnet_name=${model_type}_${narch}_b9d256_z${latent_dim}_c8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=550
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0550.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_dc2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_dc2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh
deleted file mode 100644
index 6de722df..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vae_dc2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-# VAE with symmetric deep conv 2D encoder-decoder with
-# 4 residual blocks, 64 base channels, latent_channels=80, compression factor=0.8
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=64
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=80
-model_type=vae
-narch=dc2d
-vae_opt=""
-enc_opt="--enc.in-conv-channels 64 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.conv-repeats 1 1 1 1 --enc.conv-channels 64 128 256 512 --enc.conv-kernel-sizes 3 --enc.conv-strides 1 2 2 2"
-dec_opt="--dec.in-channels 80 --dec.in-conv-channels 512 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.conv-repeats 1 1 1 1 --dec.conv-channels 512 256 128 64 --dec.conv-kernel-sizes 3 --dec.conv-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b4c64_z${latent_dim}_c0.8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=500
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0440.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_dc2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_dc2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh
deleted file mode 100644
index 879ce269..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vae_dc2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-# VAE with symmetric deep conv 2D encoder-decoder with
-# 8 residual blocks, 64 base channels, latent_channels=80, compression factor=0.8
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=32
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=80
-model_type=vae
-narch=dc2d
-vae_opt=""
-enc_opt="--enc.in-conv-channels 64 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.conv-repeats 2 2 2 2 --enc.conv-channels 64 128 256 512 --enc.conv-kernel-sizes 3 --enc.conv-strides 1 2 2 2"
-dec_opt="--dec.in-channels 80 --dec.in-conv-channels 512 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.conv-repeats 2 2 2 2 --dec.conv-channels 512 256 128 64 --dec.conv-kernel-sizes 3 --dec.conv-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b8c64_z${latent_dim}_c0.8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=400
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0400.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
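A note on the metrics in the deleted results tables above: the ELBO/dim column is the standard VAE evidence lower bound, which we read as normalized per feature dimension (the per-dimension normalization is our assumption; the README did not state it explicitly):

```
\mathrm{ELBO}/\mathrm{dim}
  = \frac{1}{D}\Big(
      \mathbb{E}_{q_\phi(z \mid x)}\big[\log p_\theta(x \mid z)\big]
      - \mathrm{KL}\big(q_\phi(z \mid x) \,\|\, p(z)\big)
    \Big)
```

Here D is the total number of log-filter-bank values in the utterance (80 bands times the number of frames), and higher (less negative) is better.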
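The Compression (bits x/bits z) column in those tables can be reproduced from the codebook settings encoded in the config names. A minimal sketch of that bookkeeping, assuming 32-bit input features, the time stride of 8 used by the resnet1d/dc1d encoders, and log2(vq_clusters) bits per codebook group (these constants are inferred from the tables, not documented):

```python
import math

def vq_compression_factor(feat_dim=80, feat_bits=32, time_stride=8,
                          vq_clusters=512, vq_groups=4):
    # input bits covered by one latent step vs. bits needed to index the codebooks
    bits_x = feat_dim * feat_bits * time_stride
    bits_z = vq_groups * math.log2(vq_clusters)
    return bits_x / bits_z

print(round(vq_compression_factor(vq_groups=4)))                 # 569, the *_c569 configs
print(round(vq_compression_factor(time_stride=1, vq_groups=8)))  # 36, transformer/conformer configs
```

For the continuous 1D VAEs the same ratio with bits_z = latent_dim * feat_bits gives the reported factor of 8; the 2D encoders keep a downsampled frequency axis in the latent, which is how factors below 1 (c0.8) arise.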
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh
deleted file mode 100644
index aca516a1..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-# VAE with symmetric ResNet1D encoder-decoder with
-# 16 residual blocks, 256 dim per block, latent_dim=80, compression factor=8
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=128
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=80
-model_type=vae
-narch=resnet1d
-vae_opt="--in-feats 80"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 3 4 6 3 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 80 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 3 4 6 3 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b16d256_z${latent_dim}_c8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=410
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0410.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b4d256_z80_c8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b4d256_z80_c8.opt.lr0.01.v1.sh
deleted file mode 100644
index be0a00b6..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b4d256_z80_c8.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-# VAE with symmetric ResNet1D encoder-decoder with
-# 4 residual blocks, 256 dim per block, latent_dim=80, compression factor=8
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=128
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=80
-model_type=vae
-narch=resnet1d
-vae_opt="--in-feats 80"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 1 1 1 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 80 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 1 1 1 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b4d256_z${latent_dim}_c8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=370
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0370.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b8d256_z80_c8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b8d256_z80_c8.opt.lr0.01.v1.sh
deleted file mode 100644
index 167b3837..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b8d256_z80_c8.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-# VAE with symmetric ResNet1D encoder-decoder with
-# 8 residual blocks, 256 dim per block, latent_dim=80, compression factor=8
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=128
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=80
-model_type=vae
-narch=resnet1d
-vae_opt="--in-feats 80"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 80 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b8d256_z${latent_dim}_c8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=420
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0420.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_resnet2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh
deleted file mode 100644
index 0240d1d0..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-# VAE with symmetric ResNet2D encoder-decoder with
-# 4 residual blocks, 64 base channels, latent_channels=80, compression factor=0.8
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=32
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=80
-model_type=vae
-narch=resnet2d
-vae_opt=""
-enc_opt="--enc.in-conv-channels 64 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 1 1 1 --enc.resb-channels 64 128 256 512 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 80 --dec.in-conv-channels 512 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 1 1 1 --dec.resb-channels 512 256 128 64 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b4c64_z${latent_dim}_c0.8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=600
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0205.pth
-
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_resnet2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh
deleted file mode 100644
index ff503162..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-# VAE with symmetric ResNet2D encoder-decoder with
-# 8 residual blocks, 64 base channels, latent_channels=80, compression factor=0.8
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=16
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=80
-model_type=vae
-narch=resnet2d
-vae_opt=""
-enc_opt="--enc.in-conv-channels 64 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 2 2 2 2 --enc.resb-channels 64 128 256 512 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 80 --dec.in-conv-channels 512 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 2 2 2 2 --dec.resb-channels 512 256 128 64 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b8c64_z${latent_dim}_c0.8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=205
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0205.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
-
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.0025.v6.sh b/egs/voxceleb/vae.v1/global_conf/config_vqdvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.0025.v6.sh
deleted file mode 100644
index 98af99a2..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.0025.v6.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-# VQ-VAE with Conformer Encoder for Enc and Dec with
-# 6 conformer blocks, relative pos encoder, d_model=512, heads=8, d_ff=2048,
-# latent_dim=512, codebook=512x8, compression factor=36, att-context=25
-
-nnet_data=voxceleb2cat_train_combined
-batch_size_1gpu=16
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.0025
-
-model_type=vq-dvae
-
-dropout=0
-narch=conformer-enc-v1
-blocks=6
-d_model=512
-heads=8
-d_ff=2048
-att_context=25
-conv_kernel=31
-
-latent_dim=512
-vq_type=multi-ema-k-means-vq
-vq_clusters=512
-num_groups=8
-
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type local-scaled-dot-prod-v1 --enc.att-context $att_context --enc.conv-kernel-sizes $conv_kernel" -dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type local-scaled-dot-prod-v1 --dec.att-context $att_context --dec.conv-kernel-sizes $conv_kernel" - -opt_opt="--optim.opt-type radam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 10000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 10000 --lrsched.update-lr-on-opt-step" - -nnet_name=${model_type}_${narch}_lac${att_context}b${blocks}d${d_model}h${heads}cbk${conv_kernel}linff${d_ff}_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv6_radam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=40 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0040.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh deleted file mode 100644 index 841207ea..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh +++ /dev/null @@ -1,33 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=142 - -nnet_data=voxceleb2cat_train_combined -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=16 -narch=resnet1d -model_type=vq-dvae -vq_type=multi-ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=100 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0100.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name 
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh deleted file mode 100644 index 795a8d4f..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh +++ /dev/null @@ -1,33 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=71 - -nnet_data=voxceleb2cat_train_combined -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=32 -narch=resnet1d -model_type=vq-dvae -vq_type=multi-ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=100 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0100.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh deleted file mode 100644 index da17dc19..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh +++ /dev/null @@ -1,34 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=569 -# Trained for denosing - -nnet_data=voxceleb2cat_train_combined -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=4 -narch=resnet1d -model_type=vq-dvae -vq_type=multi-ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 
--dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=90 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0090.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh deleted file mode 100644 index a2d8005e..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh +++ /dev/null @@ -1,33 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=284 - -nnet_data=voxceleb2cat_train_combined -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=8 -narch=resnet1d -model_type=vq-dvae -vq_type=multi-ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=100 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0100.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256swish_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256swish_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh deleted file mode 100644 index 435460c2..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256swish_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh +++ /dev/null @@ -1,33 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual 
blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=142 - -nnet_data=voxceleb2cat_train_combined -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=16 -narch=resnet1d -model_type=vq-dvae -vq_type=multi-ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2 --enc.hid-act swish" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2 --dec.hid-act swish" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256swish_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=100 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0100.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh b/egs/voxceleb/vae.v1/global_conf/config_vqdvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh deleted file mode 100644 index f99031d1..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh +++ /dev/null @@ -1,43 +0,0 @@ -# VQ-VAE with Transformer Encoder for Enc and Dec with -# 6 transformer blocks, d_model=512, heads=8, d_ff=2048, latent_dim=512, codebook=512x8, compression factor=36 - -nnet_data=voxceleb2cat_train_combined -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 - -model_type=vq-dvae - -dropout=0 -narch=transformer-enc-v1 -blocks=6 -d_model=512 -heads=8 -d_ff=2048 -att_context=25 - -latent_dim=512 -vq_type=multi-ema-k-means-vq -vq_clusters=512 -num_groups=8 - -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type local-scaled-dot-prod-v1 --enc.att-context $att_context" -dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type local-scaled-dot-prod-v1 --dec.att-context $att_context" - -opt_opt="--optim.opt-type radam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 2000 --lrsched.min-lr 
1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${model_type}_${narch}_lac${att_context}b${blocks}d${d_model}h${heads}linff${d_ff}_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv4_radam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=40 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0040.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.005.v6.sh b/egs/voxceleb/vae.v1/global_conf/config_vqdvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.005.v6.sh deleted file mode 100644 index 03fe5a33..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.005.v6.sh +++ /dev/null @@ -1,43 +0,0 @@ -# VQ-VAE with Transformer Encoder for Enc and Dec with -# 6 transformer blocks, relative pos encoder, d_model=512, heads=8, d_ff=2048, latent_dim=512, codebook=512x8, compression factor=36 - -nnet_data=voxceleb2cat_train_combined -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.005 - -model_type=vq-dvae - -dropout=0 -narch=transformer-enc-v1 -blocks=6 -d_model=512 -heads=8 -d_ff=2048 -att_context=25 - -latent_dim=512 -vq_type=multi-ema-k-means-vq -vq_clusters=512 -num_groups=8 - -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type local-scaled-dot-prod-v1 --enc.att-context $att_context --enc.rel-pos-enc" -dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type local-scaled-dot-prod-v1 --dec.att-context $att_context --dec.rel-pos-enc" - -opt_opt="--optim.opt-type radam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 10000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 10000 --lrsched.update-lr-on-opt-step" - -nnet_name=${model_type}_${narch}_lac${att_context}b${blocks}d${d_model}h${heads}linff${d_ff}rpe_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv6_radam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=40 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0040.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh deleted file mode 100644 index e4962443..00000000 --- 
a/egs/voxceleb/vae.v1/global_conf/config_vqvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh +++ /dev/null @@ -1,43 +0,0 @@ -# VQ-VAE with Transformer Encoder for Enc and Dec with -# 6 transformer blocks, d_model=512, heads=8, d_ff=2048, latent_dim=512, codebook=8x8, compression factor=36 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=16 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 - -model_type=vq-vae - -dropout=0 -narch=conformer-enc-v1 -blocks=6 -d_model=512 -heads=8 -d_ff=2048 -att_context=25 -conv_kernel=31 - -latent_dim=512 -vq_type=multi-ema-k-means-vq -vq_clusters=512 -num_groups=8 - -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type local-scaled-dot-prod-v1 --enc.att-context $att_context --enc.conv-kernel-sizes $conv_kernel" -dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type local-scaled-dot-prod-v1 --dec.att-context $att_context --dec.conv-kernel-sizes $conv_kernel" - -opt_opt="--optim.opt-type radam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 2000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_lac${att_context}b${blocks}d${d_model}h${heads}cbk${conv_kernel}linff${d_ff}_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv4_radam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=120 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0120.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512_c2275.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512_c2275.opt.lr0.01.v1.sh deleted file mode 100644 index 31487e05..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512_c2275.opt.lr0.01.v1.sh +++ /dev/null @@ -1,33 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=2275 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=1 -narch=resnet1d -model_type=vq-vae -vq_type=ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - 
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=370 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0370.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x128_c18.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x128_c18.opt.lr0.01.v1.sh deleted file mode 100644 index 56deb6c8..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x128_c18.opt.lr0.01.v1.sh +++ /dev/null @@ -1,33 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=18 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=128 -narch=resnet1d -model_type=vq-vae -vq_type=multi-ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=550 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0550.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh deleted file mode 100644 index f5b56dc2..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh +++ /dev/null @@ -1,33 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=142 - 
-nnet_data=voxceleb2cat_train -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=16 -narch=resnet1d -model_type=vq-vae -vq_type=multi-ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=440 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0440.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x256_c9.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x256_c9.opt.lr0.01.v1.sh deleted file mode 100644 index 7998a6c3..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x256_c9.opt.lr0.01.v1.sh +++ /dev/null @@ -1,33 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=9 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=256 -narch=resnet1d -model_type=vq-vae -vq_type=multi-ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=360 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0360.pth - -# xvector network trained with recipe 
v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x2_c1138.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x2_c1138.opt.lr0.01.v1.sh deleted file mode 100644 index 1252c9e4..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x2_c1138.opt.lr0.01.v1.sh +++ /dev/null @@ -1,33 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=1138 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=2 -narch=resnet1d -model_type=vq-vae -vq_type=multi-ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=510 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0510.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh deleted file mode 100644 index 59327eb4..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh +++ /dev/null @@ -1,33 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=71 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=32 -narch=resnet1d -model_type=vq-vae -vq_type=multi-ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 
--dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=440 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0440.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh deleted file mode 100644 index 2082dd74..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh +++ /dev/null @@ -1,33 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=569 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=4 -narch=resnet1d -model_type=vq-vae -vq_type=multi-ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=370 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0370.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569_predvar.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569_predvar.opt.lr0.01.v1.sh deleted file mode 100644 index 6ce2b144..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569_predvar.opt.lr0.01.v1.sh +++ /dev/null @@ 
-1,34 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=569 -# P(x|z) with sample dependent variances predicted by nnet - -nnet_data=voxceleb2cat_train -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=4 -narch=resnet1d -model_type=vq-vae -vq_type=multi-ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups --px-pdf normal-diag-cov" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_predvar_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=400 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0400.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x64_c36.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x64_c36.opt.lr0.01.v1.sh deleted file mode 100644 index 8ef652f3..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x64_c36.opt.lr0.01.v1.sh +++ /dev/null @@ -1,33 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=36 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=64 -narch=resnet1d -model_type=vq-vae -vq_type=multi-ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" 
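The exp_lr flags above read as warmup, hold, then exponential decay; the sketch below shows the resulting schedule under my reading of the options (hyperion's exact semantics, e.g. whether the hold counter starts at step 0 or at the end of warmup, may differ):

```bash
# Hedged sketch of the exp_lr schedule defined by lrs_opt above.
step=48000; lr=0.01
echo $step $lr | awk '{ warm=8000; hold=16000; decay=16000; min_lr=1e-5;
  if      ($1 < warm) l = $2 * $1 / warm;              # linear warmup
  else if ($1 < hold) l = $2;                          # hold at base lr
  else                l = $2 * 0.5^(($1-hold)/decay);  # halve every 16k steps
  if (l < min_lr) l = min_lr;
  print l }'
# -> 0.0025 at step 48000 (two halvings past the hold point)
```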
-nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=460 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0460.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh deleted file mode 100644 index 56498b78..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh +++ /dev/null @@ -1,33 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=284 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=8 -narch=resnet1d -model_type=vq-vae -vq_type=multi-ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=430 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0430.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh deleted file mode 100644 index 3c193e06..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh +++ /dev/null @@ -1,46 +0,0 @@ -# VQ-VAE with Transformer Encoder for Enc and Dec with -# 6 transformer blocks, d_model=512, heads=8, d_ff=2048, latent_dim=512, codebook=8x8, compression factor=36 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=16 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 - -model_type=vq-vae - -dropout=0 -narch=transformer-enc-v1 -blocks=6 -d_model=512 -heads=8 -d_ff=2048 - -latent_dim=512 
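The nnet_name assignments in these configs are plain shell templating; expanding one by hand shows where the experiment directory names under exp/vae_nnets come from (values taken from the z256cb512x64 config above):

```bash
# Sketch: expanding the nnet_name template with the x64 config's values.
model_type=vq-vae; narch=resnet1d; vq_type=multi-ema-k-means-vq
latent_dim=256; vq_clusters=512; num_groups=64; dropout=0; lr=0.01
eff_batch_size=512; nnet_data=voxceleb2cat_train
echo ${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
# -> vq-vae_resnet1d_b8d256_multi-ema-k-means-vq_z256c512x64_do0_optv1_adam_lr0.01_b512.voxceleb2cat_train
```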
-vq_type=multi-ema-k-means-vq -vq_clusters=512 -num_groups=8 - -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type scaled-dot-prod-v1" -dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type scaled-dot-prod-v1" - - -#opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -#lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 12000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 2000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b${blocks}d${d_model}h${heads}linff${d_ff}_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv4_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=160 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0160.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth - - diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh deleted file mode 100644 index ba68e597..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh +++ /dev/null @@ -1,42 +0,0 @@ -# VQ-VAE with Transformer Encoder for Enc and Dec with -# 6 transformer blocks, d_model=512, heads=8, d_ff=2048, latent_dim=512, codebook=8x8, compression factor=36 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=16 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 - -model_type=vq-vae - -dropout=0 -narch=transformer-enc-v1 -blocks=6 -d_model=512 -heads=8 -d_ff=2048 - -latent_dim=512 -vq_type=multi-ema-k-means-vq -vq_clusters=512 -num_groups=8 - -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type scaled-dot-prod-v1 --enc.rel-pos-enc" -dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type scaled-dot-prod-v1 --dec.rel-pos-enc" - - -opt_opt="--optim.opt-type radam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 2000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 
1000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b${blocks}d${d_model}h${heads}linff${d_ff}rpe_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv4_radam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=150 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0150.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh deleted file mode 100644 index f02db8e9..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh +++ /dev/null @@ -1,45 +0,0 @@ -# VQ-VAE with Transformer Encoder for Enc and Dec with -# 6 transformer blocks, d_model=512, heads=8, d_ff=2048, latent_dim=512, codebook=8x8, compression factor=36 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 - -model_type=vq-vae - -dropout=0 -narch=transformer-enc-v1 -blocks=6 -d_model=512 -heads=8 -d_ff=2048 -att_context=25 - -latent_dim=512 -vq_type=multi-ema-k-means-vq -vq_clusters=512 -num_groups=8 - -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type local-scaled-dot-prod-v1 --enc.att-context $att_context" -dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type local-scaled-dot-prod-v1 --dec.att-context $att_context" - - -#opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -#lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 12000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 2000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_lac${att_context}b${blocks}d${d_model}h${heads}linff${d_ff}_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv4_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=170 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0170.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh 
b/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh deleted file mode 100644 index 59a8843d..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh +++ /dev/null @@ -1,45 +0,0 @@ -# VQ-VAE with Transformer Encoder for Enc and Dec with -# 6 transformer blocks, d_model=512, heads=8, d_ff=2048, latent_dim=512, codebook=8x8, compression factor=36 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 - -model_type=vq-vae - -dropout=0 -narch=transformer-enc-v1 -blocks=6 -d_model=512 -heads=8 -d_ff=2048 -att_context=25 - -latent_dim=512 -vq_type=multi-ema-k-means-vq -vq_clusters=512 -num_groups=8 - -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type local-scaled-dot-prod-v1 --enc.att-context $att_context" -dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type local-scaled-dot-prod-v1 --dec.att-context $att_context" - - -#opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -#lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 12000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -opt_opt="--optim.opt-type radam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 2000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_lac${att_context}b${blocks}d${d_model}h${heads}linff${d_ff}_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv4_radam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=170 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0170.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh deleted file mode 100644 index a04f4b58..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh +++ /dev/null @@ -1,43 +0,0 @@ -# VQ-VAE with Transformer Encoder for Enc and Dec with -# 6 transformer blocks, d_model=512, heads=8, d_ff=2048, latent_dim=512, codebook=8x8, compression factor=36 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 - -model_type=vq-vae - -dropout=0 -narch=transformer-enc-v1 -blocks=6 -d_model=512 -heads=8 -d_ff=2048 -att_context=25 - 
-latent_dim=512 -vq_type=multi-ema-k-means-vq -vq_clusters=512 -num_groups=8 - -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type local-scaled-dot-prod-v1 --enc.att-context $att_context --enc.rel-pos-enc" -dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type local-scaled-dot-prod-v1 --dec.att-context $att_context --dec.rel-pos-enc" - - -opt_opt="--optim.opt-type radam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 2000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_lac${att_context}b${blocks}d${d_model}h${heads}linff${d_ff}rpe_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv4_radam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=160 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0160.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/local b/egs/voxceleb/vae.v1/local deleted file mode 120000 index ce1cbf90..00000000 --- a/egs/voxceleb/vae.v1/local +++ /dev/null @@ -1 +0,0 @@ -../v1/local \ No newline at end of file diff --git a/egs/voxceleb/vae.v1/run_001_prepare_data.sh b/egs/voxceleb/vae.v1/run_001_prepare_data.sh deleted file mode 100755 index 65ff18d0..00000000 --- a/egs/voxceleb/vae.v1/run_001_prepare_data.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. datapath.sh - - -if [ $stage -le 1 ];then - - # Prepare the VoxCeleb2 dataset for training. - local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train - #local/make_voxceleb2cat.pl $voxceleb2_root test 16 data/voxceleb2cat_test - #utils/combine_data.sh data/voxceleb2cat data/voxceleb2cat_train data/voxceleb2cat_test -fi - -if [ $stage -le 2 ];then - # prepare voxceleb1 for test - local/make_voxceleb1_oeh.pl $voxceleb1_root data -fi diff --git a/egs/voxceleb/vae.v1/run_002_compute_evad.sh b/egs/voxceleb/vae.v1/run_002_compute_evad.sh deleted file mode 100755 index eeae00ac..00000000 --- a/egs/voxceleb/vae.v1/run_002_compute_evad.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e -nodes=fs01 -storage_name=$(date +'%m_%d_%H_%M') -vaddir=`pwd`/exp/vad_e -vad_config=conf/vad_16k.yaml - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. $config_file - - -if [ $stage -le 1 ]; then - # Prepare to distribute data over multiple machines - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $vaddir/storage ]; then - dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage - if [ "$nodes" == "b0" ];then - utils/create_split_dir.pl \ - utils/create_split_dir.pl \ - /export/b{04,05,06,07}/$dir_name $vaddir/storage - elif [ "$nodes" == "b1" ];then - utils/create_split_dir.pl \ - /export/b{14,15,16,17}/$dir_name $vaddir/storage - elif [ "$nodes" == "c0" ];then - utils/create_split_dir.pl \ - /export/c{06,07,08,09}/$dir_name $vaddir/storage - elif [ "$nodes" == "fs01" ];then - utils/create_split_dir.pl \ - /export/fs01/$dir_name $vaddir/storage - else - echo "we don't distribute data between multiple machines" - fi - fi -fi - -#Train datasets -if [ $stage -le 2 ];then - for name in voxceleb2cat_train voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 40 ? $num_spk:40)) - hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ - --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ - data/${name} exp/make_vad/$name $vaddir - utils/fix_data_dir.sh data/${name} - done -fi - - diff --git a/egs/voxceleb/vae.v1/run_003_compute_fbank.sh b/egs/voxceleb/vae.v1/run_003_compute_fbank.sh deleted file mode 100755 index 713a34cb..00000000 --- a/egs/voxceleb/vae.v1/run_003_compute_fbank.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e -nodes=fs01 -storage_name=$(date +'%m_%d_%H_%M') -fbankdir=`pwd`/exp/fbank -vaddir=`pwd`/exp/fbank -vaddir_gt=`pwd`/exp/vad_gt - -stage=1 -config_file=default_config.sh -feat_vers="numpy" - -. parse_options.sh || exit 1; - -if [ "$feat_vers" == "kaldi" ];then - make_fbank=steps/make_fbank.sh - fbank_cfg=conf/fbank80_16k.conf -else - fbank_cfg=conf/fbank80_16k.yaml - if [ "$feat_vers" == "numpy" ];then - make_fbank=steps_pyfe/make_fbank.sh - else - make_fbank=steps_pyfe/make_torch_fbank.sh - fi -fi - -# Make filterbanks -if [ $stage -le 1 ]; then - # Prepare to distribute data over multiple machines - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $fbankdir/storage ]; then - dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/fbank/storage - if [ "$nodes" == "b0" ];then - utils/create_split_dir.pl \ - utils/create_split_dir.pl \ - /export/b{04,05,06,07}/$dir_name $fbankdir/storage - elif [ "$nodes" == "b1" ];then - utils/create_split_dir.pl \ - /export/b{14,15,16,17}/$dir_name $fbankdir/storage - elif [ "$nodes" == "c0" ];then - utils/create_split_dir.pl \ - /export/c{06,07,08,09}/$dir_name $fbankdir/storage - elif [ "$nodes" == "fs01" ];then - utils/create_split_dir.pl \ - /export/fs01/$dir_name $fbankdir/storage - else - echo "we don't distribute data between multiple machines" - fi - fi -fi - -#Train datasets -if [ $stage -le 2 ];then - for name in voxceleb2cat_train voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 40 ? $num_spk:40)) - $make_fbank --write-utt2num-frames true --fbank-config $fbank_cfg --nj $nj --cmd "$train_cmd" \ - data/${name} exp/make_fbank/$name $fbankdir - utils/fix_data_dir.sh data/${name} - done - -fi - - diff --git a/egs/voxceleb/vae.v1/run_004_prepare_augment.sh b/egs/voxceleb/vae.v1/run_004_prepare_augment.sh deleted file mode 100755 index 7d78ae92..00000000 --- a/egs/voxceleb/vae.v1/run_004_prepare_augment.sh +++ /dev/null @@ -1,123 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. 
./path.sh -set -e - -stage=1 -config_file=default_config.sh -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -# In this script, we augment the SWBD,SRE,MX6 and Voxceleb data with reverberation, -# noise, music, and babble, and combined it with the clean data. -# The combined list will be used to train the xvector DNN. - -frame_shift=0.01 - -if [ $stage -le 1 ]; then - - if [ ! -d "RIRS_NOISES" ]; then - if [ -d ../../sre19-cmn2/v1/RIRS_NOISES ];then - ln -s ../../sre19-cmn2/v1/RIRS_NOISES - else - # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises - wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip - unzip rirs_noises.zip - fi - fi - - # Prepare the MUSAN corpus, which consists of music, speech, and noise - # suitable for augmentation. - local/make_musan.sh $musan_root 16 data - - # Get the duration of the MUSAN recordings. This will be used by the - # script augment_data_dir.py. - for name in speech noise music; do - utils/data/get_utt2dur.sh data/musan_${name} - mv data/musan_${name}/utt2dur data/musan_${name}/reco2dur - done - -fi - - -if [ $stage -le 2 ]; then - - for name in voxceleb2cat_train - do - export TMPDIR=data/tmp - mkdir -p $TMPDIR - - awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/$name/utt2num_frames > data/$name/reco2dur - - # Make a reverberated version of the list. Note that we don't add any - # additive noise here. - - # Make a version with reverberated speech - rvb_opts=() - rvb_opts+=(--rir-set-parameters "0.2, RIRS_NOISES/real_rirs_isotropic_noises/rir_list") - rvb_opts+=(--rir-set-parameters "0.4, RIRS_NOISES/simulated_rirs/smallroom/rir_list") - rvb_opts+=(--rir-set-parameters "0.4, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") - - python steps/data/reverberate_data_dir.py \ - "${rvb_opts[@]}" \ - --speech-rvb-probability 1 \ - --pointsource-noise-addition-probability 0 \ - --isotropic-noise-addition-probability 0 \ - --num-replications 1 \ - --source-sampling-rate 16000 \ - data/${name} data/${name}_reverb - cp data/${name}/vad.scp data/${name}_reverb/ - utils/copy_data_dir.sh --utt-suffix "-reverb" data/${name}_reverb data/${name}_reverb.new - rm -rf data/${name}_reverb - mv data/${name}_reverb.new data/${name}_reverb - - - # Augment with musan_noise - python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0:13:8" --fg-noise-dir "data/musan_noise" data/${name} data/${name}_noise - # Augment with musan_music - python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/${name} data/${name}_music - # Augment with musan_speech - python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13:10" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/${name} data/${name}_babble - - - awk '{ $1=$1"-reverb"; print $0}' data/${name}/reco2dur > data/${name}_reverb/reco2dur - - # Augment with musan_noise - python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0:13:8" --fg-noise-dir "data/musan_noise" data/${name}_reverb data/${name}_reverb_noise - # Augment with musan_music - python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/${name}_reverb data/${name}_reverb_music - # Augment with musan_speech - python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs 
"20:17:15:13:10" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/${name}_reverb data/${name}_reverb_babble - - - # Combine noise only - utils/combine_data.sh data/${name}_noise_all \ - data/${name}_noise data/${name}_music data/${name}_babble - - # Combine reverbs - utils/combine_data.sh data/${name}_reverb_all data/${name}_reverb \ - data/${name}_reverb_noise data/${name}_reverb_music data/${name}_reverb_babble - - # Combine reverb, noise, music, and babble into one directory. - utils/combine_data.sh data/${name}_aug data/${name}_reverb_all data/${name}_noise_all - unset TMPDIR - done - -fi - - -if [ $stage -le 3 ];then - # Take a random subset of the augmentations - utils/subset_data_dir.sh data/voxceleb2cat_train_aug \ - $(wc -l data/voxceleb2cat_train/utt2spk | awk '{ print int('$num_augs'*$1)}') \ - data/voxceleb2cat_train_augx${num_augs} - utils/fix_data_dir.sh data/voxceleb2cat_train_augx${num_augs} -fi - - -exit diff --git a/egs/voxceleb/vae.v1/run_005_compute_fbank_augment.sh b/egs/voxceleb/vae.v1/run_005_compute_fbank_augment.sh deleted file mode 100755 index 10d13e03..00000000 --- a/egs/voxceleb/vae.v1/run_005_compute_fbank_augment.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e -fbankdir=`pwd`/exp/fbank - -stage=1 -config_file=default_config.sh -feat_vers="numpy" - -. parse_options.sh || exit 1; -. $config_file - -if [ "$feat_vers" == "kaldi" ];then - make_fbank=steps/make_fbank.sh - fbank_cfg=conf/fbank80_16k.conf -else - fbank_cfg=conf/fbank80_16k.yaml - if [ "$feat_vers" == "numpy" ];then - make_fbank=steps_pyfe/make_fbank.sh - else - make_fbank=steps_pyfe/make_torch_fbank.sh - fi -fi - -export TMPDIR=data/tmp -mkdir -p $TMPDIR - -if [ $stage -le 1 ];then - - # Make filterbanks for the augmented data. Note that we do not compute a new - # vad.scp file here. Instead, we use the vad.scp from the clean version of - # the list. - for name in voxceleb2cat_train_augx${num_augs} - do - $make_fbank --write-utt2num-frames true \ - --fbank-config $fbank_cfg --nj 120 --cmd "$train_cmd" \ - data/$name exp/make_fbank/$name $fbankdir - fix_data_dir.sh data/$name - done - -fi - - -if [ $stage -le 2 ];then - - # Combine the clean and augmented lists. - utils/combine_data.sh --extra-files "utt2num_frames" data/voxceleb2cat_train_combined data/voxceleb2cat_train_augx${num_augs} data/voxceleb2cat_train - -fi - -exit - diff --git a/egs/voxceleb/vae.v1/run_010_prepare_gen_model_train_data.sh b/egs/voxceleb/vae.v1/run_010_prepare_gen_model_train_data.sh deleted file mode 100755 index c2f5c832..00000000 --- a/egs/voxceleb/vae.v1/run_010_prepare_gen_model_train_data.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. $config_file - -# Now we prepare the features to generate examples for xvector training. -if [ $stage -le 2 ]; then - # This script applies CMVN and removes nonspeech frames. Note that this is somewhat - # wasteful, as it roughly doubles the amount of training data on disk. After - # creating training examples, this can be removed. 
- steps_xvec/prepare_feats_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \ - --storage_name voxceleb-vae.v1-$(date +'%m_%d_%H_%M') \ - data/${nnet_data} data/${nnet_data}_no_sil exp/${nnet_data}_no_sil - utils/fix_data_dir.sh data/${nnet_data}_no_sil - -fi - - -if [ $stage -le 3 ]; then - # Now, we need to remove features that are too short after removing silence - # frames. We want atleast 4s (400 frames) per utterance. - hyp_utils/remove_short_utts.sh --min-len 400 data/${nnet_data}_no_sil - - # We also want several utterances per speaker. Now we'll throw out speakers - # with fewer than 8 utterances. - hyp_utils/remove_spk_few_utts.sh --min-num-utts 8 data/${nnet_data}_no_sil - -fi - -if [ $stage -le 4 ]; then - # Prepare train and validation lists for x-vectors - local/make_train_lists_sup_embed_with_augm.sh data/${nnet_data}_no_sil data/${nnet_data}_no_sil/lists_xvec -fi - -exit diff --git a/egs/voxceleb/vae.v1/run_011_train_model.sh b/egs/voxceleb/vae.v1/run_011_train_model.sh deleted file mode 100755 index 8c9bb4d4..00000000 --- a/egs/voxceleb/vae.v1/run_011_train_model.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/bin/bash -# Copyright -# 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -ngpu=1 -config_file=default_config.sh -resume=false -interactive=false -num_workers=8 - -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ x=int($2/$1+0.5); if(x==0){ x=1 }; print x }') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${nnet_data}_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi - -if [ "$interactive" == "true" ];then - export cuda_cmd=run.pl -fi - - - -# Network Training -if [ $stage -le 1 ]; then - mkdir -p $nnet_dir/log - - if [ "$model_type" == "vae" ] || [ "$model_type" == "vq-vae" ];then - # Train VAE - train_exec=torch-train-${model_type}.py - $cuda_cmd --gpu $ngpu $nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --num-gpus $ngpu \ - $train_exec $narch:$narch \ - --data-rspec scp:$list_dir/feats.scp \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --num-frames-file $list_dir/utt2num_frames \ - --min-chunk-length $min_chunk --max-chunk-length $max_chunk \ - --iters-per-epoch $ipe \ - --batch-size $batch_size \ - --num-workers $num_workers $opt_opt $lrs_opt \ - --grad-acc-steps $grad_acc_steps \ - --epochs $nnet_num_epochs \ - --z-dim $latent_dim $enc_opt $dec_opt $vae_opt \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $nnet_dir $args - - # train_exec=torch-train-${narch}-${model_type}.py - - # $cuda_cmd --gpu $ngpu $nnet_dir/log/train.log \ - # hyp_utils/conda_env.sh --num-gpus $ngpu \ - # $train_exec \ - # --data-rspec scp:$list_dir/feats.scp \ - # --train-list $list_dir/lists_xvec/train.scp \ - # --val-list $list_dir/lists_xvec/val.scp \ - # --num-frames-file $list_dir/utt2num_frames \ - # --min-chunk-length $min_chunk --max-chunk-length $max_chunk \ - # --iters-per-epoch $ipe \ - # --batch-size $batch_size \ - # --num-workers $num_workers $opt_opt $lrs_opt \ - # --grad-acc-steps $grad_acc_steps \ - # --epochs $nnet_num_epochs \ - # --z-dim $latent_dim $enc_opt $dec_opt $vae_opt \ - # --num-gpus $ngpu \ - # --log-interval $log_interval \ - # --exp-path $nnet_dir $args - - elif [[ "$model_type" =~ "dvae" ]];then - # Train Denoising VAE - train_exec=torch-train-${model_type}.py - - $cuda_cmd --gpu 
$ngpu $nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --num-gpus $ngpu \ - $train_exec $narch:$narch\ - --data-rspec scp:$list_dir/feats.scp \ - --train-list $list_dir/lists_xvec/train.scp \ - --train-pair-list $list_dir/lists_xvec/augm2clean.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --val-pair-list $list_dir/lists_xvec/augm2clean.scp \ - --num-frames-file $list_dir/utt2num_frames \ - --min-chunk-length $min_chunk --max-chunk-length $max_chunk \ - --iters-per-epoch $ipe \ - --batch-size $batch_size \ - --num-workers $num_workers $opt_opt $lrs_opt \ - --grad-acc-steps $grad_acc_steps \ - --epochs $nnet_num_epochs \ - --z-dim $latent_dim $enc_opt $dec_opt $vae_opt \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $nnet_dir $args - - # train_exec=torch-train-${narch}-${model_type}.py - - # $cuda_cmd --gpu $ngpu $nnet_dir/log/train.log \ - # hyp_utils/conda_env.sh --num-gpus $ngpu \ - # $train_exec \ - # --data-rspec scp:$list_dir/feats.scp \ - # --train-list $list_dir/lists_xvec/train.scp \ - # --train-pair-list $list_dir/lists_xvec/augm2clean.scp \ - # --val-list $list_dir/lists_xvec/val.scp \ - # --val-pair-list $list_dir/lists_xvec/augm2clean.scp \ - # --num-frames-file $list_dir/utt2num_frames \ - # --min-chunk-length $min_chunk --max-chunk-length $max_chunk \ - # --iters-per-epoch $ipe \ - # --batch-size $batch_size \ - # --num-workers $num_workers $opt_opt $lrs_opt \ - # --grad-acc-steps $grad_acc_steps \ - # --epochs $nnet_num_epochs \ - # --z-dim $latent_dim $enc_opt $dec_opt $vae_opt \ - # --num-gpus $ngpu \ - # --log-interval $log_interval \ - # --exp-path $nnet_dir $args - - else - echo "unknown model type $model_type" - exit 1 - - fi - -fi - - -exit diff --git a/egs/voxceleb/vae.v1/run_012_eval_recons.sh b/egs/voxceleb/vae.v1/run_012_eval_recons.sh deleted file mode 100755 index 961ae68d..00000000 --- a/egs/voxceleb/vae.v1/run_012_eval_recons.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -# Copyright -# 2020 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -#xvec_chunk_length=12800 -. parse_options.sh || exit 1; -. $config_file - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -output_dir=exp/recons_output/$nnet_name -if [[ "$model_type" =~ "vae" ]];then - eval_script=hyp_utils/generative/eval_vae.sh -else - echo "unknown model type $model_type" - exit 1 -fi - -if [ $stage -le 1 ]; then - for name in voxceleb1_test - do - num_utt=$(wc -l data/$name/utt2spk | awk '{ print $1}') - nj=$(($num_utt < 100 ? $num_utt:100)) - $eval_script --cmd "$eval_cmd --mem 6G" --nj $nj ${eval_args} \ - $nnet data/$name $output_dir/$name - done -fi - - - - diff --git a/egs/voxceleb/vae.v1/run_013_eval_xvector_asv.sh b/egs/voxceleb/vae.v1/run_013_eval_xvector_asv.sh deleted file mode 100755 index 11932091..00000000 --- a/egs/voxceleb/vae.v1/run_013_eval_xvector_asv.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -xvec_chunk_length=12800 -. parse_options.sh || exit 1; -. 
$config_file - -if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu true --chunk-length $xvec_chunk_length" - xvec_cmd="$cuda_eval_cmd" -else - xvec_cmd="$train_cmd" -fi - -xvector_dir=exp/xvectors/$nnet_name/$xvec_nnet_name -score_be_dir=exp/scores/$nnet_name/$xvec_nnet_name/cosine - - -if [ $stage -le 1 ]; then - # Extracts x-vectors for evaluation - for name in voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 100 ? $num_spk:100)) - steps_xvec/extract_xvectors_with_vae_preproc.sh \ - --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ - $xvec_nnet $nnet data/$name \ - $xvector_dir/$name - done -fi - - -if [ $stage -le 2 ];then - - echo "Eval Voxceleb 1 with Cosine scoring" - steps_be/eval_be_cos.sh --cmd "$train_cmd" \ - data/voxceleb1_test/trials \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $score_be_dir/voxceleb1_scores - - $train_cmd --mem 10G --num-threads 6 $score_be_dir/log/score_voxceleb1.log \ - local/score_voxceleb1.sh data/voxceleb1_test $score_be_dir - - for f in $(ls $score_be_dir/*_results); - do - echo $f - cat $f - echo "" - done - -fi - -exit diff --git a/egs/voxceleb/vae.v1/steps_be b/egs/voxceleb/vae.v1/steps_be deleted file mode 120000 index 4958fae7..00000000 --- a/egs/voxceleb/vae.v1/steps_be +++ /dev/null @@ -1 +0,0 @@ -../v1.1/steps_be \ No newline at end of file diff --git a/egs/voxceleb/vae.v1/steps_fe b/egs/voxceleb/vae.v1/steps_fe deleted file mode 120000 index 73ccc1eb..00000000 --- a/egs/voxceleb/vae.v1/steps_fe +++ /dev/null @@ -1 +0,0 @@ -hyp_utils/kaldi/vad \ No newline at end of file diff --git a/hyp_utils/adv/eval_cosine_scoring_from_adv_test_wav.sh b/hyp_utils/adv/eval_cosine_scoring_from_adv_test_wav.sh index 0d7e5d4c..7a97bb56 100755 --- a/hyp_utils/adv/eval_cosine_scoring_from_adv_test_wav.sh +++ b/hyp_utils/adv/eval_cosine_scoring_from_adv_test_wav.sh @@ -85,7 +85,7 @@ echo "$0: score $key_file to $output_dir" $cmd JOB=1:$nj $log_dir/${name}.JOB.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $num_gpus \ - torch-eval-xvec-cosine-scoring-from-adv-test-wav.py \ + eval_xvec_cosine_scoring_from_adv_test_wav.py \ --feats $feat_config ${args} \ --v-file scp:$vector_file \ --key-file $key_file \ diff --git a/hyp_utils/adv/eval_cosine_scoring_from_adv_test_wav_wavegan.sh b/hyp_utils/adv/eval_cosine_scoring_from_adv_test_wav_wavegan.sh index f083ecb8..5ad16f77 100755 --- a/hyp_utils/adv/eval_cosine_scoring_from_adv_test_wav_wavegan.sh +++ b/hyp_utils/adv/eval_cosine_scoring_from_adv_test_wav_wavegan.sh @@ -92,7 +92,7 @@ fi echo "$0: score $key_file to $output_dir" $cmd JOB=1:$nj $log_dir/${name}.JOB.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $num_gpus \ - torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py \ + eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py \ --feats $feat_config ${args} \ --v-file scp:$vector_file \ --key-file $key_file \ diff --git a/hyp_utils/adv/eval_cosine_scoring_from_art_test_wav.sh b/hyp_utils/adv/eval_cosine_scoring_from_art_test_wav.sh index 3abd289b..bca8266e 100755 --- a/hyp_utils/adv/eval_cosine_scoring_from_art_test_wav.sh +++ b/hyp_utils/adv/eval_cosine_scoring_from_art_test_wav.sh @@ -88,7 +88,7 @@ echo "$0: score $key_file to $output_dir" $cmd JOB=1:$nj $log_dir/${name}.JOB.log \ hyp_utils/conda_env.sh --conda-env $HYP_ART_ENV --num-gpus $num_gpus \ - torch-eval-xvec-cosine-scoring-from-art-test-wav.py \ + eval_xvec_cosine_scoring_from_art_test_wav.py \ --feats $feat_config ${args} \ --v-file 
scp:$vector_file \ --key-file $key_file \ diff --git a/hyp_utils/adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh b/hyp_utils/adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh index 51c248fd..008b6ccc 100755 --- a/hyp_utils/adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh +++ b/hyp_utils/adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh @@ -94,7 +94,7 @@ echo "$0: score $key_file to $output_dir" $cmd JOB=1:$nj $log_dir/${name}.JOB.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $num_gpus \ - torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py \ + eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py \ --feats $feat_config --transfer_feats $transfer_feat_config ${args} \ --v-file scp:$vector_file \ --key-file $key_file \ diff --git a/hyp_utils/adv/eval_cosine_scoring_from_transfer_art_test_wav.sh b/hyp_utils/adv/eval_cosine_scoring_from_transfer_art_test_wav.sh index 7f497d02..b60cdee4 100755 --- a/hyp_utils/adv/eval_cosine_scoring_from_transfer_art_test_wav.sh +++ b/hyp_utils/adv/eval_cosine_scoring_from_transfer_art_test_wav.sh @@ -96,7 +96,7 @@ echo "$0: score $key_file to $output_dir" $cmd JOB=1:$nj $log_dir/${name}.JOB.log \ hyp_utils/conda_env.sh --conda-env $HYP_ART_ENV --num-gpus $num_gpus \ - torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py \ + eval_xvec_cosine_scoring_from_transfer_art_test_wav.py \ --feats $feat_config --transfer_feats $transfer_feat_config ${args} \ --v-file scp:$vector_file \ --key-file $key_file \ diff --git a/hyp_utils/adv/generate_adv_attacks_xvector_classif.sh b/hyp_utils/adv/generate_adv_attacks_xvector_classif.sh index 29d762af..f0401c3a 100755 --- a/hyp_utils/adv/generate_adv_attacks_xvector_classif.sh +++ b/hyp_utils/adv/generate_adv_attacks_xvector_classif.sh @@ -75,7 +75,7 @@ echo "$0: generate attacks for $data_dir to $output_dir" if [ $stage -le 1 ];then $cmd JOB=1:$nj $log_dir/generate_attack.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - torch-generate-adv-attacks-xvector-classif.py \ + generate_adv_attacks_xvector_classif.py \ --feats $feat_config ${args} $attacks_opts \ --wav-file $wav \ --list-file $list \ diff --git a/hyp_utils/adv/generate_adv_attacks_xvector_verif.sh b/hyp_utils/adv/generate_adv_attacks_xvector_verif.sh index 4cf99518..e20b03ff 100755 --- a/hyp_utils/adv/generate_adv_attacks_xvector_verif.sh +++ b/hyp_utils/adv/generate_adv_attacks_xvector_verif.sh @@ -73,7 +73,7 @@ echo "$0: generate attacks for $data_dir to $output_dir" if [ $stage -le 1 ];then $cmd JOB=1:$nj $log_dir/generate_attack.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - torch-generate-adv-attacks-xvector-verif.py \ + generate_adv_attacks_xvector_verif.py \ --feats $feat_config ${args} $attacks_opts \ --v-file scp:$vector_file \ --key-file $key_file \ diff --git a/hyp_utils/conda_env.sh b/hyp_utils/conda_env.sh index 1aea9eb9..90ffa369 100755 --- a/hyp_utils/conda_env.sh +++ b/hyp_utils/conda_env.sh @@ -14,6 +14,7 @@ if [ -n "$HYP_ENV" ];then else conda_env=base fi +max_split_size_mb="" while true do @@ -25,6 +26,10 @@ do shift; conda_env=$1 shift; + elif [ "$1" == "--max-split-size-mb" ];then + shift; + max_split_size_mb=$1 + shift; else break fi @@ -47,30 +52,39 @@ fi # echo "LRU_CACHE_CAPACITY=$LRU_CACHE_CAPACITY" conda activate $conda_env -command="python" +command="" if [ $num_gpus -gt 0 ];then + if [ -z "$CUDA_VISIBLE_DEVICES" ];then # set CUDA_VISIBLE_DEVICES - echo "SGE_HGR_gpu=$SGE_HGR_gpu" if [ ! 
-z "$SGE_HGR_gpu" ]; then - export CUDA_VISIBLE_DEVICES=$(echo $SGE_HGR_gpu | sed 's@ @,@g') + echo "SGE_HGR_gpu=$SGE_HGR_gpu" + export CUDA_VISIBLE_DEVICES=$(echo $SGE_HGR_gpu | sed 's@ @,@g') else - # seach location of free-gpu program in the PATH or hyp_utils directory - free_gpu=$(which free-gpu) - if [ -z "$free_gpu" ];then - free_gpu=$(which hyp_utils/free-gpu) - fi - - if [ ! -z "$free_gpu" ];then - # if free-gpu found set env var, otherwise we assume that you can use any gpu - export CUDA_VISIBLE_DEVICES=$($free_gpu -n $num_gpus) - fi - fi - echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" - if [ $num_gpus -gt 1 ];then - [[ $(type -P "$torchrun") ]] && command="torchrun" \ - || command="python -m torch.distributed.run" - command="$command --nproc_per_node=$num_gpus --standalone --nnodes=1" + # seach location of free-gpu program in the PATH or hyp_utils directory + free_gpu=$(which free-gpu) + if [ -z "$free_gpu" ];then + free_gpu=$(which hyp_utils/free-gpu) + fi + + if [ ! -z "$free_gpu" ];then + # if free-gpu found set env var, otherwise we assume that you can use any gpu + export CUDA_VISIBLE_DEVICES=$($free_gpu -n $num_gpus) + fi fi + fi + echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" + if [ -n "$max_split_size_mb" ];then + export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:${max_split_size_mb}" + echo "PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF} + fi + #export CUDA_LAUNCH_BLOCKING=1 + #export TORCH_DISTRIBUTED_DEBUG=DETAIL #variable to find unused parameters + if [ $num_gpus -gt 1 ];then + + [[ $(type -P "torchrun") ]] && command="torchrun" \ + || command="python -m torch.distributed.run" + command="$command --nproc_per_node=$num_gpus --standalone --nnodes=1" + fi fi py_exec=$(which $1) diff --git a/hyp_utils/create_audios_split_links.sh b/hyp_utils/create_audios_split_links.sh new file mode 100755 index 00000000..7125a2c4 --- /dev/null +++ b/hyp_utils/create_audios_split_links.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright +# 2023 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# Creates links to distrubute data into multiple nodes in clsp grid + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo "$0 exp/xvector_audios/voxceleb data/voxceleb/recordings.csv flac" +fi +echo "$0 $@" # Print the command line for logging +output_dir=$1 +rec_file=$2 +file_format=$3 + +if [[ $(hostname -f) != *.clsp.jhu.edu ]]; then + exit 0 +fi + +for f in $(awk -F "," '$1!="id" { print $1}' $rec_file); do + # the next command does nothing unless $output_dir/storage/ exists, see + # utils/create_data_link.pl for more info. + hyp_utils/create_data_link.pl $output_dir/$f.$file_format +done + + + diff --git a/hyp_utils/create_data_link.pl b/hyp_utils/create_data_link.pl new file mode 100755 index 00000000..850f29f0 --- /dev/null +++ b/hyp_utils/create_data_link.pl @@ -0,0 +1,132 @@ +#!/usr/bin/env perl + +# Copyright 2013 Guoguo Chen +# 2014 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0. +# +# This script distributes data onto different file systems by making symbolic +# links. It is supposed to use together with utils/create_split_dir.pl, which +# creates a "storage" directory that links to different file systems. +# +# If a sub-directory egs/storage does not exist, it does nothing. 
diff --git a/hyp_utils/create_audios_split_links.sh b/hyp_utils/create_audios_split_links.sh
new file mode 100755
index 00000000..7125a2c4
--- /dev/null
+++ b/hyp_utils/create_audios_split_links.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright
+#            2023  Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+# Creates links to distribute data into multiple nodes in clsp grid
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 <output-dir> <recordings-file> <file-format>"
+  echo "$0 exp/xvector_audios/voxceleb data/voxceleb/recordings.csv flac"
+fi
+echo "$0 $@"  # Print the command line for logging
+output_dir=$1
+rec_file=$2
+file_format=$3
+
+if [[ $(hostname -f) != *.clsp.jhu.edu ]]; then
+  exit 0
+fi
+
+for f in $(awk -F "," '$1!="id" { print $1}' $rec_file); do
+  # the next command does nothing unless $output_dir/storage/ exists, see
+  # utils/create_data_link.pl for more info.
+  hyp_utils/create_data_link.pl $output_dir/$f.$file_format
+done
+
+
diff --git a/hyp_utils/create_data_link.pl b/hyp_utils/create_data_link.pl
new file mode 100755
index 00000000..850f29f0
--- /dev/null
+++ b/hyp_utils/create_data_link.pl
@@ -0,0 +1,132 @@
+#!/usr/bin/env perl
+
+# Copyright 2013  Guoguo Chen
+#           2014  Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0.
+#
+# This script distributes data onto different file systems by making symbolic
+# links. It is supposed to use together with utils/create_split_dir.pl, which
+# creates a "storage" directory that links to different file systems.
+#
+# If a sub-directory egs/storage does not exist, it does nothing. If it exists,
+# then it selects pseudo-randomly a number from those available in egs/storage/*
+# creates a link such as
+#
+#   egs/egs.3.4.ark -> storage/4/egs.3.4.ark
+#
+use strict;
+use warnings;
+use File::Basename;
+use File::Spec;
+use Getopt::Long;
+
+sub GetGCD {
+  my ($a, $b) = @_;
+  while ($a != $b) {
+    if ($a > $b) {
+      $a = $a - $b;
+    } else {
+      $b = $b - $a;
+    }
+  }
+  return $a;
+}
+
+my $Usage = <<EOU;
+If a sub-directory egs/storage exists, this script selects pseudo-randomly a
+number from those available in egs/storage/* and creates a link such as
+
+  egs/egs.3.4.ark -> storage/4/egs.3.4.ark
+
+Usage: utils/create_data_link.pl <data-archive> [<data-archive> ... ]
+ e.g.: utils/create_data_link.pl foo/bar/egs.3.4.ark foo/bar/egs.3.5.ark
+ (note: the dirname, e.g. foo/bar/, must be the same in all cases).
+
+See also utils/remove_data_links.sh
+EOU
+
+GetOptions();
+
+if (@ARGV == 0) {
+  die $Usage;
+}
+
+my $example_fullpath = $ARGV[0];
+
+# Check if the storage has been created. If not, do nothing.
+my $dirname = dirname($example_fullpath);
+if (! -d "$dirname/storage") {
+  exit(0);
+}
+
+# Storage exists, create symbolic links in the next few steps.
+
+# First, get a list of the available storage directories, and check if they are
+# properly created.
+opendir(my $dh, "$dirname/storage/") || die "$0: Fail to open $dirname/storage/\n";
+my @storage_dirs = grep(/^[0-9]*$/, readdir($dh));
+closedir($dh);
+my $num_storage = scalar(@storage_dirs);
+for (my $x = 1; $x <= $num_storage; $x++) {
+  (-d "$dirname/storage/$x") || die "$0: $dirname/storage/$x does not exist\n";
+}
+
+# Second, get the coprime list.
+my @coprimes;
+for (my $n = 1; $n <= $num_storage; $n++) {
+  if (GetGCD($n, $num_storage) == 1) {
+    push(@coprimes, $n);
+  }
+}
+
+my $ret = 0;
+
+foreach my $fullpath (@ARGV) {
+  if ($dirname ne dirname($fullpath)) {
+    die "Mismatch in directory names of arguments: $example_fullpath versus $fullpath";
+  }
+
+  # Finally, work out the directory index where we should put the data to.
+  my $basename = basename($fullpath);
+  my $filename_numbers = $basename;
+  $filename_numbers =~ s/[^0-9]+/ /g;
+  my @filename_numbers = split(" ", $filename_numbers);
+  my $total = 0;
+  my $index = 0;
+  foreach my $x (@filename_numbers) {
+    if ($index >= scalar(@coprimes)) {
+      $index = 0;
+    }
+    $total += $x * $coprimes[$index];
+    $index++;
+  }
+  my $dir_index = $total % $num_storage + 1;
+
+  # Make the symbolic link.
+  if (-e $fullpath) {
+    unlink($fullpath);
+  }
+  if (symlink("storage/$dir_index/$basename", $fullpath) != 1) { # failure
+    $ret = 1; # will exit with error status.
+  }
+}
+
+exit($ret);
+
+## testing:
+# rm -rf foo bar
+# mkdir -p bar/{1,2,3,4}
+# mkdir -p foo/storage
+# for x in 1 2 3 4; do ln -s ../../bar/$x foo/storage/$x; done
+# utils/create_data_link.pl foo/1.3.ark foo/2.3.ark
+# ls -l foo
+# total 0
+# lrwxrwxrwx 1 dpovey fax 17 Sep  2 17:41 1.3.ark -> storage/3/1.3.ark
+# lrwxrwxrwx 1 dpovey fax 17 Sep  2 17:41 2.3.ark -> storage/4/2.3.ark
+# drwxr-xr-x 2 dpovey fax 38 Sep  2 17:40 storage
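To make the striping rule concrete, here is a small worked example of the coprime hash that `create_data_link.pl` computes, assuming four storage directories; it reproduces the `egs.3.4.ark -> storage/4/egs.3.4.ark` link from the header comment:

```
# Integers coprime with num_storage=4 in 1..4 are (1, 3).
# For egs.3.4.ark the digit groups are (3, 4).
num_storage=4
coprimes=(1 3)
digits=(3 4)
total=$(( digits[0]*coprimes[0] + digits[1]*coprimes[1] ))  # 3*1 + 4*3 = 15
echo "dir_index=$(( total % num_storage + 1 ))"             # 15 % 4 + 1 = 4 -> storage/4
```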
diff --git a/hyp_utils/create_data_split_dirs.sh b/hyp_utils/create_data_split_dirs.sh
new file mode 100755
index 00000000..b8aad6c8
--- /dev/null
+++ b/hyp_utils/create_data_split_dirs.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Copyright
+#            2023  Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+# Creates links to distribute data into multiple nodes in clsp grid
+
+storage_name=$(date +'%m_%d_%H_%M')
+
+
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 <output-dir> <storage-dir> <nodes>"
+  echo "$0 exp/vad_dir $USER/hyp-data/voxceleb/v1/vad/storage b0"
+fi
+
+output_dir=$1
+storage_dir=$2
+nodes=$3
+
+link_dir=$output_dir/storage
+
+if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $link_dir ]; then
+  echo "$0 $@"  # Print the command line for logging
+  echo "Prepare to distribute data over multiple $nodes nodes"
+  dir_name=$storage_dir/$storage_name/storage
+  if [ "$nodes" == "b0" ];then
+    hyp_utils/create_split_dir.pl \
+      /export/b{04,05,06,07}/$dir_name $link_dir
+  elif [ "$nodes" == "b1" ];then
+    hyp_utils/create_split_dir.pl \
+      /export/b{14,15,16,17}/$dir_name $link_dir
+  elif [ "$nodes" == "c0" ];then
+    hyp_utils/create_split_dir.pl \
+      /export/c{06,07,08,09}/$dir_name $link_dir
+  elif [ "$nodes" == "fs01" ];then
+    hyp_utils/create_split_dir.pl \
+      /export/fs01/$dir_name $link_dir
+  else
+    echo "we don't distribute data between multiple machines"
+  fi
+fi
+
+
+
diff --git a/hyp_utils/create_data_split_links.sh b/hyp_utils/create_data_split_links.sh
new file mode 100755
index 00000000..c7cfa3eb
--- /dev/null
+++ b/hyp_utils/create_data_split_links.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Copyright
+#            2023  Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+# Creates links to distribute data into multiple nodes in clsp grid
+
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <output-file-pattern> <num-jobs>"
+  echo "$0 exp/vad_dir/vad.JOB.ark 40"
+fi
+echo "$0 $@"  # Print the command line for logging
+output_file_pattern=$1
+nj=$2
+
+for n in $(seq $nj); do
+  # the next command does nothing unless output_dir/storage exists, see
+  # utils/create_data_link.pl for more info.
+  output_file=$(echo $output_file_pattern | sed 's@\.JOB\.[^\.]*$@.'$n'.@')
+  hyp_utils/create_data_link.pl $output_file
+done
+
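The two helpers above are meant to run in sequence on the CLSP grid; a hedged sketch using the paths from their own usage messages (the `/export/b{04..07}` nodes and `$USER/hyp-data` layout are CLSP-specific):

```
# Create exp/vad_dir/storage/{1..4} pointing at the b0 storage nodes.
hyp_utils/create_data_split_dirs.sh \
    exp/vad_dir $USER/hyp-data/voxceleb/v1/vad/storage b0
# Pre-create striped symlinks for the 40 per-job outputs before the jobs write them.
hyp_utils/create_data_split_links.sh exp/vad_dir/vad.JOB.ark 40
```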
diff --git a/hyp_utils/create_split_dir.pl b/hyp_utils/create_split_dir.pl
new file mode 100755
index 00000000..ab952357
--- /dev/null
+++ b/hyp_utils/create_split_dir.pl
@@ -0,0 +1,92 @@
+#!/usr/bin/env perl
+
+# Copyright 2013  Guoguo Chen
+# Apache 2.0.
+#
+# This script creates storage directories on different file systems, and creates
+# symbolic links to those directories. For example, a command
+#
+#   utils/create_split_dir.pl /export/gpu-0{3,4,5}/egs/storage egs/storage
+#
+# will mkdir -p all of those directories, and will create links
+#
+#   egs/storage/1 -> /export/gpu-03/egs/storage
+#   egs/storage/2 -> /export/gpu-04/egs/storage
+#   ...
+#
+use strict;
+use warnings;
+use File::Spec;
+use Getopt::Long;
+
+my $Usage = <<EOU;
+Usage: utils/create_split_dir.pl <actual-storage-dirs> <pseudo-storage-dir>
+ e.g.: utils/create_split_dir.pl /export/gpu-0{3,4,5}/egs/storage egs/storage
+
+Allowed options:
+  --suffix <suffix>  : Common suffix to <actual-storage-dirs> (string, default = "")
+
+See also create_data_link.pl, which is intended to work with the resulting
+directory structure, and remove_data_links.sh
+EOU
+
+my $suffix="";
+GetOptions('suffix=s' => \$suffix);
+
+if (@ARGV < 2) {
+  die $Usage;
+}
+
+my $ans = 1;
+
+my $dir = pop(@ARGV);
+system("mkdir -p $dir 2>/dev/null");
+
+my @all_actual_storage = ();
+foreach my $file (@ARGV) {
+  push @all_actual_storage, File::Spec->rel2abs($file . "/" . $suffix);
+}
+
+my $index = 1;
+foreach my $actual_storage (@all_actual_storage) {
+  my $pseudo_storage = "$dir/$index";
+
+  # If the symbolic link already exists, delete it.
+  if (-l $pseudo_storage) {
+    print STDERR "$0: link $pseudo_storage already exists, not overwriting.\n";
+    $index++;
+    next;
+  }
+
+  # Create the destination directory and make the link.
+  system("mkdir -p $actual_storage 2>/dev/null");
+  if ($? != 0) {
+    print STDERR "$0: error creating directory $actual_storage\n";
+    exit(1);
+  }
+  { # create a README file for easier deletion.
+    open(R, ">$actual_storage/README.txt");
+    my $storage_dir = File::Spec->rel2abs($dir);
+    print R "# This directory is linked from $storage_dir, as part of Kaldi striped data\n";
+    print R "# The full list of directories where this data resides is:\n";
+    foreach my $d (@all_actual_storage) {
+      print R "$d\n";
+    }
+    close(R);
+  }
+  my $ret = symlink($actual_storage, $pseudo_storage);
+
+  # Process the returned values
+  $ans = $ans && $ret;
+  if (! $ret) {
+    print STDERR "Error linking $actual_storage to $pseudo_storage\n";
+  }
+
+  $index++;
+}
+
+exit($ans == 1 ? 0 : 1);
diff --git a/hyp_utils/feats/make_evad.sh b/hyp_utils/feats/make_evad.sh
index 8717fc3c..16ddbf74 100755
--- a/hyp_utils/feats/make_evad.sh
+++ b/hyp_utils/feats/make_evad.sh
@@ -86,8 +86,8 @@ fi
 
 $cmd JOB=1:$nj $logdir/make_vad_${name}.JOB.log \
     hyp_utils/conda_env.sh \
-    compute-energy-vad.py --cfg $vad_config $opt_args \
-    --input $scp --output ark,scp:$vaddir/vad_$name.JOB.ark,$vaddir/vad_$name.JOB.scp \
+    compute_energy_vad.py --cfg $vad_config $opt_args \
+    --recordings-file $scp --output-spec ark,scp:$vaddir/vad_$name.JOB.ark,$vaddir/vad_$name.JOB.scp \
    --part-idx JOB --num-parts $nj || exit 1

# concatenate the .scp files together.
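For reference, the renamed VAD entry point keeps the Kaldi-style wspecifier interface; a standalone sketch of the call that make_evad.sh issues per job (the config path is a placeholder):

```
# Compute energy VAD for one of 8 splits of a data dir; conf/vad.yaml is hypothetical.
compute_energy_vad.py --cfg conf/vad.yaml \
    --recordings-file data/train/wav.scp \
    --output-spec ark,scp:exp/vad/vad_train.1.ark,exp/vad/vad_train.1.scp \
    --part-idx 1 --num-parts 8
```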
diff --git a/hyp_utils/xvectors/audio_to_duration.sh b/hyp_utils/xvectors/audio_to_duration.sh
new file mode 100755
index 00000000..f4187919
--- /dev/null
+++ b/hyp_utils/xvectors/audio_to_duration.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+#
+#           2022  Johns Hopkins University (Jesus Villalba)
+# Apache 2.0.
+set -e
+nj=40
+cmd="run.pl"
+stage=0
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 1 ]; then
+  echo "Usage: $0 <data-dir>"
+  echo "e.g.: $0 data/train"
+  echo "Options: "
+  echo "  --nj <nj>                                        # number of parallel jobs"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  exit 1;
+fi
+
+data_in=$1
+output_dir=$data_in/durations
+
+name=`basename $data_in`
+
+for f in $data_in/wav.scp ; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+mkdir -p $output_dir/log
+
+$cmd JOB=1:$nj $output_dir/log/audio_to_duration.JOB.log \
+    hyp_utils/conda_env.sh \
+    audio_to_duration.py \
+    --audio-file $data_in/wav.scp \
+    --output-file $output_dir/utt2dur.JOB \
+    --part-idx JOB --num-parts $nj
+
+
+for n in $(seq $nj); do
+  cat $output_dir/utt2dur.$n || exit 1;
+done > ${data_in}/utt2dur || exit 1
+
+echo "$0: Succeeded processing audios for $name"
diff --git a/hyp_utils/xvectors/eval_cosine_scoring_from_test_wav.sh b/hyp_utils/xvectors/eval_cosine_scoring_from_test_wav.sh
index b17a3ea2..963fd91b 100755
--- a/hyp_utils/xvectors/eval_cosine_scoring_from_test_wav.sh
+++ b/hyp_utils/xvectors/eval_cosine_scoring_from_test_wav.sh
@@ -70,7 +70,7 @@ echo "$0: score $ndx_file to $output_dir"
 
 $cmd JOB=1:$nj $log_dir/${name}.JOB.log \
     hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $num_gpus \
-    torch-eval-xvec-cosine-scoring-from-test-wav.py \
+    eval_xvec_cosine_scoring_from_test_wav.py \
     --feats $feat_config ${args} \
     --v-file scp:$vector_file \
     --ndx-file $ndx_file \
diff --git a/hyp_utils/xvectors/eval_xvec_logits_from_wav.sh b/hyp_utils/xvectors/eval_xvec_logits_from_wav.sh
index bdd53862..4765e809 100755
--- a/hyp_utils/xvectors/eval_xvec_logits_from_wav.sh
+++ b/hyp_utils/xvectors/eval_xvec_logits_from_wav.sh
@@ -84,7 +84,7 @@ fi
 if [ $stage -le 0 ];then
     $cmd JOB=1:$nj $output_dir/log/eval_logits.JOB.log \
 	hyp_utils/conda_env.sh --num-gpus $num_gpus \
-	torch-eval-xvec-logits-from-wav.py \
+	eval_xvec_logits_from_wav.py \
 	--feats $feat_config ${args} $write_num_frames_opt \
 	--part-idx JOB --num-parts $nj \
 	--input $data_dir/wav.scp \
diff --git a/hyp_utils/xvectors/extract_wav2vec2xvectors.sh b/hyp_utils/xvectors/extract_wav2vec2xvectors.sh
new file mode 100755
index 00000000..d8ae2e55
--- /dev/null
+++ b/hyp_utils/xvectors/extract_wav2vec2xvectors.sh
@@ -0,0 +1,156 @@
+#!/bin/bash
+#           2019  Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+nj=30
+cmd="run.pl"
+
+hf_chunk_length=0    # The chunk size over which the embedding is extracted.
+xvec_chunk_length=0  # The chunk size over which the embedding is extracted.
+use_gpu=false
+write_utt2speech_dur=true  # If true writes utt2speech_dur.
+stage=0
+min_utt_length=5
+max_utt_length=120
+random_utt_length=false
+aug_config=""
+num_augs=0
+use_bin_vad=true
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 3 ] && [ $# != 4 ]; then
+  echo "Usage: $0 [options] <nnet-file> <data-dir> <xvector-dir> [<data-out-dir>]"
+  echo " e.g.: $0 --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]"
+  echo "main options (for others, see top of script file)"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --use-gpu                # If true, use GPU."
+  echo "  --nj                     # Number of jobs"
+  echo "  --stage                  # To control partial reruns"
+  echo "  --use-bin-vad            # If true, uses binary VAD from vad.scp"
+  echo "  --write-utt2speech-dur   # If true, write utt2speech_dur (in secs) file."
+ echo " --chunk-length # If provided, applies encoder with specified chunk-length and " + echo " # concatenates the chunks outputs before pooling" + echo " --aug-config # augmentation config file" + echo " --random-utt-length # If true, extracts a random chunk from the utterance between " + echo " # min_utt_length and max_utt_length" + echo " --min-utt-length # " + echo " --max-utt-length # " + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +data_out_dir=$4 + +for f in $data_dir/wav.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +args="" +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + args="--use-gpu" +fi + +if [ "$use_bin_vad" == "true" ];then + f=$data_dir/vad.scp + [ ! -f $f ] && echo "No such file $f" && exit 1; + args="${args} --vad scp:$f" +fi + +if [ -n "$aug_config" ];then + args="${args} --aug-cfg $aug_config --num-augs $num_augs --aug-info-path $output_dir/aug_info.JOB.csv" +fi + +if [ "$random_utt_length" == "true" ];then + args="${args} --random-utt-length --min-utt-length $min_utt_length --max-utt-length $max_utt_length" +fi + +if [ "$write_utt2speech_dur" == "true" ];then + write_speech_dur_opt="--write-speech-dur $output_dir/utt2speech_dur.JOB" +fi + +if [ $stage -le 0 ];then + set +e + $cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + extract_wav2vec2xvectors.py \ + ${args} $write_speech_dur_opt \ + --part-idx JOB --num-parts $nj \ + --recordings-file $data_dir/wav.scp \ + --model-path $nnet_file --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length \ + --output-spec ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp + set -e +fi + +if [ $stage -le 1 ];then + for((i=1;i<=$nj;i++)) + do + status=$(tail -n 1 $output_dir/log/extract_xvectors.$i.log | \ + awk '/status 0/ { print 0} + !/status 0/ { print 1}') + if [ $status -eq 1 ];then + echo "JOB $i failed, resubmitting" + if [ "$write_utt2speech_dur" == "true" ];then + write_speech_dur_opt="--write-speech-dur $output_dir/utt2speech_dur.$i" + fi + $cmd $output_dir/log/extract_xvectors.$i.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + extract_wav2vec2xvectors.py \ + ${args} $write_speech_dur_opt \ + --part-idx $i --num-parts $nj \ + --recordings-file $data_dir/wav.scp \ + --model-path $nnet_file --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length \ + --output-spec ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & + fi + done + wait +fi + +if [ $stage -le 2 ]; then + echo "$0: combining xvectors across jobs" + for j in $(seq $nj); do cat $output_dir/xvector.$j.scp; done > $output_dir/xvector.scp || exit 1; + if [ "$write_utt2speech_dur" == "true" ];then + for n in $(seq $nj); do + cat $output_dir/utt2speech_dur.$n || exit 1; + done > $output_dir/utt2speech_dur || exit 1 + fi + + if [ -f $output_dir/aug_info.1.csv ];then + cat $output_dir/aug_info.1.csv > $output_dir/aug_info.csv + for j in $(seq 2 $nj); + do + tail -n +2 $output_dir/aug_info.$j.csv + done >> $output_dir/aug_info.csv + fi +fi + +if [ $stage -le 3 ]; then + if [ -n "$data_out_dir" ];then + echo "$0: creating data dir $data_out_dir for augmented x-vectors" + mkdir -p $data_out_dir + awk -F "," '$1 != "key_aug" { print $1,$2}' $output_dir/aug_info.csv \ + > $data_out_dir/augm2clean + awk -v u2s=$data_dir/utt2spk 'BEGIN{ +while(getline < u2s) +{ + spk[$1]=$2 +} +} +{ print $1,spk[$2]}' $data_out_dir/augm2clean > 
$data_out_dir/utt2spk + utils/utt2spk_to_spk2utt.pl $data_out_dir/utt2spk > $data_out_dir/spk2utt + cp $output_dir/utt2speech_dur $data_out_dir + else + cp $output_dir/utt2speech_dur $data_dir + fi +fi diff --git a/hyp_utils/xvectors/extract_xvectors_from_wav.sh b/hyp_utils/xvectors/extract_xvectors_from_wav.sh index 2aa0d460..b763a25c 100755 --- a/hyp_utils/xvectors/extract_xvectors_from_wav.sh +++ b/hyp_utils/xvectors/extract_xvectors_from_wav.sh @@ -85,12 +85,12 @@ if [ $stage -le 0 ];then set +e $cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - torch-extract-xvectors-from-wav.py \ + extract_xvectors_from_wav.py \ --feats $feat_config ${args} $write_num_frames_opt \ - --part-idx JOB --num-parts $nj \ - --input $data_dir/wav.scp \ + --part-idx JOB --num-parts $nj \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --chunk-length $chunk_length \ - --output ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp + --output-spec ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp set -e fi @@ -107,12 +107,12 @@ if [ $stage -le 1 ];then fi $cmd $output_dir/log/extract_xvectors.$i.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - torch-extract-xvectors-from-wav.py \ + extract_xvectors_from_wav.py \ --feats $feat_config ${args} $write_num_frames_opt \ --part-idx $i --num-parts $nj \ - --input $data_dir/wav.scp \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --chunk-length $chunk_length \ - --output ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & + --output-spec ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & fi done wait @@ -137,21 +137,27 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then - if [ -n "$data_out_dir" ];then - echo "$0: creating data dir $data_out_dir for augmented x-vectors" - mkdir -p $data_out_dir - awk -F "," '$1 != "key_aug" { print $1,$2}' $output_dir/aug_info.csv \ - > $data_out_dir/augm2clean - awk -v u2s=$data_dir/utt2spk 'BEGIN{ + if [ -n "$data_out_dir" ];then + echo "$0: creating data dir $data_out_dir for augmented x-vectors" + mkdir -p $data_out_dir + awk -F "," '$1 != "key_aug" { print $1,$2}' $output_dir/aug_info.csv \ + > $data_out_dir/augm2clean + + for f in utt2spk utt2lang + do + if [ -f $data_dir/utt2spk ];then + awk -v u2s=$data_dir/$f 'BEGIN{ while(getline < u2s) { spk[$1]=$2 } } -{ print $1,spk[$2]}' $data_out_dir/augm2clean > $data_out_dir/utt2spk - utils/utt2spk_to_spk2utt.pl $data_out_dir/utt2spk > $data_out_dir/spk2utt - cp $output_dir/utt2num_frames $data_out_dir - else - cp $output_dir/utt2num_frames $data_dir - fi +{ print $1,spk[$2]}' $data_out_dir/augm2clean > $data_out_dir/$f + fi + done + utils/utt2spk_to_spk2utt.pl $data_out_dir/utt2spk > $data_out_dir/spk2utt + cp $output_dir/utt2num_frames $data_out_dir + else + cp $output_dir/utt2num_frames $data_dir + fi fi diff --git a/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh b/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh index 92256004..4530ad3b 100755 --- a/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh +++ b/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh @@ -8,9 +8,7 @@ nj=1 cmd="run.pl" stage=0 file_format=flac -nodes=b1 storage_name=$(date +'%m_%d_%H_%M') -#proc_opts="--remove-dc-offset" min_spks=3 max_spks=10 num_reuses=5 @@ -23,10 +21,8 @@ if [ $# != 3 ]; then echo "Usage: $0 " echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" echo "Options: " - #echo " --nj # number of parallel jobs" echo " 
--cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." echo " --file-format # Output file_format supported by soundfile (flac,ogg,wav,...)" - #echo " --proc-opts # Extra arguments for proc-audio-files.py" echo " --min-spks # max number of spks per utterance" echo " --max-spks # max number of spks per utterance" echo " --num-reuses # number of times a signal is reused to create babble" @@ -51,22 +47,12 @@ output_dir=$(utils/make_absolute.sh $dir) args="" $cmd $dir/log/make_babble_noise_${name}.log \ hyp_utils/conda_env.sh \ - make-babble-noise-audio-files.py ${args} \ - --output-audio-format $file_format $args $proc_opts \ + make_babble_noise_audio_files.py \ + --audio-format $file_format $args $proc_opts \ --min-spks $min_spks --max-spks $max_spks --num-reuses $num_reuses \ --write-time-durs $data_out/utt2dur \ - --input $data_in/wav.scp \ + --recordings-file $data_in/wav.scp \ --output-path $output_dir \ - --output-script $data_out/wav.scp - - - -# for n in $(seq $nj); do -# cat $output_dir/wav.${name}.$n.scp || exit 1; -# done > ${data_out}/wav.scp || exit 1 - -# for n in $(seq $nj); do -# cat $output_dir/utt2dur.${name}.$n || exit 1; -# done > ${data_out}/utt2dur || exit 1 + --output-recordings-file $data_out/wav.scp echo "$0: Succeeded making babble noise for $name" diff --git a/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh b/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh index 9c122f1e..437cd208 100755 --- a/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh +++ b/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh @@ -63,16 +63,7 @@ utils/create_data_link.pl $output_dir/rirs_${name}.${file_format} args="" $cmd $dir/log/pack_rirs_${name}.log \ hyp_utils/conda_env.sh \ - pack-wav-rirs.py ${args} --input $data_in/wav.scp \ + pack_wav_rirs.py ${args} --input $data_in/wav.scp \ --output ${file_format},scp:$output_dir/rirs_${name}.${file_format},$data_out/rirs.scp || exit 1; - -# for n in $(seq $nj); do -# cat $output_dir/wav.${name}.$n.scp || exit 1; -# done > ${data_out}/wav.scp || exit 1 - -# for n in $(seq $nj); do -# cat $output_dir/utt2dur.${name}.$n || exit 1; -# done > ${data_out}/utt2dur || exit 1 - echo "$0: Succeeded packing RIRs for $name" diff --git a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh index 35794d65..afd13d74 100755 --- a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh +++ b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh @@ -45,7 +45,7 @@ mkdir -p $data_out output_dir=$(utils/make_absolute.sh $dir) if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $output_dir/storage ]; then - dir_name=$USER/hyp-data/xvectors/$storage_name/xvector_audio/storage + dir_name=$USER/hyp-data/$storage_name/xvector_audio/storage if [ "$nodes" == "b0" ];then utils/create_split_dir.pl \ utils/create_split_dir.pl \ @@ -56,9 +56,13 @@ if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $output_dir/storage ]; then elif [ "$nodes" == "s01" ];then utils/create_split_dir.pl \ /export/s01/$dir_name $output_dir/storage - else + elif [ "$nodes" == "c0" ];then utils/create_split_dir.pl \ /export/c{01,06,07,08,09}/$dir_name $output_dir/storage + elif [ "$nodes" == "fs05" ];then + utils/create_split_dir.pl \ + utils/create_split_dir.pl \ + /export/fs05/$dir_name $output_dir/storage fi for f in $(awk '{ print $1}' $data_in/wav.scp); do @@ -88,12 +92,13 @@ fi $cmd JOB=1:$nj $dir/log/preproc_audios_${name}.JOB.log \ hyp_utils/conda_env.sh \ - preprocess-audio-files.py ${args} --output-audio-format $file_format $args $proc_opts \ + preprocess_audio_files.py ${args} --audio-format $file_format $args $proc_opts \ --write-time-durs $output_dir/utt2dur.${name}.JOB \ --part-idx JOB --num-parts $nj \ - --input $data_in/wav.scp \ + --recordings-file $data_in/wav.scp \ --output-path $output_dir \ - --output-script $output_dir/wav.${name}.JOB.scp + --output-recordings-file $output_dir/wav.${name}.JOB.scp + for n in $(seq $nj); do cat $output_dir/wav.${name}.$n.scp || exit 1; diff --git a/hyperion/__init__.py b/hyperion/__init__.py index 6e59062b..fc35423c 100644 --- a/hyperion/__init__.py +++ b/hyperion/__init__.py @@ -4,18 +4,6 @@ """ -from . import utils -from . import metrics -from . import pdfs -from . import transforms -from . import io -from . import feats -from . import calibration -from . import score_norm +from . import helpers, io, np, torch, utils -# from . import keras -from . import helpers - -# from . import generators - -__version__ = "0.3.1" +__version__ = "0.4.0a" diff --git a/hyperion/augment/__init__.py b/hyperion/augment/__init__.py deleted file mode 100644 index 210f54e7..00000000 --- a/hyperion/augment/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -""" - Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -from .speech_augment import SpeechAugment -from .speed_augment import SpeedAugment -from .noise_augment import NoiseAugment -from .reverb_augment import ReverbAugment diff --git a/hyperion/bin/__init__.py b/hyperion/bin/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hyperion/bin/adv_finetune_xvector_from_wav.py b/hyperion/bin/adv_finetune_xvector_from_wav.py new file mode 100755 index 00000000..ea3d3b80 --- /dev/null +++ b/hyperion/bin/adv_finetune_xvector_from_wav.py @@ -0,0 +1,483 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.adv_attacks import AttackFactory +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import ResNet1dXVector as R1dXVec +from hyperion.torch.models import ResNetXVector as RXVec +from hyperion.torch.models import SpineNetXVector as SpineXVec +from hyperion.torch.models import 
TDNNXVector as TDXVec +from hyperion.torch.models import TransformerXVectorV1 as TFXVec +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorAdvTrainerFromWav as Trainer +from hyperion.torch.utils import ddp + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + "efficientnet": EXVec, + "tdnn": TDXVec, + "transformer": TFXVec, + "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_feats(rank, **kwargs): + feat_args = AF.filter_args(**kwargs["feats"]) + if rank == 0: + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=True, **feat_args) + if rank == 0: + logging.info("feat-extractor={}".format(feat_extractor)) + return feat_extractor + + +def init_xvector(num_classes, in_model_file, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_finetune_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes + model = TML.load(in_model_file) + model.change_config(**xvec_args) + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model + + +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + try: + hard_prototype_mining = train_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + if rank == 0: + logging.info("setting hard prototypes") + + affinity_matrix = model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + try: + hard_prototype_mining = val_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + +def init_attack(feat_extractor, model, wav_scale, **kwargs): + victim_model = nn.Sequential(feat_extractor, model) + attack_args = AttackFactory.filter_args(**kwargs["attack"]) + extra_args = { + "eps_scale": wav_scale, + "loss": nn.functional.cross_entropy, + "time_dim": 1, + } + attack_args.update(extra_args) + logging.info("attacks args={}".format(attack_args)) + attack = AttackFactory.create(victim_model, **attack_args) + return attack + + +def train_xvec(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + 
device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + feat_extractor = init_feats(**kwargs) + model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) + init_hard_prototype_mining(model, train_loader, val_loader, rank) + kwargs["wav_scale"] = train_loader.dataset.wav_scale + attack = init_attack(feat_extractor, model, **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + feat_extractor, + attack, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + AF.add_class_args(parser, prefix="feats") + xvec_class.add_finetune_args(parser, prefix="model") + AttackFactory.add_class_args(parser, prefix="attack") + + parser.add_argument("--in-model-file", required=True) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +def main(): + parser = ArgumentParser( + description="""Fine-tune x-vector model from audio files + with adversarial training""" + ) + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() + + +# def init_data( +# audio_path, +# 
train_list, +# val_list, +# train_aug_cfg, +# val_aug_cfg, +# num_workers, +# num_gpus, +# rank, +# **kwargs +# ): + +# ad_args = AD.filter_args(**kwargs) +# sampler_args = Sampler.filter_args(**kwargs) +# if rank == 0: +# logging.info("audio dataset args={}".format(ad_args)) +# logging.info("sampler args={}".format(sampler_args)) +# logging.info("init datasets") + +# train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) +# val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) + +# if rank == 0: +# logging.info("init samplers") +# train_sampler = Sampler(train_data, **sampler_args) +# val_sampler = Sampler(val_data, **sampler_args) + +# num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) +# largs = ( +# {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} +# ) + +# train_loader = torch.utils.data.DataLoader( +# train_data, batch_sampler=train_sampler, **largs +# ) + +# test_loader = torch.utils.data.DataLoader( +# val_data, batch_sampler=val_sampler, **largs +# ) + +# return train_loader, test_loader + + +# def init_feats(rank, **kwargs): +# feat_args = AF.filter_args(**kwargs["feats"]) +# if rank == 0: +# logging.info("feat args={}".format(feat_args)) +# logging.info("initializing feature extractor") +# feat_extractor = AF(trans=True, **feat_args) +# if rank == 0: +# logging.info("feat-extractor={}".format(feat_extractor)) +# return feat_extractor + + +# def init_xvector(num_classes, in_model_path, rank, train_mode, **kwargs): +# xvec_args = XVec.filter_finetune_args(**kwargs) +# if rank == 0: +# logging.info("xvector network ft args={}".format(xvec_args)) +# xvec_args["num_classes"] = num_classes +# model = TML.load(in_model_path) +# model.rebuild_output_layer(**xvec_args) +# if train_mode == "ft-embed-affine": +# model.freeze_preembed_layers() +# if rank == 0: +# logging.info("x-vector-model={}".format(model)) +# return model + + +# def init_attack(feat_extractor, model, wav_scale, **kwargs): +# victim_model = nn.Sequential(feat_extractor, model) +# attack_args = AttackFactory.filter_args(**kwargs["attack"]) +# extra_args = { +# "eps_scale": wav_scale, +# "loss": nn.functional.cross_entropy, +# "time_dim": 1, +# } +# attack_args.update(extra_args) +# logging.info("attacks args={}".format(attack_args)) +# attack = AttackFactory.create(victim_model, **attack_args) +# return attack + + +# def train_xvec(gpu_id, args): + +# config_logger(args.verbose) +# del args.verbose +# logging.debug(args) + +# kwargs = namespace_to_dict(args) +# torch.manual_seed(args.seed) +# set_float_cpu("float32") + +# train_mode = kwargs["train_mode"] + +# ddp_args = ddp.filter_ddp_args(**kwargs) +# device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) +# kwargs["rank"] = rank + +# train_loader, test_loader = init_data(**kwargs) +# feat_extractor = init_feats(**kwargs) +# model = init_xvector(train_loader.dataset.num_classes, **kwargs) +# kwargs["wav_scale"] = train_loader.dataset.wav_scale +# attack = init_attack(feat_extractor, model, **kwargs) + +# trn_args = Trainer.filter_args(**kwargs) +# if rank == 0: +# logging.info("trainer args={}".format(trn_args)) +# metrics = {"acc": CategoricalAccuracy()} +# trainer = Trainer( +# model, +# feat_extractor, +# attack, +# device=device, +# metrics=metrics, +# ddp=world_size > 1, +# train_mode=train_mode, +# **trn_args +# ) +# if args.resume: +# trainer.load_last_checkpoint() +# trainer.fit(train_loader, test_loader) + +# ddp.ddp_cleanup() + + +# if __name__ == "__main__": + 
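The live `main()` above replaces this legacy single-architecture CLI (kept commented out below for reference) with jsonargparse subcommands, one per x-vector backbone. A hypothetical invocation; the config path, experiment path, and the exact nested-flag spellings are assumptions:

```
# Architecture is chosen via subcommand (resnet, resnet1d, efficientnet,
# tdnn, transformer, spinenet); flags mirror the parser built in make_parser().
adv_finetune_xvector_from_wav.py resnet \
    --cfg conf/adv_finetune.yaml \
    --in-model-file exp/xvector_nnet/model.pt \
    --trainer.exp-path exp/adv_finetune_xvec
```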
+# parser = ArgumentParser( +# description="Fine-tune x-vector model with adv attacks on wav domain" +# ) + +# parser.add_argument("--cfg", action=ActionConfigFile) +# parser.add_argument("--audio-path", required=True) +# parser.add_argument("--train-list", dest="train_list", required=True) +# parser.add_argument("--val-list", dest="val_list", required=True) + +# AD.add_argparse_args(parser) +# Sampler.add_argparse_args(parser) + +# parser.add_argument("--train-aug-cfg", default=None) +# parser.add_argument("--val-aug-cfg", default=None) + +# parser.add_argument( +# "--num-workers", type=int, default=5, help="num_workers of data loader" +# ) + +# AF.add_class_args(parser, prefix="feats") +# parser.add_argument("--in-model-path", required=True) + +# XVec.add_finetune_args(parser) +# AttackFactory.add_class_args(parser, prefix="attack") + +# Trainer.add_class_args(parser) +# ddp.add_ddp_args(parser) + +# # parser.add_argument('--num-gpus', type=int, default=1, +# # help='number of gpus, if 0 it uses cpu') +# parser.add_argument( +# "--seed", type=int, default=1123581321, help="random seed (default: 1)" +# ) +# parser.add_argument( +# "--resume", +# action="store_true", +# default=False, +# help="resume training from checkpoint", +# ) +# parser.add_argument( +# "--train-mode", +# default="ft-full", +# choices=["ft-full", "ft-embed-affine"], +# help=( +# "ft-full: adapt full x-vector network" +# "ft-embed-affine: adapt affine transform before embedding" +# ), +# ) + +# # parser.add_argument('--attack-eps', required=True, type=float, +# # help='epsilon adversarial attack') +# # parser.add_argument('--attack-eps-step', required=True, type=float, +# # help='eps step adversarial attack') +# # parser.add_argument('--attack-random-eps', default=False, +# # action='store_true', +# # help='use random eps in adv. 
attack') + +# # parser.add_argument('--attack-max-iter', default=10, type=int, +# # help='number of iterations for adversarial optimization') + +# # parser.add_argument('--p-attack', default=0.5, type=float, +# # help='ratio of batches with adv attack') + +# parser.add_argument( +# "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int +# ) +# parser.add_argument("--local_rank", default=0, type=int) + +# args = parser.parse_args() +# gpu_id = args.local_rank +# del args.local_rank + +# if gpu_id == 0: +# try: +# config_file = Path(args.exp_path) / "config.yaml" +# parser.save(args, str(config_file), format="yaml", overwrite=True) +# except: +# pass + +# # torch docs recommend using forkserver +# multiprocessing.set_start_method("forkserver") +# train_xvec(gpu_id, args) diff --git a/hyperion/bin/apply_mvn_select_frames.py b/hyperion/bin/apply_mvn_select_frames.py new file mode 100755 index 00000000..f8299edc --- /dev/null +++ b/hyperion/bin/apply_mvn_select_frames.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import RandomAccessDataReaderFactory as RDRF +from hyperion.io import SequentialDataReaderFactory as DRF +from hyperion.np.feats import FrameSelector as FSel +from hyperion.np.feats import MeanVarianceNorm as MVN +from hyperion.utils import Utt2Info +from hyperion.utils.kaldi_matrix import compression_methods + + +def process_feats( + input_spec, + output_spec, + vad_spec, + write_num_frames_spec, + path_prefix, + vad_path_prefix, + part_idx, + num_parts, + compress, + compression_method, + **kwargs +): + logging.info("initializing") + mvn_args = MVN.filter_args(**kwargs) + mvn = MVN(**mvn_args) + if vad_spec is not None: + fs_args = FSel.filter_args(**kwargs) + fs = FSel(**fs_args) + + if write_num_frames_spec is not None: + keys = [] + info = [] + + logging.info("opening output stream: %s" % (output_spec)) + with DWF.create( + output_spec, + compress=compress, + compression_method=compression_method, + ) as writer: + logging.info("opening input stream: %s" % (output_spec)) + with DRF.create( + input_spec, + path_prefix=path_prefix, + part_idx=part_idx, + num_parts=num_parts, + ) as reader: + if vad_spec is not None: + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = RDRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) + + while not reader.eof(): + key, data = reader.read(1) + if len(key) == 0: + break + logging.info("processing feats at %s" % (key[0])) + x = mvn.normalize(data[0]) + if vad_spec is not None: + vad = v_reader.read(key)[0].astype("bool") + tot_frames = x.shape[0] + x = fs.select(x, vad) + logging.info( + "for %s detected %d/%d (%.2f %%) speech frames" + % ( + key[0], + x.shape[0], + tot_frames, + x.shape[0] / tot_frames * 100, + ) + ) + if x.shape[0] > 0: + writer.write(key, [x]) + if write_num_frames_spec is not None: + keys += key + info.append(x.shape[0]) + + if write_num_frames_spec is not None: + logging.info("writing num-frames to %s" % (write_num_frames_spec)) + u2nf = Utt2Info.create(keys, info) + u2nf.save(write_num_frames_spec) + + +def main(): + parser = ArgumentParser(description="Apply CMVN and 
remove silence") + + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--write-num-frames", dest="write_num_frames_spec", default=None + ) + parser.add_argument( + "--path-prefix", dest="path_prefix", default=None, help=("scp file_path prefix") + ) + parser.add_argument( + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + parser.add_argument( + "--part-idx", + type=int, + default=1, + help=("splits the list of files in num-parts and process part_idx"), + ) + parser.add_argument( + "--num-parts", + type=int, + default=1, + help=("splits the list of files in num-parts and process part_idx"), + ) + + parser.add_argument( + "--compress", + default=False, + action="store_true", + help="Lossy compress the features", + ) + parser.add_argument( + "--compression-method", + default="auto", + choices=compression_methods, + help=( + "Kaldi compression method: " + "{auto (default), speech_feat, " + "2byte-auto, 2byte-signed-integer, " + "1byte-auto, 1byte-unsigned-integer, 1byte-0-1}." + ), + ) + MVN.add_argparse_args(parser) + FSel.add_argparse_args(parser) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + process_feats(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/audio_to_duration.py b/hyperion/bin/audio_to_duration.py new file mode 100755 index 00000000..8ef6b5c1 --- /dev/null +++ b/hyperion/bin/audio_to_duration.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +""" + Copyright 2022 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +import sys +import time + +import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.io import SequentialAudioReader as AR +from hyperion.utils import SegmentSet + + +def audio_to_duration(audio_file, output_file, **kwargs): + input_args = AR.filter_args(**kwargs) + logging.info(f"input_args={input_args}") + + keys = [] + durations = [] + with AR(audio_file, **input_args) as reader: + for data in reader: + key, x, fs = data + duration = x.shape[0] / fs + keys.append(key) + durations.append(duration) + logging.info("read audio %s duration=%.3f", key, duration) + + print(len(keys), len(durations)) + seg_set = SegmentSet.from_lists(keys, ["duration"], [durations]) + seg_set.save(output_file) + + +def main(): + parser = ArgumentParser(description="Writes audio file durations to table") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--audio-file", required=True) + parser.add_argument("--output-file", required=True) + AR.add_class_args(parser) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + help="Verbose level", + ) + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + audio_to_duration(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/cluster_embeddings.py b/hyperion/bin/cluster_embeddings.py new file mode 100644 index 00000000..fb30fcae --- /dev/null +++ 
b/hyperion/bin/cluster_embeddings.py @@ -0,0 +1,583 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +import sys +import time +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) +from scipy import sparse + +from hyperion.hyp_defs import config_logger +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.clustering import AHC, KMeans, KMeansInitMethod, SpectralClustering +from hyperion.np.pdfs import SPLDA, DiagGMM, PLDAFactory +from hyperion.np.transforms import PCA, LNorm +from hyperion.utils import SegmentSet +from hyperion.utils.math_funcs import cosine_scoring + +subcommand_list = ["cos_ahc", "spectral_clustering", "cos_ahc_plda_ahc"] + + +def add_common_args(parser): + parser.add_argument("--feats-file", required=True) + parser.add_argument("--segments-file", required=True) + parser.add_argument("--output-file", required=True) + parser.add_argument( + "--filter-by-gmm-post", + default=0, + type=float, + help="remove segments with gmm posterior lower than threshold", + ) + + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + ) + + +def load_data(segments_file, feats_file): + logging.info("loading data") + segments = SegmentSet.load(segments_file) + reader = DRF.create(feats_file) + x = reader.read(segments["id"], squeeze=True) + return segments, x + + +def do_pca(x, pca_args): + pca_var_r = pca_args["pca_var_r"] + logging.info("computing pca pca_var_r=%f", pca_var_r) + if pca_var_r < 1: + pca = PCA(**pca_args) + pca.fit(x) + x = pca(x) + logging.info("pca-dim=%d", x.shape[1]) + + return x + + +def do_kmeans(x, samples_per_cluster, epochs, rtol, init_method, num_workers): + if samples_per_cluster > 1: + km_clusters = x.shape[0] // samples_per_cluster + logging.info("kmeans with num_clusters=%d", km_clusters) + kmeans = KMeans( + num_clusters=km_clusters, + rtol=rtol, + epochs=epochs, + init_method=init_method, + num_workers=num_workers, + ) + kmeans.fit(x) + idx_km, _ = kmeans(x) + x_km = kmeans.mu + del kmeans + else: + idx_km = None + x_km = x + + return x_km, idx_km + + +def change_precision(x, precision=None): + if precision == "single": + return x.astype(np.float32) + elif precision == "half": + return x.astype(np.float16) + else: + return x + + +def do_cosine_scoring(x, precision=None): + logging.info("compute cosine affinity matrix") + x = change_precision(x) + return cosine_scoring(x, x) + + +def train_plda(x, y, plda, min_samples_per_cluster, max_samples_per_cluster=None): + logging.info("Train Centering/Whitening + PLDA") + _, cluster_idx, counts = np.unique(y, return_inverse=True, return_counts=True) + max_samples_per_cluster = ( + np.max(counts) if max_samples_per_cluster is None else max_samples_per_cluster + ) + transforms = LNorm() + transforms.fit(x) + if plda["y_dim"] > x.shape[1]: + plda["y_dim"] = x.shape[1] + plda_model = PLDAFactory.create(**plda) + + counts = counts[cluster_idx] + keep = np.logical_and( + counts >= min_samples_per_cluster, counts <= max_samples_per_cluster + ) + x = x[keep] + cluster_idx = cluster_idx[keep] + _, cluster_idx = np.unique(cluster_idx, return_inverse=True) + plda_model.fit(x, class_ids=cluster_idx) + + return transforms, plda_model + + +def 
do_ahc(scores, linkage_method, stop_criterion, threshold, num_clusters): + logging.info( + f"running AHC stop_criterion: {stop_criterion} thr: {threshold} num_clusters: {num_clusters}", + ) + ahc = AHC(method=linkage_method) + ahc.fit(scores) + if stop_criterion == "threshold": + y = ahc.get_flat_clusters_from_thr(threshold) + else: + y = ahc.get_flat_clusters_from_num_clusters(num_clusters) + + return y + + +def get_gmm_post(x, y): + logging.info("computing cluster posteriors with gmm") + num_comp = np.max(y) + 1 + gmm = DiagGMM(num_comp=num_comp, x_dim=x.shape[1], min_N=1) + u_dim = gmm.compute_suff_stats(x[:1]).shape[1] + N = np.zeros((num_comp,), dtype=float) + 1e-5 + u_x = np.zeros((num_comp, u_dim), dtype=float) + + for c in range(num_comp): + mask = y == c + N_c = np.sum(mask) + if N_c == 0: + continue + + N[c] = N_c + u_x_c = gmm.compute_suff_stats(x[mask]) + u_x[c] = np.sum(u_x_c, axis=0) + + gmm.Mstep(N, u_x) + p = gmm.compute_pz(x, mode="std") + p_max = p[np.arange(x.shape[0]), y] + p_2nd = np.sort(p, axis=1, kind="heapsort")[:, -2] + return p_max, p_2nd + + +def plot_score_hist(scores, fig_file): + mask = np.triu(np.ones_like(scores, dtype=bool)) + fig = plt.figure() + scores = scores[mask] + logging.info( + f"score-mean=%f score-std=%f score-max=%f score-min=%f", + scores.mean(), + scores.std(), + scores.max(), + scores.min(), + ) + if np.any(scores < -1.1) or np.any(scores > 1.1): + # if scores come from plda we limit the max and min val + thr = 2 * np.std(scores) + scores = scores.copy() + scores[scores > thr] = thr + scores[scores < -thr] = -thr + + plt.hist(scores, bins=100, density=True) + fig.savefig(fig_file) + + +def plot_cluster_size_hist(y, fig_file): + _, counts = np.unique(y, return_counts=True) + fig = plt.figure() + bins = np.arange(1, np.max(counts) + 1) + plt.hist(counts, bins=bins, density=False) + fig.savefig(fig_file) + + +def cos_ahc( + segments_file, + feats_file, + output_file, + lnorm, + pca, + linkage_method, + stop_criterion, + num_clusters, + threshold, + ahc_precision, + pre_kmeans, + num_workers, + filter_by_gmm_post, +): + Path(output_file).parent.mkdir(exist_ok=True, parents=True) + segments, x = load_data(segments_file, feats_file) + if lnorm: + x = LNorm()(x) + + x = do_pca(x, pca) + x_km, idx_km = do_kmeans(x, num_workers=num_workers, **pre_kmeans) + scores = do_cosine_scoring(x_km, ahc_precision) + fig_file = Path(output_file).parent / "score_hist.png" + plot_score_hist(scores, fig_file) + y = do_ahc(scores, linkage_method, stop_criterion, threshold, num_clusters) + if idx_km is not None: + y = y[idx_km] + del x_km + + p_max, p_2nd = get_gmm_post(x, y) + segments["cluster"] = y + segments["post_cluster"] = p_max + segments["post_cluster_2nd"] = p_2nd + if filter_by_gmm_post > 0: + idx = segments["post_cluster"] > filter_by_gmm_post + segments = SegmentSet(segments.loc[idx]) + + segments.save(output_file) + fig_file = Path(output_file).parent / "cluster_size_hist.png" + plot_cluster_size_hist(segments["cluster"], fig_file) + + +def make_cos_ahc_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + add_common_args(parser) + parser.add_argument("--lnorm", default=False, action=ActionYesNo) + PCA.add_class_args(parser, prefix="pca") + parser.add_argument( + "--linkage-method", + default="average", + choices=["single", "complete", "average", "weighted", "ward"], + help="linkage method", + ) + parser.add_argument( + "--stop-criterion", + default="threshold", + choices=["threshold", "num_clusters"], + 
help="stopping criterion", + ) + parser.add_argument( + "--num-clusters", default=None, type=int, help="number of AHC clusters" + ) + parser.add_argument("--threshold", default=0, type=float, help="stopping threshold") + parser.add_argument( + "--ahc-precision", default="single", choices=["half", "single", "double"] + ) + parser.add_argument( + "--pre_kmeans.samples-per-cluster", + default=1, + type=int, + help="first k-means is done to recuce the computing cost of AHC", + ) + parser.add_argument( + "--pre_kmeans.init_method", + default=KMeansInitMethod.max_dist, + choices=KMeansInitMethod.choices(), + ) + parser.add_argument("--pre_kmeans.epochs", default=100, type=int) + parser.add_argument("--pre_kmeans.rtol", default=0.001, type=float) + parser.add_argument("--num-workers", default=1, type=int) + return parser + + +def cos_ahc_plda_ahc( + segments_file, + feats_file, + output_file, + lnorm, + pca, + linkage_method, + stop_criterion, + num_clusters_stage_1, + threshold_stage_1, + num_clusters_stage_2, + threshold_stage_2, + min_samples_per_cluster, + max_samples_per_cluster, + plda, + ahc_precision, + pre_kmeans, + num_workers, + filter_by_gmm_post, +): + Path(output_file).parent.mkdir(exist_ok=True, parents=True) + segments, x = load_data(segments_file, feats_file) + if lnorm: + x = LNorm()(x) + + x = do_pca(x, pca) + + # stage 1 + x_km, idx_km = do_kmeans(x, num_workers=num_workers, **pre_kmeans) + scores = do_cosine_scoring(x_km, ahc_precision) + fig_file = Path(output_file).parent / "cosine_score_hist.png" + plot_score_hist(scores, fig_file) + y = do_ahc( + scores, linkage_method, stop_criterion, threshold_stage_1, num_clusters_stage_1 + ) + if idx_km is not None: + y = y[idx_km] + del x_km + + fig_file = Path(output_file).parent / "cosine_cluster_size_hist.png" + plot_cluster_size_hist(y, fig_file) + # stage 2 + transform, plda_model = train_plda( + x, y, plda, min_samples_per_cluster, max_samples_per_cluster + ) + x = transform(x) + z = plda_model.compute_py_g_x(x) + _, idx_km = do_kmeans(z, num_workers=num_workers, **pre_kmeans) + + if idx_km is None: + scores = plda_model.llr_1vs1(x, x) + else: + scores = plda_model.llr_NvsM(x, x, ids1=idx_km, ids2=idx_km) + + scores = change_precision(scores, ahc_precision) + fig_file = Path(output_file).parent / "plda_score_hist.png" + plot_score_hist(scores, fig_file) + y = do_ahc( + scores, linkage_method, stop_criterion, threshold_stage_2, num_clusters_stage_2 + ) + if idx_km is not None: + y = y[idx_km] + + p_max, p_2nd = get_gmm_post(x, y) + segments["cluster"] = y + segments["post_cluster"] = p_max + segments["post_cluster_2nd"] = p_2nd + if filter_by_gmm_post > 0: + idx = segments["post_cluster"] > filter_by_gmm_post + segments = SegmentSet(segments.loc[idx]) + + segments.save(output_file) + fig_file = Path(output_file).parent / "plda_cluster_size_hist.png" + plot_cluster_size_hist(segments["cluster"], fig_file) + + +def make_cos_ahc_plda_ahc_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + add_common_args(parser) + parser.add_argument("--lnorm", default=False, action=ActionYesNo) + PCA.add_class_args(parser, prefix="pca") + parser.add_argument( + "--linkage-method", + default="average", + choices=["single", "complete", "average", "weighted", "ward"], + help="linkage method", + ) + parser.add_argument( + "--stop-criterion", + default="threshold", + choices=["threshold", "num_clusters"], + help="stopping criterion", + ) + parser.add_argument( + "--num-clusters-stage-1", + default=None, + 
type=int, + help="number of AHC clusters for first stage", + ) + parser.add_argument( + "--threshold-stage-1", + default=0, + type=float, + help="stopping threshold for first stage", + ) + parser.add_argument( + "--num-clusters-stage-2", + default=None, + type=int, + help="number of AHC clusters for first stage", + ) + parser.add_argument( + "--threshold-stage-2", + default=0, + type=float, + help="stopping threshold for first stage", + ) + parser.add_argument( + "--ahc-precision", default="single", choices=["half", "single", "double"] + ) + parser.add_argument( + "--min-samples-per-cluster", + default=8, + type=int, + help="minimum samples/cluster for a cluster to be used to train PLDA", + ) + parser.add_argument( + "--max-samples-per-cluster", + default=50, + type=int, + help="maximum samples/cluster for a cluster to be used to train PLDA", + ) + PLDAFactory.add_class_args(parser, prefix="plda") + parser.add_argument( + "--pre_kmeans.samples-per-cluster", + default=1, + type=int, + help="first k-means is done to recuce the computing cost of AHC", + ) + parser.add_argument( + "--pre_kmeans.init_method", + default=KMeansInitMethod.max_dist, + choices=KMeansInitMethod.choices(), + ) + parser.add_argument("--pre_kmeans.epochs", default=100, type=int) + parser.add_argument("--pre_kmeans.rtol", default=0.001, type=float) + parser.add_argument("--num-workers", default=1, type=int) + return parser + + +def compute_sc_affinity(x, aff_func, gauss_sigma, aff_thr, precision): + if precision == "single": + x = x.astype(np.float32) + elif precision == "half": + x = x.astype(np.float16) + + scores = cosine_scoring(x, x) + if aff_func == "gauss_cos": + assert gauss_sigma > 0 + d2 = 1 - scores + scores = np.exp(-d2 / gauss_sigma) + + assert aff_thr < 1 + scores[scores < aff_thr] = 0 + num_nodes = scores.shape[0] + scores.flat[:: num_nodes + 1] = 0 + aff_size = num_nodes**2 + num_edges = np.sum(scores > 0) + r = aff_size / num_edges + logging.info("num_nodes^2=%d, num_edges=%d r=%f", aff_size, num_edges, r) + if r > 4: + scores = sparse.csr_matrix(scores) + return scores + + +def spectral_clustering( + segments_file, + feats_file, + output_file, + lnorm, + pca, + pre_kmeans, + affinity, + spectral_clustering, + filter_by_gmm_post, +): + Path(output_file).parent.mkdir(exist_ok=True, parents=True) + segments, x = load_data(segments_file, feats_file) + if lnorm: + x = LNorm()(x) + + x = do_pca(x, pca) + x_km, idx_km = do_kmeans(x, **pre_kmeans) + A = compute_sc_affinity(x_km, **affinity) + sc = SpectralClustering(**spectral_clustering) + y, num_clusters, eigengap_stats = sc.fit(A) + if idx_km is not None: + y = y[idx_km] + del x_km + + segments["cluster"] = y + if num_clusters > 1: + p_max, p_2nd = get_gmm_post(x, y) + segments["post_cluster"] = p_max + segments["post_cluster_2nd"] = p_2nd + + if filter_by_gmm_post > 0: + idx = segments["post_cluster"] > filter_by_gmm_post + segments = SegmentSet(segments.loc[idx]) + + segments.save(output_file) + output_file = Path(output_file) + fig_file = Path(output_file).parent / "cluster_size_hist.png" + plot_cluster_size_hist(segments["cluster"], fig_file) + + fig_file = output_file.with_stem(output_file.stem + "_eigengap").with_suffix(".png") + sc.plot_eigengap_stats(eigengap_stats, num_clusters, fig_file) + + df_eig = pd.DataFrame( + {k: eigengap_stats[k] for k in ["eig_vals", "eigengap", "d_eig_vals"]} + ) + df_eig["num_clusters"] = np.arange(1, len(df_eig) + 1) + eig_file = fig_file.with_suffix(".csv") + df_eig.to_csv(eig_file, index=False) + + +def 
make_spectral_clustering_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + add_common_args(parser) + parser.add_argument("--lnorm", default=False, action=ActionYesNo) + PCA.add_class_args(parser, prefix="pca") + parser.add_argument( + "--pre_kmeans.samples-per-cluster", + default=1, + type=int, + help="a first k-means pass is done to reduce the computational cost of AHC", + ) + parser.add_argument( + "--pre_kmeans.init_method", + default=KMeansInitMethod.max_dist, + choices=KMeansInitMethod.choices(), + ) + parser.add_argument("--pre_kmeans.epochs", default=100, type=int) + parser.add_argument("--pre_kmeans.rtol", default=0.001, type=float) + parser.add_argument("--pre_kmeans.num_workers", default=1, type=int) + parser.add_argument( + "--affinity.aff_func", default="cos", choices=["cos", "gauss_cos"] + ) + parser.add_argument( + "--affinity.gauss-sigma", + default=1, + type=float, + help="std. dev. of gauss function", + ) + parser.add_argument( + "--affinity.aff-thr", + default=0, + type=float, + help="values under this are set to 0", + ) + parser.add_argument( + "--affinity.precision", default="single", choices=["half", "single", "double"] + ) + SpectralClustering.add_class_args(parser, prefix="spectral_clustering") + + return parser + + +def main(): + parser = ArgumentParser( + description="Cluster embeddings into classes, usually speakers" + ) + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for subcommand in subcommand_list: + parser_func = f"make_{subcommand}_parser" + subparser = globals()[parser_func]() + subcommands.add_subcommand(subcommand, subparser) + + args = parser.parse_args() + subcommand = args.subcommand + kwargs = namespace_to_dict(args)[args.subcommand] + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + del kwargs["cfg"] + globals()[subcommand](**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/compute-energy-vad.py b/hyperion/bin/compute-energy-vad.py deleted file mode 100755 index 397aea80..00000000 --- a/hyperion/bin/compute-energy-vad.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2018 Jesus Villalba (Johns Hopkins University) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import sys -import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time -import logging - -import numpy as np - -from hyperion.hyp_defs import config_logger -from hyperion.io import SequentialAudioReader as AR -from hyperion.io import DataWriterFactory as DWF -from hyperion.feats import EnergyVAD - - -def compute_vad(input_path, output_path, write_num_frames, **kwargs): - - vad_args = EnergyVAD.filter_args(**kwargs) - vad = EnergyVAD(**vad_args) - - input_args = AR.filter_args(**kwargs) - reader = AR(input_path, **input_args) - - writer = DWF.create(output_path, scp_sep=" ") - - if write_num_frames is not None: - f_num_frames = open(write_num_frames, "w") - - for data in reader: - key, x, fs = data - logging.info("Extracting VAD for %s" % (key)) - t1 = time.time() - y = vad.compute(x) - dt = (time.time() - t1) * 1000 - rtf = vad.frame_shift * y.shape[0] / dt - num_speech_frames = np.sum(y) - prob_speech = num_speech_frames / y.shape[0] * 100 - logging.info( - "Extracted VAD for %s detected %d/%d (%f %%) speech frames, elapsed-time=%.2f ms. 
real-time-factor=%.2f" - % (key, num_speech_frames, y.shape[0], prob_speech, dt, rtf) - ) - writer.write([key], [y]) - if write_num_frames is not None: - f_num_frames.write("%s %d\n" % (key, y.shape[0])) - - vad.reset() - - if write_num_frames is not None: - f_num_frames.close() - - -if __name__ == "__main__": - - parser = ArgumentParser(description="Compute Kaldi Energy VAD") - - parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_path", required=True) - parser.add_argument("--output", dest="output_path", required=True) - parser.add_argument("--write-num-frames", default=None) - - AR.add_class_args(parser) - EnergyVAD.add_class_args(parser) - parser.add_argument( - "-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int, - help="Verbose level", - ) - args = parser.parse_args() - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - compute_vad(**namespace_to_dict(args)) diff --git a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py new file mode 100755 index 00000000..fe0b1d8e --- /dev/null +++ b/hyperion/bin/compute_energy_vad.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python +""" + Copyright 2018 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +import sys +import time + +import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.np.feats import EnergyVAD + + +def compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): + vad_args = EnergyVAD.filter_args(**kwargs) + vad = EnergyVAD(**vad_args) + + input_args = AR.filter_args(**kwargs) + reader = AR(recordings_file, **input_args) + + metadata_columns = [ + "frame_shift", + "frame_length", + "num_frames", + "num_speech_frames", + "prob_speech", + ] + + writer = DWF.create(output_spec, metadata_columns=metadata_columns) + + if write_num_frames is not None: + f_num_frames = open(write_num_frames, "w") + + for data in reader: + key, x, fs = data + logging.info("Extracting VAD for %s", key) + t1 = time.time() + y = vad.compute(x) + dt = (time.time() - t1) * 1000 + rtf = vad.frame_shift * y.shape[0] / dt + num_speech_frames = np.sum(y) + prob_speech = num_speech_frames / y.shape[0] * 100 + + logging.info( + "Extracted VAD for %s detected %d/%d (%f %%) speech frames, elapsed-time=%.2f ms. 
real-time-factor=%.2f", + key, + num_speech_frames, + y.shape[0], + prob_speech, + dt, + rtf, + ) + metadata = { + "frame_shift": vad.frame_shift, + "frame_length": vad.frame_length, + "num_frames": y.shape[0], + "num_speech_frames": num_speech_frames, + "prob_speech": prob_speech, + } + writer.write([key], [y], metadata) + if write_num_frames is not None: + f_num_frames.write("%s %d\n" % (key, y.shape[0])) + + vad.reset() + + if write_num_frames is not None: + f_num_frames.close() + + +def main(): + parser = ArgumentParser(description="Compute Kaldi Energy VAD") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--recordings-file", required=True) + parser.add_argument("--output-spec", required=True) + parser.add_argument("--write-num-frames", default=None) + parser.add_argument("--write-stats", default=None) + + AR.add_class_args(parser) + EnergyVAD.add_class_args(parser) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + help="Verbose level", + ) + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + compute_vad(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/compute_mfcc_feats.py b/hyperion/bin/compute_mfcc_feats.py new file mode 100755 index 00000000..f42f260d --- /dev/null +++ b/hyperion/bin/compute_mfcc_feats.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python +""" + Copyright 2018 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +import sys +import time + +import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import SequentialDataReaderFactory as DRF +from hyperion.io import compression_methods +from hyperion.np.feats import MFCC + + +def compute_mfcc_feats( + input_path, output_path, compress, compression_method, write_num_frames, **kwargs +): + mfcc_args = MFCC.filter_args(**kwargs) + mfcc = MFCC(**mfcc_args) + + if mfcc.input_step == "wave": + input_args = AR.filter_args(**kwargs) + reader = AR(input_path, **input_args) + else: + input_args = DRF.filter_args(**kwargs) + reader = DRF.create(input_path, **input_args) + + writer = DWF.create( + output_path, + compress=compress, + compression_method=compression_method, + ) + + if write_num_frames is not None: + f_num_frames = open(write_num_frames, "w") + + for data in reader: + if mfcc.input_step == "wave": + key, x, fs = data + else: + key, x = data + logging.info("Extracting MFCC for %s num_samples=%d" % (key, len(x))) + t1 = time.time() + y = mfcc.compute(x) + dt = (time.time() - t1) * 1000 + rtf = dt / (mfcc.frame_shift * y.shape[0]) + logging.info( + "Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. 
real-time-factor=%.2f", + key, + y.shape[0], + dt, + rtf, + ) + writer.write([key], [y]) + + if write_num_frames is not None: + f_num_frames.write("%s %d\n" % (key, y.shape[0])) + + mfcc.reset() + + if write_num_frames is not None: + f_num_frames.close() + + +def main(): + parser = ArgumentParser(description="Compute MFCC features") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--output", dest="output_path", required=True) + parser.add_argument("--write-num-frames", default=None) + + AR.add_class_args(parser) + DRF.add_class_args(parser) + MFCC.add_class_args(parser) + parser.add_argument( + "--compress", + dest="compress", + default=False, + action="store_true", + help="Compress the features", + ) + parser.add_argument( + "--compression-method", + dest="compression_method", + default="auto", + choices=compression_methods, + help="Compression method", + ) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + help="Verbose level", + ) + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + compute_mfcc_feats(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/copy_feats.py b/hyperion/bin/copy_feats.py new file mode 100755 index 00000000..4ffc1a58 --- /dev/null +++ b/hyperion/bin/copy_feats.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python +""" + Copyright 2018 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + Copy features/vectors and change format +""" + +import argparse +import logging +import os +import sys +import time + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.io import CopyFeats as CF + + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Copy features and change format", + ) + + parser.add_argument("--input", dest="input_spec", nargs="+", required=True) + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--write-num-frames", dest="write_num_frames", default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + CF.add_argparse_args(parser) + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + CF(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/decode_wav2transducer.py b/hyperion/bin/decode_wav2transducer.py new file mode 100755 index 00000000..bcf9e05c --- /dev/null +++ b/hyperion/bin/decode_wav2transducer.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time +from typing import Dict, List, Tuple + +import numpy as np +import pandas as pd +import sentencepiece as spm +import torch +import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.np.augment import SpeechAugment +from hyperion.torch import TorchModelLoader as TML +from 
hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("transducer-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def decode_one_batch( + model: nn.Module, + sp: spm.SentencePieceProcessor, + x: torch.Tensor, + decoding_method="beam_search", +) -> List[str]: + """Decode one utterance and return the hypothesis. + Args: + model: + The neural transducer model. + sp: + The BPE model. + x: + Input waveform tensor with shape (1, num_samples). + decoding_method: + Either "greedy_search" or "beam_search". + Returns: + The decoded words of the utterance as a list of strings. + """ + device = model.device + feature = x # batch["inputs"] + assert x.shape[0] == 1 + assert feature.ndim == 2 + + feature = feature.to(device) + # at entry, feature is (N, num_samples) + + feature_lens = torch.Tensor([x.shape[1]]).int() + + encoder_out, hid_feats, encoder_out_lens = model.forward_feats( + x=feature, x_lengths=feature_lens + ) + + hyps = [] + batch_size = encoder_out.size(0) + + encoder_out = encoder_out.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + for i in range(batch_size): + # fmt: off + encoder_out_i = encoder_out[i:i + 1, :encoder_out_lens[i]] + # fmt: on + if decoding_method == "greedy_search": + hyp = greedy_search(model=model, encoder_out=encoder_out_i) + elif decoding_method == "beam_search": + hyp = beam_search(model=model, encoder_out=encoder_out_i, beam=5) + else: + raise ValueError(f"Unsupported decoding method: {decoding_method}") + hyps.append(sp.decode(hyp).split()) + + logging.info("hyps:{}".format(" ".join(hyps[0]))) + + return hyps[0] + + +def decode_transducer( + input_spec, output_spec, model_path, bpe_model, use_gpu, **kwargs +): + device = init_device(use_gpu) + model = load_model(model_path, device) + + sp = spm.SentencePieceProcessor() + sp.load(bpe_model) + + augmenter = None + aug_df = None + num_augs = 1 + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output: %s" % (output_spec)) + with open(output_spec, "w") as writer: + logging.info( + "opening input stream: {} with args={}".format(input_spec, ar_args) + ) + with AR(input_spec, **ar_args) as reader: + while not reader.eof(): + t1 = time.time() + key, x0, fs = reader.read(1) + if len(key) == 0: + break + + x0 = x0[0] + key0 = key[0] + t2 = time.time() + + logging.info("processing utt %s" % (key0)) + for aug_id in range(num_augs): + t3 = time.time() + key, x = key0, x0 # augment(key0, x0, augmenter, 
aug_df, aug_id) + t4 = time.time() + with torch.no_grad(): + x = torch.tensor( + x[None, :], dtype=torch.get_default_dtype() + ).to(device) + + t5 = time.time() + tot_frames = x.shape[1] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, + ) + ) + + t6 = time.time() + if x.shape[1] == 0: + y = np.zeros((model.embed_dim,), dtype=float_cpu()) + else: + y = decode_one_batch(model=model, sp=sp, x=x) + + t7 = time.time() + writer.write(key + " " + " ".join(y) + "\n") + + t8 = time.time() + read_time = t2 - t1 + tot_time = read_time + t8 - t3 + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ) + % ( + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, + ) + ) + + +def main(): + parser = ArgumentParser( + description=( + "ASR decoding with RNN-T transducer models, " "computing acoustic features on the fly" + ) + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + + AR.add_class_args(parser) + + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--model-path", required=True) + + parser.add_argument("--bpe-model", required=True) + + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="run decoding in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + decode_transducer(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/decode_wav2vec2rnn_transducer.py b/hyperion/bin/decode_wav2vec2rnn_transducer.py new file mode 100755 index 00000000..33aea8c3 --- /dev/null +++ b/hyperion/bin/decode_wav2vec2rnn_transducer.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time +from typing import Dict, List, Tuple + +import numpy as np +import pandas as pd +import sentencepiece as spm +import torch +import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.np.augment import SpeechAugment +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.models import HFWav2Vec2RNNTransducer +from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("transducer-model={}".format(model)) + 
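# move the model to the selected device and put it in eval mode to disable dropout and batch-norm updates + 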
model.to(device) + model.eval() + return model + + +def decode_one_batch( + model: nn.Module, + sp: spm.SentencePieceProcessor, + x: torch.Tensor, + decoding_method="beam_search", +) -> List[str]: + """Decode one utterance and return the hypothesis. + Args: + model: + The neural transducer model. + sp: + The BPE model. + x: + Input waveform tensor with shape (1, num_samples). + decoding_method: + Either "greedy_search" or "beam_search". + Returns: + The decoded words of the utterance as a list of strings. + """ + device = model.device + feature = x # batch["inputs"] + assert x.shape[0] == 1 + assert feature.ndim == 2 + + feature = feature.to(device) + # at entry, feature is (N, num_samples) + + feature_lens = torch.Tensor([x.shape[1]]).int() + + encoder_out, hid_feats, encoder_out_lens = model.forward_feats( + x=feature, x_lengths=feature_lens + ) + + hyps = [] + batch_size = encoder_out.size(0) + + encoder_out = encoder_out.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + for i in range(batch_size): + # fmt: off + encoder_out_i = encoder_out[i:i + 1, :encoder_out_lens[i]] + # fmt: on + if decoding_method == "greedy_search": + hyp = greedy_search(model=model, encoder_out=encoder_out_i) + elif decoding_method == "beam_search": + hyp = beam_search(model=model, encoder_out=encoder_out_i, beam=5) + else: + raise ValueError(f"Unsupported decoding method: {decoding_method}") + hyps.append(sp.decode(hyp).split()) + + logging.info("hyps:{}".format(" ".join(hyps[0]))) + + return hyps[0] + + +def decode_transducer( + input_spec, + output_spec, + scp_sep, + model_path, + bpe_model, + infer_args, + use_gpu, + **kwargs, +): + device = init_device(use_gpu) + model = load_model(model_path, device) + + logging.info("bpe-model=%s", bpe_model) + sp = spm.SentencePieceProcessor() + sp.load(bpe_model) + + infer_args = HFWav2Vec2RNNTransducer.filter_infer_args(**infer_args) + logging.info(f"infer-args={infer_args}") + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output: %s", output_spec) + with open(output_spec, "w") as writer: + logging.info(f"opening input stream: {input_spec} with args={ar_args}") + with AR(input_spec, **ar_args) as reader: + while not reader.eof(): + t1 = time.time() + key, x, fs = reader.read(1) + if len(key) == 0: + break + + x, key, fs = x[0], key[0], fs[0] + t2 = time.time() + logging.info("processing utt %s", key) + with torch.no_grad(): + x = torch.tensor(x[None, :], dtype=torch.get_default_dtype()).to( + device + ) + + tot_frames = x.shape[1] + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, + ) + + if x.shape[1] == 0: + y = [""] + else: + # y = decode_one_batch(model=model, sp=sp, x=x) + x_lengths = torch.tensor( + (x.shape[1],), dtype=torch.long, device=device + ) + y = model.infer(x, x_lengths, **infer_args) + + y = sp.decode(y[0]) + logging.info(f"utt: {key} hyps: {y}") + t3 = time.time() + writer.write(f"{key} 
{y}\n") + + t4 = time.time() + tot_time = t4 - t1 + infer_time = t3 - t2 + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f " + "infer-time=%.3f " + "write-time=%.3f " + "infer-rt-factor=%.2f tot-rt-factor=%.2f" + ), + key, + tot_time, + t2 - t1, + infer_time, + t4 - t3, + x.shape[1] / fs / infer_time, + x.shape[1] / fs / tot_time, + ) + + +def main(): + parser = ArgumentParser( + description=("ASR decoding for RNN-T with Wav2vec features") + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) + + AR.add_class_args(parser) + parser.add_argument("--model-path", required=True) + parser.add_argument("--bpe-model", required=True) + + HFWav2Vec2RNNTransducer.add_infer_args(parser, "infer-args") + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + decode_transducer(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_cosine_scoring_backend.py b/hyperion/bin/eval_cosine_scoring_backend.py new file mode 100755 index 00000000..835cae0b --- /dev/null +++ b/hyperion/bin/eval_cosine_scoring_backend.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" +import logging +import time +from pathlib import Path + +import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.score_norm import AdaptSNorm +from hyperion.np.transforms import TransformList +from hyperion.utils import EnrollmentMap, SegmentSet, TrialKey, TrialNdx, TrialScores +from hyperion.utils.math_funcs import cosine_scoring + + +def load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + test_feats_reader = DRF.create(feats_file) + if enroll_feats_file is not None and enroll_feats_file != feats_file: + enroll_feats_reader = DRF.create(enroll_feats_file) + else: + enroll_feats_reader = test_feats_reader + + enroll_map = EnrollmentMap.load(enroll_map_file) + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() + + if num_enroll_parts > 1 or num_test_parts > 1: + ndx = ndx.split( + enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts + ) + + enroll_map = enroll_map.filter(items=ndx.model_set) + x_e = enroll_feats_reader.read(enroll_map["segmentid"], squeeze=True) + x_t = test_feats_reader.read(ndx.seg_set, squeeze=True) + return enroll_map, ndx, x_e, x_t + + +def load_cohort_data(segments_file, feats_file): + segments = SegmentSet.load(segments_file) + feats_reader = DRF.create(feats_file) + x = feats_reader.read(segments["id"], squeeze=True) + return segments, x + + +def eval_backend( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + preproc_file, + score_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + 
num_test_parts, +): + test_feats_reader = DRF.create(feats_file) + if enroll_feats_file is not None and enroll_feats_file != feats_file: + enroll_feats_reader = DRF.create(enroll_feats_file) + else: + enroll_feats_reader = test_feats_reader + + enroll_map = EnrollmentMap.load(enroll_map_file) + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() + + if num_enroll_parts > 1 or num_test_parts > 1: + ndx = ndx.split( + enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts + ) + + enroll_map = enroll_map.filter(items=ndx.model_set) + x_e = enroll_feats_reader.read(enroll_map["segmentid"], squeeze=True) + x_t = test_feats_reader.read(ndx.seg_set, squeeze=True) + return enroll_map, ndx, x_e, x_t + + +def load_cohort_data(segments_file, feats_file): + segments = SegmentSet.load(segments_file) + feats_reader = DRF.create(feats_file) + x = feats_reader.read(segments["id"], squeeze=True) + return segments, x + + +def eval_backend( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + preproc_file, + score_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + cohort_segments_file, + cohort_feats_file, + cohort_nbest, + avg_cohort_by, +): + logging.info("loading data") + enroll_map, ndx, x_e, x_t = load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + enroll_set, enroll_ids = np.unique(enroll_map["id"], return_inverse=True) + + t1 = time.time() + logging.info("computing score") + if preproc_file is not None: + preprocessor = TransformList.load(preproc_file) + x_e = preprocessor(x_e) + x_t = preprocessor(x_t) + + scores = cosine_scoring(x_e, x_t, ids1=enroll_ids) + dt = time.time() - t1 + num_trials = scores.shape[0] * scores.shape[1] + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + if cohort_segments_file is not None: + t1 = time.time() + cohort_segments, x_coh = load_cohort_data( + cohort_segments_file, cohort_feats_file + ) + if preproc_file is not None: + x_coh = preprocessor(x_coh) + + if avg_cohort_by is not None: + cohort_class = cohort_segments[avg_cohort_by] + _, cohort_ids = np.unique(cohort_class, return_inverse=True) + else: + cohort_ids = None + + logging.info("computing enroll vs cohort") + scores_enr_coh = cosine_scoring(x_e, x_coh, ids2=cohort_ids) + logging.info("computing cohort vs test") + scores_coh_test = cosine_scoring(x_coh, x_t, ids1=cohort_ids) + snorm = AdaptSNorm(cohort_nbest) + scores = snorm(scores, scores_coh_test, scores_enr_coh) + dt = time.time() - t1 + logging.info( + "s-norm elapsed time: %.2f s. elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + if num_enroll_parts > 1 or num_test_parts > 1: + score_file = Path(score_file) + new_suffix = f".{enroll_part_idx}.{test_part_idx}{score_file.suffix}" + score_file = score_file.with_suffix(new_suffix) + + logging.info("saving scores to %s", score_file) + # sort scores rows to match the ndx model_set order + sort_idx = [np.nonzero(enroll_set == e)[0][0] for e in ndx.model_set] + scores = scores[sort_idx] + scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) + scores.save(score_file) + + +def main(): + parser = ArgumentParser(description="Eval cosine-scoring with optional AS-Norm") + + parser.add_argument("--enroll-feats-file", default=None) + parser.add_argument("--feats-file", required=True) + parser.add_argument("--ndx-file", required=True) + parser.add_argument("--enroll-map-file", required=True) + parser.add_argument("--preproc-file", default=None) + parser.add_argument("--cohort-segments-file", default=None) + parser.add_argument("--cohort-feats-file", default=None) + parser.add_argument("--cohort-nbest", type=int, default=1000) + parser.add_argument( + "--avg-cohort-by", + default=None, + help="segments file column to average vectors from the same class", + ) + parser.add_argument("--score-file", required=True) + parser.add_argument( + "--enroll-part-idx", default=1, type=int, help="enroll part index" + ) + parser.add_argument( + "--num-enroll-parts", + default=1, + type=int, + help="""number of parts in which we divide the enroll + list to run evaluation in parallel""", + ) + parser.add_argument("--test-part-idx", default=1, type=int, help="test part index") + parser.add_argument( + "--num-test-parts", + default=1, + type=int, + help="""number of parts in which we divide the test list + to run evaluation in parallel""", + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], 
type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_backend(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py new file mode 100755 index 00000000..4fecf2f3 --- /dev/null +++ b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py @@ -0,0 +1,617 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" +import logging +import time +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.np.score_norm import AdaptSNorm +from hyperion.np.transforms import TransformList +from hyperion.utils import ( + EnrollmentMap, + InfoTable, + SegmentSet, + TrialKey, + TrialNdx, + TrialScores, +) +from hyperion.utils.math_funcs import average_vectors, cosine_scoring + + +def get_precomp_qm_names(quality_measures): + # snorm qm will be calculated later + return [q for q in quality_measures if q not in ["snorm-mu", "snorm-mu/s"]] + + +def normalize_duration(q, min_dur, max_dur, frame_rate): + q = q / frame_rate + q = np.log(np.clip(q, a_min=min_dur, a_max=max_dur)) + log_min_dur = np.log(min_dur) + log_max_dur = np.log(max_dur) + q = (q - log_min_dur) / (log_max_dur - log_min_dur) + return q + + +def load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_segments_file, + segments_file, + quality_measures, + min_dur, + max_dur, + frame_rate, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + test_feats_reader = DRF.create(feats_file) + if enroll_feats_file is not None and enroll_feats_file != feats_file: + enroll_feats_reader = DRF.create(enroll_feats_file) + else: + enroll_feats_reader = test_feats_reader + + enroll_map = EnrollmentMap.load(enroll_map_file) + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() + + if num_enroll_parts > 1 or num_test_parts > 1: + ndx = ndx.split( + enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts + ) + + enroll_map = enroll_map.filter(items=ndx.model_set) + x_e = enroll_feats_reader.read(enroll_map["segmentid"], squeeze=True) + x_t = test_feats_reader.read(ndx.seg_set, squeeze=True) + + # quality measures may be in segments file and/or feature_set file + # so we combine both if both are given + if segments_file is not None: + test_segments = SegmentSet.load(segments_file) + if enroll_segments_file is not None and segments_file != enroll_segments_file: + enroll_segments = SegmentSet.load(enroll_segments_file) + else: + enroll_segments = test_segments + + test_feats_set = test_feats_reader.feature_set + enroll_feats_set = enroll_feats_reader.feature_set + if segments_file: + test_segments.add_columns(test_feats_set) + if enroll_feats_set != test_feats_set or enroll_segments != test_segments: + enroll_segments.add_columns(enroll_feats_set) + else: + test_segments = test_feats_set + enroll_segments = enroll_feats_set + + # now we retrieve the quality measures + q_e = [] + q_t = [] + # snorm qm will be calculated later + retrieve_qm = 
get_precomp_qm_names(quality_measures) + q_e = enroll_segments.loc[enroll_map["segmentid"], retrieve_qm] + q_t = test_segments.loc[ndx.seg_set, retrieve_qm] + + # normalize durations + if "speech_duration" in retrieve_qm: + q_e["speech_duration"] = normalize_duration( + q_e["speech_duration"], min_dur, max_dur, 1 + ) + q_t["speech_duration"] = normalize_duration( + q_t["speech_duration"], min_dur, max_dur, 1 + ) + + if "num_speech_frames" in retrieve_qm: + q_e["num_speech_frames"] = normalize_duration( + q_e["num_speech_frames"], min_dur, max_dur, frame_rate + ) + q_t["num_speech_frames"] = normalize_duration( + q_t["num_speech_frames"], min_dur, max_dur, frame_rate + ) + + # q_e = np.asarray(q_e) + # q_t = np.asarray(q_t) + + return enroll_map, ndx, x_e, x_t, q_e, q_t + + +def load_cohort_data(segments_file, feats_file): + segments = SegmentSet.load(segments_file) + feats_reader = DRF.create(feats_file) + x = feats_reader.read(segments["id"], squeeze=True) + + # segments.add_columns(feats_reader.feature_set) + + # retrieve_qm = get_precomp_qm_names(quality_measures) + # q = np.asarray(segments[retrieve_qm]) + return segments, x # , q + + +def average_qm(q, model_set, ids): + q_avg = average_vectors(q.values, ids) + q_avg = pd.DataFrame(q_avg, columns=q.columns) + q_avg["id"] = model_set + q_avg.set_index("id", drop=False, inplace=True) + return q_avg + + +def get_score_filepath( + score_file, + score_name, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + score_file = Path(score_file) + new_suffix = "" + if score_name is not None: + new_suffix = f".{score_name}" + + if num_enroll_parts > 1 or num_test_parts > 1: + new_suffix = f"{new_suffix}.{enroll_part_idx}.{test_part_idx}" + + if new_suffix: + new_suffix = f"{new_suffix}{score_file.suffix}" + score_file = score_file.with_suffix(new_suffix) + + return score_file + + +def save_scores( + ndx, + scores, + score_file, + score_name, + q_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + score_file = get_score_filepath( + score_file, + score_name, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + logging.info("saving scores to %s", score_file) + scores = TrialScores( + ndx.model_set, ndx.seg_set, scores, ndx.trial_mask, q_measures=q_measures + ) + scores.save(score_file) + + +def save_empty_scores( + ndx, + score_file, + score_name, + q_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + scores = np.zeros(ndx.trial_mask.shape, dtype="float32") + if q_measures is not None: + q_measures = {k: scores for k in q_measures} + + save_scores( + ndx, + scores, + score_file, + score_name, + q_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + +def segment_to_trial_qm(q_e, q_t): + q_trial = {} + for q_name in ["speech_duration", "num_speech_frames"]: + if q_name in q_e: + q_trial_name = f"max_{q_name}" + q_trial[q_trial_name] = np.maximum( + q_e[q_name].values[:, None], q_t[q_name].values[None, :] + ) + q_trial_name = f"min_{q_name}" + q_trial[q_trial_name] = np.minimum( + q_e[q_name].values[:, None], q_t[q_name].values[None, :] + ) + + return q_trial + + +def align_scores_to_ndx(enroll_set, ndx, scores, scores_norm, q_trial): + # sort scores rows to match the ndx model_set order + sort_idx = [np.nonzero(enroll_set == e)[0][0] for e in ndx.model_set] + scores = scores[sort_idx] + if scores_norm is not None: + scores_norm = scores_norm[sort_idx] + for qm in q_trial: + 
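# apply the same row permutation to each quality-measure matrix + 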
q_trial[qm] = q_trial[qm][sort_idx] + + return scores, scores_norm, q_trial + + +# def make_qm_table(ndx, scores, scores_norm, q_trial): +# if scores_norm is None: +# scores = scores[ndx.trial_mask] +# else: +# scores = scores_norm[ndx.trial_mask] + +# for qm in q_trial: +# q_trial[qm] = q_trial[qm][ndx.trial_mask] + +# I, J = np.nonzero(ndx.trial_mask) +# modelid = ndx.model_set[I] +# segmentid = ndx.seg_set[J] +# unique_id = [f"{a}-{b}" for a, b in zip(modelid, segmentid)] + +# q_dict = { +# "id": unique_id, +# "modelid": modelid, +# "segmentid": segmentid, +# "scores": scores, +# } +# q_dict.update(q_trial) +# df = pd.DataFrame(q_dict) +# return InfoTable(df) + + +def eval_backend( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_segments_file, + segments_file, + preproc_file, + qmf_file, + quality_measures, + min_dur, + max_dur, + frame_rate, + cohort_segments_file, + cohort_feats_file, + cohort_nbest, + avg_cohort_by, + score_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + logging.info("loading data") + enroll_map, ndx, x_e, x_t, q_e, q_t = load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_segments_file, + segments_file, + quality_measures, + min_dur, + max_dur, + frame_rate, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + if not np.any(ndx.trial_mask): + # this part doesn't have any trials, save empty files + if qmf_file is not None: + quality_measures = None + save_empty_scores( + ndx, + score_file, + "snorm.qmf" if cohort_segments_file is not None else "qmf", + quality_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + save_empty_scores( + ndx, + score_file, + None, + quality_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + if cohort_segments_file is not None: + save_empty_scores( + ndx, + score_file, + "snorm", + quality_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + return + + enroll_set, enroll_ids = np.unique(enroll_map["id"], return_inverse=True) + q_e = average_qm(q_e, enroll_set, enroll_ids) + + t1 = time.time() + logging.info("computing score") + if preproc_file is not None: + preprocessor = TransformList.load(preproc_file) + x_e = preprocessor(x_e) + x_t = preprocessor(x_t) + + scores = cosine_scoring(x_e, x_t, ids1=enroll_ids) + dt = time.time() - t1 + num_trials = scores.shape[0] * scores.shape[1] + logging.info( + "scoring elapsed time: %.2f s. 
elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + q_trial = segment_to_trial_qm(q_e, q_t) + scores_norm = None + if cohort_segments_file is not None: + t1 = time.time() + cohort_segments, x_coh = load_cohort_data( + cohort_segments_file, cohort_feats_file + ) + if preproc_file is not None: + x_coh = preprocessor(x_coh) + + if avg_cohort_by is not None: + cohort_class = cohort_segments[avg_cohort_by] + _, cohort_ids = np.unique(cohort_class, return_inverse=True) + else: + cohort_ids = None + + logging.info("computing enroll vs cohort") + scores_enr_coh = cosine_scoring(x_e, x_coh, ids2=cohort_ids) + logging.info("computing cohort vs test") + scores_coh_test = cosine_scoring(x_coh, x_t, ids1=cohort_ids) + snorm = AdaptSNorm(cohort_nbest) + scores_norm, mu_z, s_z, mu_t, s_t = snorm( + scores, scores_coh_test, scores_enr_coh, return_stats=True + ) + if "snorm-mu" in quality_measures: + q_trial["max_snorm-mu"] = np.maximum(mu_z, mu_t) + q_trial["min_snorm-mu"] = np.minimum(mu_z, mu_t) + if "snorm-mu/s" in quality_measures: + mu_z = mu_z / s_z + mu_t = mu_t / s_t + q_trial["max_snorm-mu/s"] = np.maximum(mu_z, mu_t) + q_trial["min_snorm-mu/s"] = np.minimum(mu_z, mu_t) + + dt = time.time() - t1 + logging.info( + "s-norm elapsed time: %.2f s. elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + scores, scores_norm, q_trial = align_scores_to_ndx( + enroll_set, ndx, scores, scores_norm, q_trial + ) + if qmf_file is None: + save_scores( + ndx, + scores, + score_file, + None, + q_trial, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + if scores_norm is not None: + save_scores( + ndx, + scores_norm, + score_file, + "snorm", + q_trial, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + # qm_table = make_qm_table(ndx, scores, scores_norm, q_trial) + # qm_file = get_score_filepath( + # score_file, + # "qm", + # enroll_part_idx, + # num_enroll_parts, + # test_part_idx, + # num_test_parts, + # ) + # qm_table.save(qm_file) + return + + save_scores( + ndx, + scores, + score_file, + None, + None, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + if scores_norm is not None: + save_scores( + ndx, + scores_norm, + score_file, + "snorm", + None, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + logging.info("applying qmf") + if scores_norm is None: + score_name = "qmf" + scores_fus = [scores.ravel()] + else: + score_name = "snorm.qmf" + scores_fus = [scores_norm.ravel()] + + q_names = list(q_trial.keys()) + q_names.sort() + for q_name in q_names: + scores_fus.append(q_trial[q_name].ravel()) + + scores_fus = np.vstack(scores_fus).T + lr = LR.load(qmf_file) + scores_fus = lr.predict(scores_fus) + scores_fus = np.reshape(scores_fus, (ndx.num_models, ndx.num_tests)) + save_scores( + ndx, + scores_fus, + score_file, + score_name, + None, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + # score_file_nonorm = get_score_filepath( + # score_file, + # None, + # enroll_part_idx, + # num_enroll_parts, + # test_part_idx, + # num_test_parts, + # ) + # logging.info("saving scores to %s", score_file_nonorm) + # scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) + # scores.save(score_file_nonorm) + + # if scores_norm is not None: + # score_file_snorm = get_score_filepath( + # score_file, + # "snorm", + # enroll_part_idx, + # num_enroll_parts, + # test_part_idx, + # num_test_parts, + # ) + # 
logging.info("saving scores with AS-Norm to %s", score_file_snorm) + # scores.scores = scores_norm + # scores.save(score_file_snorm) + + +def main(): + parser = ArgumentParser( + description="Eval cosine-scoring with optional AS-Norm and QMF" + ) + + parser.add_argument("--enroll-feats-file", default=None) + parser.add_argument("--feats-file", required=True) + parser.add_argument("--ndx-file", required=True) + parser.add_argument("--enroll-map-file", required=True) + parser.add_argument("--enroll-segments-file", default=None) + parser.add_argument("--segments-file", default=None) + parser.add_argument("--preproc-file", default=None) + parser.add_argument("--qmf-file", default=None) + parser.add_argument( + "--quality-measures", + default=["snorm-mu/s", "speech_duration"], + nargs="+", + choices=["snorm-mu/s", "snorm-mu", "speech_duration", "num_speech_frames"], + ) + parser.add_argument( + "--min-dur", default=0.1, type=float, help="lower bound to clip durations" + ) + parser.add_argument( + "--max-dur", default=30.0, type=float, help="upper bound to clip durations" + ) + parser.add_argument( + "--frame-rate", + default=100, + type=float, + help="frames/sec when durationa are expressed in frames", + ) + parser.add_argument("--cohort-segments-file", default=None) + parser.add_argument("--cohort-feats-file", default=None) + parser.add_argument("--cohort-nbest", type=int, default=1000) + parser.add_argument( + "--avg-cohort-by", + default=None, + help="segments file column to average vectors from same class class", + ) + parser.add_argument("--score-file", required=True) + parser.add_argument( + "--enroll-part-idx", default=1, type=int, help="enroll part index" + ) + parser.add_argument( + "--num-enroll-parts", + default=1, + type=int, + help="""number of parts in which we divide the enroll + list to run evaluation in parallel""", + ) + parser.add_argument("--test-part-idx", default=1, type=int, help="test part index") + parser.add_argument( + "--num-test-parts", + default=1, + type=int, + help="""number of parts in which we divide the test list + to run evaluation in parallel""", + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_backend(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_plda_backend.py b/hyperion/bin/eval_plda_backend.py new file mode 100755 index 00000000..2058b2cb --- /dev/null +++ b/hyperion/bin/eval_plda_backend.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" +import logging +import time +from pathlib import Path + +import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np import NPModel +from hyperion.np.pdfs import PLDAFactory, PLDALLRNvsMMethod +from hyperion.np.score_norm import AdaptSNorm +from hyperion.np.transforms import LNorm, TransformList +from hyperion.utils import EnrollmentMap, SegmentSet, TrialKey, TrialNdx, TrialScores +from hyperion.utils.math_funcs import cosine_scoring + + +def load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, 
+ num_test_parts, +): + test_feats_reader = DRF.create(feats_file) + if enroll_feats_file is not None and enroll_feats_file != feats_file: + enroll_feats_reader = DRF.create(enroll_feats_file) + else: + enroll_feats_reader = test_feats_reader + + enroll_map = EnrollmentMap.load(enroll_map_file) + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() + + if num_enroll_parts > 1 or num_test_parts > 1: + ndx = ndx.split( + enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts + ) + + enroll_map = enroll_map.filter(items=ndx.model_set) + x_e = enroll_feats_reader.read(enroll_map["segmentid"], squeeze=True) + x_t = test_feats_reader.read(ndx.seg_set, squeeze=True) + return enroll_map, ndx, x_e, x_t + + +def load_cohort_data(segments_file, feats_file): + segments = SegmentSet.load(segments_file) + feats_reader = DRF.create(feats_file) + x = feats_reader.read(segments["id"], squeeze=True) + return segments, x + + +def eval_backend( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + preproc_file, + plda_file, + llr_method, + score_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + cohort_segments_file, + cohort_feats_file, + cohort_nbest, + avg_cohort_by, +): + logging.info("loading data") + enroll_map, ndx, x_e, x_t = load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + enroll_set, enroll_ids = np.unique(enroll_map["id"], return_inverse=True) + if len(enroll_map["id"]) == len(enroll_set): + is_Nvs1 = False + else: + is_Nvs1 = True + + t1 = time.time() + + if preproc_file is not None: + logging.info("Loading Preprocessor") + preprocessor = TransformList.load(preproc_file) + x_e = preprocessor(x_e) + x_t = preprocessor(x_t) + if llr_method == PLDALLRNvsMMethod.vavg and isinstance( + preprocessor.transforms[-1], LNorm + ): + llr_method = PLDALLRNvsMMethod.lnorm_vavg + + assert llr_method == PLDALLRNvsMMethod.lnorm_vavg, preprocessor.transforms + logging.info("Loading PLDA model") + plda_model = NPModel.auto_load(plda_file) + logging.info("computing score") + if is_Nvs1: + scores = plda_model.llr_Nvs1(x_e, x_t, ids1=enroll_ids, method=llr_method) + else: + scores = plda_model.llr_1vs1(x_e, x_t) + + dt = time.time() - t1 + num_trials = scores.shape[0] * scores.shape[1] + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + if cohort_segments_file is not None: + t1 = time.time() + cohort_segments, x_coh = load_cohort_data( + cohort_segments_file, cohort_feats_file + ) + if preproc_file is not None: + x_coh = preprocessor(x_coh) + + if avg_cohort_by is not None: + cohort_class = cohort_segments[avg_cohort_by] + _, cohort_ids = np.unique(cohort_class, return_inverse=True) + else: + cohort_ids = None + + logging.info("computing enroll vs cohort") + scores_enr_coh = plda_model.llr_NvsM( + x_e, x_coh, ids1=enroll_ids, ids2=cohort_ids, method=llr_method + ) + logging.info("computing cohort vs test") + scores_coh_test = plda_model.llr_Nvs1( + x_coh, x_t, ids1=cohort_ids, method=llr_method + ) + snorm = AdaptSNorm(cohort_nbest) + scores = snorm(scores, scores_coh_test, scores_enr_coh) + dt = time.time() - t1 + logging.info( + "s-norm elapsed time: %.2f s. 
elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + if num_enroll_parts > 1 or num_test_parts > 1: + score_file = Path(score_file) + new_suffix = f".{enroll_part_idx}.{test_part_idx}{score_file.suffix}" + score_file = score_file.with_suffix(new_suffix) + + logging.info("saving scores to %s", score_file) + # sort scores rows to match the ndx model_set order + sort_idx = [np.nonzero(enroll_set == e)[0][0] for e in ndx.model_set] + scores = scores[sort_idx] + scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) + scores.save(score_file) + + +def main(): + parser = ArgumentParser(description="Eval PLDA LLR with optional AS-Norm") + + parser.add_argument("--enroll-feats-file", default=None) + parser.add_argument("--feats-file", required=True) + parser.add_argument("--ndx-file", required=True) + parser.add_argument("--enroll-map-file", required=True) + parser.add_argument("--preproc-file", default=None) + parser.add_argument("--plda-file", required=True) + parser.add_argument( + "--llr-method", + default=PLDALLRNvsMMethod.vavg, + choices=PLDALLRNvsMMethod.choices(), + ) + parser.add_argument("--cohort-segments-file", default=None) + parser.add_argument("--cohort-feats-file", default=None) + parser.add_argument("--cohort-nbest", type=int, default=1000) + parser.add_argument( + "--avg-cohort-by", + default=None, + help="segments file column to average vectors from the same class", + ) + parser.add_argument("--score-file", required=True) + parser.add_argument( + "--enroll-part-idx", default=1, type=int, help="enroll part index" + ) + parser.add_argument( + "--num-enroll-parts", + default=1, + type=int, + help="""number of parts in which we divide the enroll + list to run evaluation in parallel""", + ) + parser.add_argument("--test-part-idx", default=1, type=int, help="test part index") + parser.add_argument( + "--num-test-parts", + default=1, + type=int, + help="""number of parts in which we divide the test list + to run evaluation in parallel""", + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_backend(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_verification_metrics.py b/hyperion/bin/eval_verification_metrics.py new file mode 100755 index 00000000..98fd37e2 --- /dev/null +++ b/hyperion/bin/eval_verification_metrics.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path + +import pandas as pd +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.np.metrics import VerificationEvaluator as VE + + +def eval_verification_metrics( + key_files, + score_files, + key_names, + score_names, + p_tar, + c_miss, + c_fa, + sparse, + output_file, +): + assert len(key_files) == len(key_names) + assert len(score_files) == len(score_names) + dfs = [] + for score_file, score_name in zip(score_files, score_names): + for key_file, key_name in zip(key_files, key_names): + logging.info("Evaluating %s - %s", score_name, key_name) + evaluator = VE( + key_file, + score_file, + p_tar, + c_miss, + c_fa, + key_name, + score_name, + sparse=sparse, + ) + df_ij = 
evaluator.compute_dcf_eer() + dfs.append(df_ij) + + df = pd.concat(dfs) + logging.info("saving results to %s", output_file) + output_file = Path(output_file) + output_file.parent.mkdir(exist_ok=True, parents=True) + sep = "\t" if output_file.suffix == ".tsv" else "," + df.to_csv(output_file, sep=sep, index=False, float_format="{:,.4f}".format) + + pd.options.display.float_format = "{:.4}".format + print(df.to_string(), flush=True) + + +def main(): + parser = ArgumentParser(description="Evaluate speaker verification metrics") + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--key-files", required=True, nargs="+") + parser.add_argument("--score-files", required=True, nargs="+") + parser.add_argument("--key-names", required=True, nargs="+") + parser.add_argument("--score-names", required=True, nargs="+") + parser.add_argument( + "--p-tar", + default=[0.05, 0.01, 0.005, 0.001], + nargs="+", + type=float, + help="target priors", + ) + parser.add_argument( + "--c-miss", default=None, nargs="+", type=float, help="cost of miss" + ) + parser.add_argument( + "--c-fa", default=None, nargs="+", type=float, help="cost of false alarm" + ) + parser.add_argument("--sparse", default=False, action=ActionYesNo) + parser.add_argument("--output-file", required=True) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + ) + + args = parser.parse_args() + kwargs = namespace_to_dict(args) + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + del kwargs["cfg"] + eval_verification_metrics(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py new file mode 100755 index 00000000..1baad913 --- /dev/null +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py @@ -0,0 +1,424 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import AudioWriter as AW +from hyperion.io import RandomAccessAudioReader as AR +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.adv_attacks import AttackFactory +from hyperion.torch.layers import LinBinCalibrator as Calibrator +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm +from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info +from hyperion.utils.list_utils import ismember + + +class MyModel(nn.Module): + def __init__( + self, feat_extractor, xvector_model, embed_layer=None, calibrator=None, sigma=0 + ): + super().__init__() + self.feat_extractor = feat_extractor + self.xvector_model = xvector_model + self.x_e = None + self.vad_t = None + self.embed_layer = embed_layer + self.calibrator = calibrator + self.sigma = sigma + + def forward(self, s_t): + if self.sigma > 0: + s_t = s_t + self.sigma * 
torch.randn_like(s_t)
+
+        f_t, _ = self.feat_extractor(s_t)
+        if self.vad_t is not None:
+            n_vad_frames = len(self.vad_t)
+            n_feat_frames = f_t.shape[1]
+            if n_vad_frames > n_feat_frames:
+                self.vad_t = self.vad_t[:n_feat_frames]
+            elif n_vad_frames < n_feat_frames:
+                f_t = f_t[:, :n_vad_frames]
+
+            f_t = f_t[:, self.vad_t]
+
+        f_t = f_t.transpose(1, 2).contiguous()
+        x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer)
+        x_t = l2_norm(x_t)
+        x_e = l2_norm(self.x_e)
+        score = torch.sum(x_e * x_t, dim=-1)
+        if self.calibrator is not None:
+            score = self.calibrator(score)
+
+        return score
+
+
+def init_device(use_gpu):
+    set_float_cpu("float32")
+    num_gpus = 1 if use_gpu else 0
+    logging.info("initializing devices num_gpus={}".format(num_gpus))
+    device = open_device(num_gpus=num_gpus)
+    return device
+
+
+def init_feats(**kwargs):
+    feat_args = AF.filter_args(**kwargs["feats"])
+    logging.info("feat args={}".format(feat_args))
+    logging.info("initializing feature extractor")
+    feat_extractor = AF(trans=False, **feat_args)
+    logging.info("feat-extractor={}".format(feat_extractor))
+    feat_extractor.eval()
+    return feat_extractor
+
+
+def load_model(model_path):
+    logging.info("loading model {}".format(model_path))
+    model = TML.load(model_path)
+    logging.info("xvector-model={}".format(model))
+    model.eval()
+    return model
+
+
+def load_calibrator(cal_file, threshold):
+    logging.info("loading calibration params {}".format(cal_file))
+    lr = LR.load(cal_file)
+    # subtracting the threshold here will put the decision threshold at 0
+    # some attacks use thr=0 to decide if the attack is successful
+    calibrator = Calibrator(lr.A[0, 0], lr.b[0] - threshold)
+    calibrator.eval()
+    return calibrator
+
+
+def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts):
+    r = DRF.create(v_file)
+    enroll = Utt2Info.load(enroll_file)
+    key = TrialKey.load(key_file)
+    if num_seg_parts > 1:
+        key = key.split(1, 1, seg_part_idx, num_seg_parts)
+
+    x_e = r.read(enroll.key, squeeze=True)
+    f, idx = ismember(key.model_set, enroll.info)
+    assert np.all(f)
+    x_e = x_e[idx]
+    return key, x_e
+
+
+def eval_cosine_scoring(
+    v_file,
+    key_file,
+    enroll_file,
+    test_wav_file,
+    vad_spec,
+    vad_path_prefix,
+    model_path,
+    embed_layer,
+    score_file,
+    stats_file,
+    cal_file,
+    threshold,
+    smooth_sigma,
+    max_test_length,
+    save_adv_wav,
+    save_adv_wav_path,
+    use_gpu,
+    seg_part_idx,
+    num_seg_parts,
+    **kwargs
+):
+    device = init_device(use_gpu)
+    feat_extractor = init_feats(**kwargs)
+    xvector_model = load_model(model_path)
+
+    calibrator = None
+    if cal_file is not None:
+        calibrator = load_calibrator(cal_file, threshold)
+
+    tar = torch.as_tensor([1], dtype=torch.float).to(device)
+    non = torch.as_tensor([0], dtype=torch.float).to(device)
+
+    logging.info("loading key and enrollment x-vectors")
+    key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts)
+    x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype())
+
+    audio_args = AR.filter_args(**kwargs)
+    audio_reader = AR(test_wav_file, **audio_args)
+    wav_scale = audio_reader.wav_scale
+
+    if save_adv_wav:
+        tar_audio_writer = AW(save_adv_wav_path + "/tar2non")
+        non_audio_writer = AW(save_adv_wav_path + "/non2tar")
+
+    smooth_sigma *= wav_scale
+    model = MyModel(
+        feat_extractor, xvector_model, embed_layer, calibrator, smooth_sigma
+    )
+    model.to(device)
+    model.eval()
+
+    attack_args = AttackFactory.filter_args(**kwargs["attack"])
+    extra_args = {
+        "eps_scale": wav_scale,
+        "range_min": -wav_scale,
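+        # NOTE (added comment): the attack operates on raw waveform samples,
+        # so range_min/range_max clip the perturbed signal to the sample range,
+        # and eps_scale maps epsilon values given on a normalized scale onto
+        # that range (wav_scale is typically 2**15 - 1 for 16-bit audio).
+        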
"range_max": wav_scale, + "loss": nn.functional.binary_cross_entropy_with_logits, + "time_dim": 1, + } + attack_args.update(extra_args) + logging.info("attacks args={}".format(attack_args)) + attack = AttackFactory.create(model, **attack_args) + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + scores = np.zeros((key.num_models, key.num_tests), dtype="float32") + attack_stats = pd.DataFrame( + columns=[ + "modelid", + "segmentid", + "snr", + "px", + "pn", + "x_l2", + "x_linf", + "n_l0", + "n_l2", + "n_linf", + "num_frames", + ] + ) + for j in range(key.num_tests): + t1 = time.time() + logging.info("scoring test utt %s", key.seg_set[j]) + s, fs = audio_reader.read([key.seg_set[j]]) + s = s[0] + fs = fs[0] + if max_test_length is not None: + max_samples = int(fs * max_test_length) + if len(s) > max_samples: + s = s[:max_samples] + + s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) + + if vad_spec is not None: + vad = v_reader.read([key.seg_set[j]])[0] + tot_frames = len(vad) + speech_frames = np.sum(vad) + vad = torch.tensor(vad, dtype=torch.bool).to(device) + model.vad_t = vad + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) + ) + + t2 = time.time() + + trial_time = 0 + num_trials = 0 + for i in range(key.num_models): + if key.tar[i, j] or key.non[i, j]: + t3 = time.time() + model.x_e = x_e[i : i + 1].to(device) + if key.tar[i, j]: + if attack.targeted: + t = non + else: + t = tar + else: + if attack.targeted: + t = tar + else: + t = non + + s_adv = attack.generate(s, t) + with torch.no_grad(): + # we add the threshold back here to make sure the scores are well calibrated + scores[i, j] = model(s_adv) + threshold + + t4 = time.time() + trial_time += t4 - t3 + num_trials += 1 + + s_adv = s_adv.detach() + stats_ij = compute_stats_adv_attack(s, s_adv) + stats_ij = [stat.detach().cpu().numpy()[0] for stat in stats_ij] + attack_stats = attack_stats.append( + { + "modelid": key.model_set[i], + "segmentid": key.seg_set[j], + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": s.shape[-1], + }, + ignore_index=True, + ) + + # logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) + if save_adv_wav: + s_adv = s_adv.cpu().numpy()[0] + trial_name = "%s-%s" % (key.model_set[i], key.seg_set[j]) + if key.tar[i, j] and scores[i, j] < threshold: + tar_audio_writer.write(trial_name, s_adv, fs) + elif key.non[i, j] and scores[i, j] > threshold: + non_audio_writer.write(trial_name, s_adv, fs) + + trial_time /= num_trials + t7 = time.time() + logging.info( + ( + "utt %s total-time=%.3f read-time=%.4f trial-time=%.4f n_trials=%d " + "rt-factor=%.5f" + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) + + if num_seg_parts > 1: + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + stats_file = "%s-%03d-%03d" % (stats_file, 1, seg_part_idx) + logging.info("saving scores to %s", score_file) + s = TrialScores( + key.model_set, key.seg_set, scores, score_mask=np.logical_or(key.tar, key.non) + ) + s.save_txt(score_file) + + logging.info("saving stats to %s", stats_file) + attack_stats.to_csv(stats_file) + + +def 
main(): + parser = ArgumentParser( + description="Eval cosine-scoring given enroll x-vector and test wave" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--test-wav-file", required=True) + + AR.add_class_args(parser) + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + + AttackFactory.add_class_args(parser, prefix="attack") + + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) + parser.add_argument( + "--num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument( + "--save-adv-wav", + default=False, + action="store_true", + help="save adversarial signals to disk", + ) + parser.add_argument( + "--save-adv-wav-path", default=None, help="output path of adv signals" + ) + + # parser.add_argument('--save-adv-wav-tar-thr', + # default=0.75, type=float, + # help='min score to save signal from attack that makes non-tar into tar') + + # parser.add_argument('--save-adv-wav-non-thr', + # default=-0.75, type=float, + # help='max score to save signal from attack that makes tar into non-tar') + + parser.add_argument( + "--stats-file", default=None, help="output path of to save stats of adv signals" + ) + + parser.add_argument("--cal-file", default=None, help="score calibration file") + parser.add_argument("--threshold", default=0, type=float, help="decision threshold") + parser.add_argument( + "--smooth-sigma", default=0, type=float, help="sigma for smoothing" + ) + parser.add_argument( + "--max-test-length", + default=None, + type=float, + help=( + "maximum length (secs) for the test side, " + "this is to avoid GPU memory errors" + ), + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py new file mode 100755 index 00000000..3e4e9229 --- /dev/null +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py @@ -0,0 +1,498 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +import sys +import time + +# [Added Sonal May21] +from pathlib import Path + +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs 
import config_logger, float_cpu, set_float_cpu
+from hyperion.io import AudioWriter as AW
+from hyperion.io import RandomAccessAudioReader as AR
+from hyperion.io import RandomAccessDataReaderFactory as DRF
+from hyperion.io import VADReaderFactory as VRF
+from hyperion.np.classifiers import BinaryLogisticRegression as LR
+from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.adv_attacks import AttackFactory
+from hyperion.torch.adv_defenses.wave_gan_white import WaveGANDefender
+from hyperion.torch.layers import LinBinCalibrator as Calibrator
+from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.utils import open_device
+from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
+from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.list_utils import ismember
+
+torch.backends.cudnn.enabled = False
+
+
+class MyModel(nn.Module):
+    def __init__(
+        self,
+        feat_extractor,
+        xvector_model,
+        embed_layer=None,
+        calibrator=None,
+        sigma=0,
+        smoothing_after_wavegan=None,
+        wave_gan_defender=None,
+        wav_scale=2**15 - 1,
+    ):
+        super().__init__()
+        self.feat_extractor = feat_extractor
+        self.xvector_model = xvector_model
+        self.x_e = None
+        self.vad_t = None
+        self.embed_layer = embed_layer
+        self.calibrator = calibrator
+        self.sigma = sigma
+        self.smoothing_after_wavegan = smoothing_after_wavegan
+        self.wave_gan_defender = wave_gan_defender
+        self.wav_scale = wav_scale
+        self.apply_wavegan = wave_gan_defender is not None
+
+    def forward(self, s_t):
+        # Pre-processing defense: wavegan + smoothing [Added Sonal May21]
+        s_t = s_t / self.wav_scale
+        if self.smoothing_after_wavegan:
+            if self.apply_wavegan:
+                s_t = self.wave_gan_defender(s_t)
+            if self.sigma > 0:
+                s_t = s_t + self.sigma * torch.randn_like(s_t)
+        else:
+            if self.sigma > 0:
+                s_t = s_t + self.sigma * torch.randn_like(s_t)
+            if self.apply_wavegan:
+                s_t = self.wave_gan_defender(s_t)
+
+        s_t = self.wav_scale * s_t
+        # End of pre-processing defense
+
+        f_t, _ = self.feat_extractor(s_t)
+        if self.vad_t is not None:
+            n_vad_frames = len(self.vad_t)
+            n_feat_frames = f_t.shape[1]
+            if n_vad_frames > n_feat_frames:
+                self.vad_t = self.vad_t[:n_feat_frames]
+            elif n_vad_frames < n_feat_frames:
+                f_t = f_t[:, :n_vad_frames]
+
+            f_t = f_t[:, self.vad_t]
+
+        f_t = f_t.transpose(1, 2).contiguous()
+        x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer)
+        x_t = l2_norm(x_t)
+        x_e = l2_norm(self.x_e)
+        score = torch.sum(x_e * x_t, dim=-1)
+        if self.calibrator is not None:
+            score = self.calibrator(score)
+
+        return score
+
+
+def fix_out_of_memory(model, tensors):
+    for p in model.parameters():
+        if p.grad is not None:
+            del p.grad  # free some memory
+
+    for tensor in tensors:
+        if tensor.grad is not None:
+            del tensor.grad
+
+    torch.cuda.empty_cache()
+
+
+def init_device(use_gpu):
+    set_float_cpu("float32")
+    num_gpus = 1 if use_gpu else 0
+    logging.info("initializing devices num_gpus={}".format(num_gpus))
+    device = open_device(num_gpus=num_gpus)
+    return device
+
+
+def init_feats(**kwargs):
+    feat_args = AF.filter_args(**kwargs["feats"])
+    logging.info("feat args={}".format(feat_args))
+    logging.info("initializing feature extractor")
+    feat_extractor = AF(trans=False, **feat_args)
+    logging.info("feat-extractor={}".format(feat_extractor))
+    feat_extractor.eval()
+    return feat_extractor
+
+
+def load_model(model_path):
+    logging.info("loading model {}".format(model_path))
+    model = TML.load(model_path)
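+    # NOTE (added comment): TML (TorchModelLoader) restores both architecture
+    # and weights from the checkpoint, so the script does not need to know the
+    # x-vector variant in advance.
+    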
logging.info("xvector-model={}".format(model)) + model.eval() + return model + + +def load_calibrator(cal_file, threshold): + logging.info("loading calibration params {}".format(cal_file)) + lr = LR.load(cal_file) + # subting the threshold here will put the decision threshold in 0 + # some attacks use thr=0 to decide if the attack is succesful + calibrator = Calibrator(lr.A[0, 0], lr.b[0] - threshold) + calibrator.eval() + return calibrator + + +def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): + r = DRF.create(v_file) + enroll = Utt2Info.load(enroll_file) + key = TrialKey.load(key_file) + if num_seg_parts > 1: + key = key.split(1, 1, seg_part_idx, num_seg_parts) + + x_e = r.read(enroll.key, squeeze=True) + f, idx = ismember(key.model_set, enroll.info) + assert np.all(f) + x_e = x_e[idx] + return key, x_e + + +def eval_cosine_scoring_wavegan( + v_file, + key_file, + enroll_file, + test_wav_file, + vad_spec, + vad_path_prefix, + model_path, + embed_layer, + score_file, + stats_file, + cal_file, + threshold, + smooth_sigma, + max_test_length, + save_adv_wav, + save_adv_wav_path, + use_gpu, + seg_part_idx, + num_seg_parts, + smoothing_after_wavegan, + wave_gan_root_dir, + wave_gan_model_ckpt, + **kwargs +): + device = init_device(use_gpu) + feat_extractor = init_feats(**kwargs) + + wave_gan_defender = WaveGANDefender( + Path(wave_gan_root_dir), Path(wave_gan_model_ckpt) + ) + xvector_model = load_model(model_path) + + calibrator = None + if cal_file is not None: + calibrator = load_calibrator(cal_file, threshold) + + tar = torch.as_tensor([1], dtype=torch.float).to(device) + non = torch.as_tensor([0], dtype=torch.float).to(device) + + logging.info("loading key and enrollment x-vectors") + key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) + x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype()) + + audio_args = AR.filter_args(**kwargs) + audio_reader = AR(test_wav_file, **audio_args) + wav_scale = audio_reader.wav_scale + + if save_adv_wav: + tar_audio_writer = AW(save_adv_wav_path + "/tar2non") + non_audio_writer = AW(save_adv_wav_path + "/non2tar") + + model = MyModel( + feat_extractor, + xvector_model, + embed_layer, + calibrator, + smooth_sigma, + smoothing_after_wavegan, + wave_gan_defender, + wav_scale, + ) + model.to(device) + model.eval() + + attack_args = AttackFactory.filter_args(**kwargs["attack"]) + extra_args = { + "eps_scale": wav_scale, + "range_min": -wav_scale, + "range_max": wav_scale, + "loss": nn.functional.binary_cross_entropy_with_logits, + "time_dim": 1, + } + attack_args.update(extra_args) + logging.info("attacks args={}".format(attack_args)) + attack = AttackFactory.create(model, **attack_args) + if vad_spec is not None: + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + scores = np.zeros((key.num_models, key.num_tests), dtype="float32") + attack_stats = pd.DataFrame( + columns=[ + "modelid", + "segmentid", + "snr", + "px", + "pn", + "x_l2", + "x_linf", + "n_l0", + "n_l2", + "n_linf", + "num_frames", + ] + ) + + for j in range(key.num_tests): + t1 = time.time() + logging.info("scoring test utt %s" % (key.seg_set[j])) + s, fs = audio_reader.read([key.seg_set[j]]) + s = s[0] + fs = fs[0] + + if max_test_length is not None: + max_samples = int(fs * max_test_length) + if len(s) > max_samples: + s = s[:max_samples] + + s_cpu = s[None, :] + s = torch.as_tensor(s_cpu, dtype=torch.get_default_dtype(), device=device) + + if vad_spec is not None: + 
vad = v_reader.read([key.seg_set[j]])[0] + tot_frames = len(vad) + speech_frames = np.sum(vad) + vad = torch.tensor(vad, dtype=torch.bool).to(device) + model.vad_t = vad + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames", + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) + + t2 = time.time() + + trial_time = 0 + num_trials = 0 + for i in range(key.num_models): + if key.tar[i, j] or key.non[i, j]: + t3 = time.time() + model.x_e = x_e[i : i + 1].to(device) + if key.tar[i, j]: + if attack.targeted: + t = non + else: + t = tar + else: + if attack.targeted: + t = tar + else: + t = non + + s_adv = attack.generate(s, t) + with torch.no_grad(): + scores[i, j] = model(s_adv) + threshold + + t4 = time.time() + trial_time += t4 - t3 + num_trials += 1 + + s_adv = s_adv.detach() + stats_ij = compute_stats_adv_attack(s, s_adv) + stats_ij = [stat.detach().cpu().numpy()[0] for stat in stats_ij] + attack_stats = attack_stats.append( + { + "modelid": key.model_set[i], + "segmentid": key.seg_set[j], + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": s.shape[-1], + }, + ignore_index=True, + ) + + # logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) + if save_adv_wav: + s_adv = s_adv.cpu().numpy()[0] + trial_name = "%s-%s" % (key.model_set[i], key.seg_set[j]) + if key.tar[i, j] and scores[i, j] < threshold: + tar_audio_writer.write(trial_name, s_adv, fs) + elif key.non[i, j] and scores[i, j] > threshold: + non_audio_writer.write(trial_name, s_adv, fs) + + trial_time /= num_trials + t7 = time.time() + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.5f" + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) + + if num_seg_parts > 1: + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + stats_file = "%s-%03d-%03d" % (stats_file, 1, seg_part_idx) + logging.info("saving scores to %s" % (score_file)) + s = TrialScores( + key.model_set, key.seg_set, scores, score_mask=np.logical_or(key.tar, key.non) + ) + s.save_txt(score_file) + + logging.info("saving stats to %s" % (stats_file)) + attack_stats.to_csv(stats_file) + + +def main(): + parser = ArgumentParser( + description="Eval cosine-scoring given enroll x-vector and test wave" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--test-wav-file", required=True) + + AR.add_class_args(parser) + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + + AttackFactory.add_class_args(parser, prefix="attack") + + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) + 
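    # NOTE (added comment): with --num-seg-parts N > 1, each job writes its
+    # scores to "<score-file>-001-<part-idx>"; the per-part score files can be
+    # merged once all jobs have finished.
+    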
parser.add_argument(
+        "--num-seg-parts",
+        default=1,
+        type=int,
+        help=(
+            "number of parts in which we divide the test list "
+            "to run evaluation in parallel"
+        ),
+    )
+
+    parser.add_argument("--score-file", dest="score_file", required=True)
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+
+    parser.add_argument(
+        "--save-adv-wav",
+        default=False,
+        action="store_true",
+        help="save adversarial signals to disk",
+    )
+    parser.add_argument(
+        "--save-adv-wav-path", default=None, help="output path of adv signals"
+    )
+
+    # parser.add_argument('--save-adv-wav-tar-thr',
+    #                     default=0.75, type=float,
+    #                     help='min score to save signal from attack that makes non-tar into tar')
+
+    # parser.add_argument('--save-adv-wav-non-thr',
+    #                     default=-0.75, type=float,
+    #                     help='max score to save signal from attack that makes tar into non-tar')
+
+    parser.add_argument(
+        "--stats-file", default=None, help="output path to save stats of adv signals"
+    )
+
+    parser.add_argument("--cal-file", default=None, help="score calibration file")
+    parser.add_argument("--threshold", default=0, type=float, help="decision threshold")
+    parser.add_argument(
+        "--smooth-sigma", default=0, type=float, help="sigma for smoothing"
+    )
+    parser.add_argument(
+        "--max-test-length",
+        default=5,
+        type=float,
+        help=(
+            "maximum length (secs) for the test side, "
+            "this is to avoid GPU memory errors"
+        ),
+    )
+
+    # Defense: WaveGAN specific arguments [Added Sonal May21]
+    parser.add_argument(
+        "--smoothing-after-wavegan",
+        default=False,
+        action="store_true",
+        help=(
+            "if true, smoothing is applied after the wavegan defense, "
+            "otherwise before"
+        ),
+    )
+
+    parser.add_argument(
+        "--wave-gan-root-dir", default=None, help="WaveGAN model root directory"
+    )
+    parser.add_argument(
+        "--wave-gan-model-ckpt", default=None, help="WaveGAN model checkpoint"
+    )
+
+    args = parser.parse_args()
+    config_logger(args.verbose)
+    del args.verbose
+    logging.debug(args)
+
+    eval_cosine_scoring_wavegan(**namespace_to_dict(args))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py
new file mode 100755
index 00000000..781cdbdf
--- /dev/null
+++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py
@@ -0,0 +1,441 @@
+#!/usr/bin/env python
+"""
+ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import logging
+import os
+import sys
+import time
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+from art.estimators.classification import PyTorchClassifier
+from jsonargparse import (
+    ActionConfigFile,
+    ActionParser,
+    ArgumentParser,
+    namespace_to_dict,
+)
+
+from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
+from hyperion.io import AudioWriter as AW
+from hyperion.io import RandomAccessAudioReader as AR
+from hyperion.io import RandomAccessDataReaderFactory as DRF
+from hyperion.io import VADReaderFactory as VRF
+from hyperion.np.classifiers import BinaryLogisticRegression as LR
+from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.adv_attacks.art_attack_factory import (
+    ARTAttackFactory as AttackFactory,
+)
+from hyperion.torch.layers import LinBinCalibrator as Calibrator
+from hyperion.torch.narchs import AudioFeatsMVN as AF
+from 
hyperion.torch.utils import open_device +from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm +from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info +from hyperion.utils.list_utils import ismember + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def init_feats(**kwargs): + feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=False, **feat_args) + logging.info("feat-extractor={}".format(feat_extractor)) + feat_extractor.eval() + return feat_extractor + + +def load_model(model_path): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("xvector-model={}".format(model)) + model.eval() + return model + + +def load_calibrator(cal_file): + logging.info("loading calibration params {}".format(cal_file)) + lr = LR.load(cal_file) + calibrator = Calibrator(lr.A[0, 0], lr.b[0]) + calibrator.eval() + return calibrator + + +def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): + r = DRF.create(v_file) + enroll = Utt2Info.load(enroll_file) + key = TrialKey.load(key_file) + if num_seg_parts > 1: + key = key.split(1, 1, seg_part_idx, num_seg_parts) + + x_e = r.read(enroll.key, squeeze=True) + f, idx = ismember(key.model_set, enroll.info) + assert np.all(f) + x_e = x_e[idx] + return key, x_e + + +class MyModel(nn.Module): + def __init__( + self, + feat_extractor, + xvector_model, + embed_layer=None, + calibrator=None, + threshold=0, + ): + super().__init__() + self.feat_extractor = feat_extractor + self.xvector_model = xvector_model + self.x_e = None + self.vad_t = None + self.embed_layer = embed_layer + self.calibrator = calibrator + self.threshold = threshold + + def forward(self, s_t): + if s_t.dim() == 4: + # this is for attacks that only work in 4D inputs + s_t = s_t[0, 0] + + f_t = s_t + f_t, _ = self.feat_extractor(s_t) + if self.vad_t is not None: + n_vad_frames = len(self.vad_t) + n_feat_frames = f_t.shape[1] + if n_vad_frames > n_feat_frames: + self.vad_t = self.vad_t[:n_feat_frames] + elif n_vad_frames < n_feat_frames: + f_t = f_t[:, :n_vad_frames] + + f_t = f_t[:, self.vad_t] + + f_t = f_t.transpose(1, 2).contiguous() + x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer) + if self.x_e is None: + # this is for auto-pgd, when it runs a dummy evaluation + self.x_e = x_t + + x_t = l2_norm(x_t) + x_e = l2_norm(self.x_e) + tar_score = torch.sum(x_e * x_t, dim=-1, keepdim=True) + if self.calibrator is not None: + score = self.calibrator(tar_score) + + non_score = self.threshold + 0 * tar_score + score = torch.cat((non_score, tar_score), dim=-1) # .unsqueeze(0) + return score + + +def eval_cosine_scoring( + v_file, + key_file, + enroll_file, + test_wav_file, + vad_spec, + vad_path_prefix, + model_path, + embed_layer, + score_file, + stats_file, + cal_file, + threshold, + save_adv_wav, + save_adv_wav_path, + max_test_length, + use_gpu, + seg_part_idx, + num_seg_parts, + **kwargs +): + device_type = "gpu" if use_gpu else "cpu" + device = init_device(use_gpu) + feat_extractor = init_feats(**kwargs) + xvector_model = load_model(model_path) + + calibrator = None + if cal_file is not None: + calibrator = load_calibrator(cal_file) + + model = MyModel( + feat_extractor, xvector_model, embed_layer, 
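+        # NOTE (added comment): threshold becomes the constant "non-target"
+        # logit that MyModel.forward concatenates with the cosine score, giving
+        # ART the 2-class output it expects.
+        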
calibrator, threshold=threshold + ) + model.to(device) + model.eval() + + tar = np.asarray([1], dtype=int) + non = np.asarray([0], dtype=int) + + logging.info("loading key and enrollment x-vectors") + key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) + x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype()) + + audio_args = AR.filter_args(**kwargs) + audio_reader = AR(test_wav_file, **audio_args) + wav_scale = audio_reader.wav_scale + + if save_adv_wav: + tar_audio_writer = AW(save_adv_wav_path + "/tar2non") + non_audio_writer = AW(save_adv_wav_path + "/non2tar") + + attack_args = AttackFactory.filter_args(**kwargs["attack"]) + extra_args = {"eps_scale": wav_scale} + attack_args.update(extra_args) + logging.info("attack-args={}".format(attack_args)) + + if vad_spec is not None: + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + scores = np.zeros((key.num_models, key.num_tests), dtype="float32") + attack_stats = pd.DataFrame( + columns=[ + "modelid", + "segmentid", + "snr", + "px", + "pn", + "x_l2", + "x_linf", + "n_l0", + "n_l2", + "n_linf", + "num_frames", + ] + ) + + for j in range(key.num_tests): + t1 = time.time() + logging.info("scoring test utt %s", key.seg_set[j]) + s, fs = audio_reader.read([key.seg_set[j]]) + s = s[0] + fs = fs[0] + + if max_test_length is not None: + max_samples = int(fs * max_test_length) + if len(s) > max_samples: + s = s[:max_samples] + + s = s[None, :].astype("float32", copy=False) + s_tensor = torch.as_tensor(s, dtype=torch.get_default_dtype()).to(device) + + if vad_spec is not None: + vad = v_reader.read([key.seg_set[j]])[0] + tot_frames = len(vad) + speech_frames = np.sum(vad) + vad = torch.tensor(vad, dtype=torch.bool).to(device) + model.vad_t = vad + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames", + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) + + t2 = time.time() + + trial_time = 0 + num_trials = 0 + model_art = PyTorchClassifier( + model=model, + loss=nn.CrossEntropyLoss(), + optimizer=None, + input_shape=(s.shape[1],), + nb_classes=2, + clip_values=(-wav_scale, wav_scale), + device_type=device_type, + ) + + attack_args["num_samples"] = s.shape[-1] + attack = AttackFactory.create(model_art, **attack_args) + # s = s[None, None, :, :] + for i in range(key.num_models): + if key.tar[i, j] or key.non[i, j]: + t3 = time.time() + model.x_e = x_e[i : i + 1].to(device) + if key.tar[i, j]: + if attack.targeted: + t = non + else: + t = tar + else: + if attack.targeted: + t = tar + else: + t = non + + s_adv = attack.generate(s, t) + # s_adv = s_adv[0, 0] + s_adv = torch.from_numpy(s_adv).to(device) + with torch.no_grad(): + scores[i, j] = model(s_adv).cpu().numpy()[0, 1] + + t4 = time.time() + trial_time += t4 - t3 + num_trials += 1 + + s_adv = s_adv.detach() + stats_ij = compute_stats_adv_attack(s_tensor, s_adv) + stats_ij = [stat.detach().cpu().numpy()[0] for stat in stats_ij] + attack_stats = attack_stats.append( + { + "modelid": key.model_set[i], + "segmentid": key.seg_set[j], + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": s.shape[-1], + }, + ignore_index=True, + ) + + # logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) + if save_adv_wav: + s_adv = s_adv.cpu().numpy()[0] + trial_name = "%s-%s" 
% (key.model_set[i], key.seg_set[j]) + if key.tar[i, j] and scores[i, j] < threshold: + tar_audio_writer.write(trial_name, s_adv, fs) + elif key.non[i, j] and scores[i, j] > threshold: + non_audio_writer.write(trial_name, s_adv, fs) + + del attack + del model_art + trial_time /= num_trials + t7 = time.time() + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.5f" + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) + + if num_seg_parts > 1: + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + stats_file = "%s-%03d-%03d" % (stats_file, 1, seg_part_idx) + logging.info("saving scores to %s", score_file) + s = TrialScores( + key.model_set, key.seg_set, scores, score_mask=np.logical_or(key.tar, key.non) + ) + s.save_txt(score_file) + + logging.info("saving stats to %s", stats_file) + attack_stats.to_csv(stats_file) + + +def main(): + parser = ArgumentParser( + description=( + "Eval cosine-scoring given enroll x-vector " + "and adversarial test wave from ART" + ) + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--test-wav-file", required=True) + + AR.add_class_args(parser) + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + + AttackFactory.add_class_args(parser, prefix="attack") + + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) + parser.add_argument( + "--num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument( + "--save-adv-wav", + default=False, + action="store_true", + help="save adversarial signals to disk", + ) + parser.add_argument( + "--save-adv-wav-path", default=None, help="output path of adv signals" + ) + + parser.add_argument( + "--stats-file", default=None, help="output path of to save stats of adv signals" + ) + + parser.add_argument("--cal-file", default=None, help="score calibration file") + parser.add_argument("--threshold", default=0, type=float, help="decision threshold") + parser.add_argument( + "--max-test-length", + default=None, + type=float, + help=( + "maximum length (secs) for the test side, " + "this is to avoid GPU memory errors" + ), + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py new file mode 100755 index 00000000..2ebb7e3d --- /dev/null +++ 
b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import torch +import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import RandomAccessAudioReader as AR +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.layers import LinBinCalibrator as Calibrator +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.torch.utils.misc import l2_norm +from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info +from hyperion.utils.list_utils import ismember + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def init_feats(device, **kwargs): + feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=False, **feat_args) + logging.info("feat-extractor={}".format(feat_extractor)) + feat_extractor.eval() + feat_extractor.to(device) + return feat_extractor + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("xvector-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def load_calibrator(cal_file, device): + logging.info("loading calibration params {}".format(cal_file)) + lr = LR.load(cal_file) + calibrator = Calibrator(lr.A[0, 0], lr.b[0]) + calibrator.to(device) + calibrator.eval() + return calibrator + + +def read_data(v_file, ndx_file, enroll_file, seg_part_idx, num_seg_parts): + r = DRF.create(v_file) + enroll = Utt2Info.load(enroll_file) + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() + + if num_seg_parts > 1: + ndx = ndx.split(1, 1, seg_part_idx, num_seg_parts) + + x_e = r.read(enroll.key, squeeze=True) + + f, idx = ismember(ndx.model_set, enroll.info) + + assert np.all(f) + x_e = x_e[idx] + + return ndx, x_e + + +def eval_cosine_scoring( + v_file, + ndx_file, + enroll_file, + test_wav_file, + vad_spec, + vad_path_prefix, + model_path, + embed_layer, + score_file, + cal_file, + max_test_length, + use_gpu, + seg_part_idx, + num_seg_parts, + **kwargs +): + device = init_device(use_gpu) + feat_extractor = init_feats(device, **kwargs) + model = load_model(model_path, device) + + calibrator = None + if cal_file is not None: + calibrator = load_calibrator(cal_file, device) + + logging.info("loading ndx and enrollment x-vectors") + ndx, y_e = read_data(v_file, ndx_file, enroll_file, seg_part_idx, num_seg_parts) + + audio_args = AR.filter_args(**kwargs) + audio_reader = AR(test_wav_file, **audio_args) + + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + scores = np.zeros((ndx.num_models, 
ndx.num_tests), dtype="float32") + with torch.no_grad(): + for j in range(ndx.num_tests): + t1 = time.time() + logging.info("scoring test utt %s", ndx.seg_set[j]) + s, fs = audio_reader.read([ndx.seg_set[j]]) + s = s[0] + fs = fs[0] + + if max_test_length is not None: + max_samples = int(fs * max_test_length) + if len(s) > max_samples: + s = s[:max_samples] + + t2 = time.time() + s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) + x_t, _ = feat_extractor(s) + t4 = time.time() + tot_frames = x_t.shape[1] + if vad_spec is not None: + vad = v_reader.read([ndx.seg_set[j]], num_frames=x_t.shape[1])[0] + vad = torch.tensor(vad, dtype=torch.bool).to(device) + x_t = x_t[:, vad] + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames", + ndx.seg_set[j], + x_t.shape[1], + tot_frames, + x_t.shape[1] / tot_frames * 100, + ) + + t5 = time.time() + x_t = x_t.transpose(1, 2).contiguous() + y_t = model.extract_embed(x_t, embed_layer=embed_layer) + y_t = l2_norm(y_t) + t6 = time.time() + + for i in range(ndx.num_models): + if ndx.trial_mask[i, j]: + y_e_i = torch.as_tensor( + y_e[i : i + 1], dtype=torch.get_default_dtype() + ).to(device) + y_e_i = l2_norm(y_e_i) + scores_ij = torch.sum(y_e_i * y_t, dim=-1) + if calibrator is None: + scores[i, j] = scores_ij + else: + scores[i, j] = calibrator(scores_ij) + + t7 = time.time() + num_trials = np.sum(ndx.trial_mask[:, j]) + trial_time = (t7 - t6) / num_trials + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.2f" + ), + ndx.seg_set[j], + t7 - t1, + t2 - t1, + t4 - t2, + t5 - t4, + t6 - t5, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) + + if num_seg_parts > 1: + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + logging.info("saving scores to %s", score_file) + s = TrialScores(ndx.model_set, ndx.seg_set, scores, score_mask=ndx.trial_mask) + s.save_txt(score_file) + + +def main(): + parser = ArgumentParser( + description="Eval cosine-scoring given enroll x-vector and test wave" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--v-file", required=True) + parser.add_argument("--ndx-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--test-wav-file", required=True) + + AR.add_class_args(parser) + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="evaluate in gpu" + ) + + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) + parser.add_argument( + "--num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", required=True) + parser.add_argument("--cal-file", default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument( + "--max-test-length", + default=None, + type=float, + help=( + "maximum 
length (secs) for the test side, " + "this is to avoid GPU memory errors" + ), + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py new file mode 100755 index 00000000..5cd4b864 --- /dev/null +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py @@ -0,0 +1,441 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import AudioWriter as AW +from hyperion.io import RandomAccessAudioReader as AR +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.adv_attacks import AttackFactory +from hyperion.torch.layers import LinBinCalibrator as Calibrator +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm +from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info +from hyperion.utils.list_utils import ismember + + +class MyModel(nn.Module): + def __init__( + self, feat_extractor, xvector_model, embed_layer=None, calibrator=None + ): + super().__init__() + self.feat_extractor = feat_extractor + self.xvector_model = xvector_model + self.x_e = None + self.vad_t = None + self.embed_layer = embed_layer + self.calibrator = calibrator + + def forward(self, s_t): + f_t = s_t + f_t, _ = self.feat_extractor(s_t) + if self.vad_t is not None: + n_vad_frames = len(self.vad_t) + n_feat_frames = f_t.shape[1] + if n_vad_frames > n_feat_frames: + self.vad_t = self.vad_t[:n_feat_frames] + elif n_vad_frames < n_feat_frames: + f_t = f_t[:, :n_vad_frames] + + f_t = f_t[:, self.vad_t] + + f_t = f_t.transpose(1, 2).contiguous() + x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer) + x_t = l2_norm(x_t) + x_e = l2_norm(self.x_e) + score = torch.sum(x_e * x_t, dim=-1) + if self.calibrator is not None: + score = self.calibrator(score) + + return score + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def init_feats(**kwargs): + feat_args = AF.filter_args(**kwargs) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=False, **feat_args) + logging.info("feat-extractor={}".format(feat_extractor)) + feat_extractor.eval() + return feat_extractor + + +def load_model(model_path): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("xvector-model={}".format(model)) + model.freeze() + model.eval() + return model + + +def load_calibrator(cal_file, threshold): + 
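    """Load the linear score calibrator y = A * s + b and shift b by -threshold.
+
+    Example with hypothetical values: A=1.2, b=0.3, threshold=2.0 gives
+    y = 1.2 * s - 1.7, so the accept/reject boundary sits at y = 0, which is
+    what the attacks assume.
+    """
+    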
logging.info("loading calibration params {}".format(cal_file)) + lr = LR.load(cal_file) + # subting the threshold here will put the decision threshold in 0 + # some attacks use thr=0 to decide if the attack is succesful + calibrator = Calibrator(lr.A[0, 0], lr.b[0] - threshold) + calibrator.eval() + return calibrator + + +def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): + r = DRF.create(v_file) + enroll = Utt2Info.load(enroll_file) + key = TrialKey.load(key_file) + + if num_seg_parts > 1: + key = key.split(1, 1, seg_part_idx, num_seg_parts) + + x_e = r.read(enroll.key, squeeze=True) + + f, idx = ismember(key.model_set, enroll.info) + + assert np.all(f) + x_e = x_e[idx] + + return key, x_e + + +def eval_cosine_scoring( + v_file, + key_file, + enroll_file, + test_wav_file, + vad_spec, + vad_path_prefix, + transfer_v_file, + model_path, + transfer_model_path, + embed_layer, + score_file, + stats_file, + cal_file, + transfer_cal_file, + threshold, + max_test_length, + save_adv_wav, + save_adv_wav_path, + use_gpu, + seg_part_idx, + num_seg_parts, + **kwargs +): + device = init_device(use_gpu) + # load victim model + feat_extractor = init_feats(**kwargs["feats"]) + xvector_model = load_model(model_path) + calibrator = None + if cal_file is not None: + calibrator = load_calibrator(cal_file, 0) + + model = MyModel(feat_extractor, xvector_model, embed_layer, calibrator) + model.to(device) + model.eval() + + # load white-box model + tfeat_extractor = init_feats(**kwargs["transfer_feats"]) + xvector_tmodel = load_model(transfer_model_path) + tcalibrator = None + if transfer_cal_file is not None: + tcalibrator = load_calibrator(transfer_cal_file, threshold) + + tmodel = MyModel(tfeat_extractor, xvector_tmodel, embed_layer, tcalibrator) + tmodel.to(device) + tmodel.eval() + + tar = torch.as_tensor([1], dtype=torch.float).to(device) + non = torch.as_tensor([0], dtype=torch.float).to(device) + + logging.info("loading key and enrollment x-vectors") + key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) + x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype()) + + _, t_x_e = read_data( + transfer_v_file, key_file, enroll_file, seg_part_idx, num_seg_parts + ) + t_x_e = torch.as_tensor(t_x_e, dtype=torch.get_default_dtype()) + + audio_args = AR.filter_args(**kwargs) + audio_reader = AR(test_wav_file) + wav_scale = audio_reader.wav_scale + + if save_adv_wav: + tar_audio_writer = AW(save_adv_wav_path + "/tar2non") + non_audio_writer = AW(save_adv_wav_path + "/non2tar") + + attack_args = AttackFactory.filter_args(**kwargs["attack"]) + extra_args = { + "eps_scale": wav_scale, + "range_min": -wav_scale, + "range_max": wav_scale, + "loss": nn.functional.binary_cross_entropy_with_logits, + "time_dim": 1, + } + attack_args.update(extra_args) + logging.info("attacks args={}".format(attack_args)) + attack = AttackFactory.create(model, **attack_args) + + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + scores = np.zeros((key.num_models, key.num_tests), dtype="float32") + attack_stats = pd.DataFrame( + columns=[ + "modelid", + "segmentid", + "snr", + "px", + "pn", + "x_l2", + "x_linf", + "n_l0", + "n_l2", + "n_linf", + "num_frames", + ] + ) + for j in range(key.num_tests): + t1 = time.time() + logging.info("scoring test utt %s", key.seg_set[j]) + s, fs = audio_reader.read([key.seg_set[j]]) + s = s[0] + fs = fs[0] + + if max_test_length is not None: + max_samples = int(fs 
* max_test_length) + if len(s) > max_samples: + s = s[:max_samples] + + s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) + + if vad_spec is not None: + vad = v_reader.read([key.seg_set[j]])[0] + tot_frames = len(vad) + speech_frames = np.sum(vad) + vad = torch.as_tensor(vad.astype(bool, copy=False), dtype=torch.bool).to( + device + ) + model.vad_t = vad + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames", + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) + + t2 = time.time() + + trial_time = 0 + num_trials = 0 + for i in range(key.num_models): + if key.tar[i, j] or key.non[i, j]: + t3 = time.time() + model.x_e = x_e[i : i + 1].to(device) + tmodel.x_e = t_x_e[i : i + 1].to(device) + if key.tar[i, j]: + if attack.targeted: + t = non + else: + t = tar + else: + if attack.targeted: + t = tar + else: + t = non + + s_adv = attack.generate(s, t) + with torch.no_grad(): + scores[i, j] = model(s_adv) + + t4 = time.time() + trial_time += t4 - t3 + num_trials += 1 + + s_adv = s_adv.detach() + stats_ij = compute_stats_adv_attack(s, s_adv) + stats_ij = [stat.detach().cpu().numpy()[0] for stat in stats_ij] + attack_stats = attack_stats.append( + { + "modelid": key.model_set[i], + "segmentid": key.seg_set[j], + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": s.shape[-1], + }, + ignore_index=True, + ) + + # logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) + if save_adv_wav: + s_adv = s_adv.cpu().numpy()[0] + trial_name = "%s-%s" % (key.model_set[i], key.seg_set[j]) + if key.tar[i, j] and scores[i, j] < threshold: + tar_audio_writer.write(trial_name, s_adv, fs) + elif key.non[i, j] and scores[i, j] > threshold: + non_audio_writer.write(trial_name, s_adv, fs) + + trial_time /= num_trials + t7 = time.time() + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.2f" + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) + + if num_seg_parts > 1: + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + stats_file = "%s-%03d-%03d" % (stats_file, 1, seg_part_idx) + logging.info("saving scores to %s", score_file) + s = TrialScores( + key.model_set, key.seg_set, scores, score_mask=np.logical_or(key.tar, key.non) + ) + s.save_txt(score_file) + + logging.info("saving stats to %s", stats_file) + attack_stats.to_csv(stats_file) + + +def main(): + parser = ArgumentParser( + description=( + "Eval cosine-scoring given enroll x-vector and " + "adversarial test wave obtained from a different model" + ) + ) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--test-wav-file", required=True) + + parser.add_argument("--transfer-v-file", required=True) + + AR.add_class_args(parser) + AF.add_class_args(parser, prefix="feats") + AF.add_class_args(parser, prefix="transfer_feats") + + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument("--transfer-model-path", required=True) + parser.add_argument( + "--embed-layer", 
+ type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + + AttackFactory.add_class_args(parser, prefix="attack") + + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) + parser.add_argument( + "--num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument( + "--save-adv-wav", + default=False, + action="store_true", + help="save adversarial signals to disk", + ) + + parser.add_argument( + "--save-adv-wav-path", default=None, help="output path of adv signals" + ) + + parser.add_argument( + "--stats-file", default=None, help="output path of to save stats of adv signals" + ) + parser.add_argument("--cal-file", default=None, help="score calibration file") + parser.add_argument( + "--transfer-cal-file", + default=None, + help="score calibration file for transfer model", + ) + parser.add_argument("--threshold", default=0, type=float, help="decision threshold") + parser.add_argument( + "--max-test-length", + default=None, + type=float, + help=( + "maximum length (secs) for the test side, " + "this is to avoid GPU memory errors" + ), + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py new file mode 100755 index 00000000..7b8bc245 --- /dev/null +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py @@ -0,0 +1,467 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from art.classifiers import PyTorchClassifier +from art.estimators.classification import PyTorchClassifier +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import AudioWriter as AW +from hyperion.io import RandomAccessAudioReader as AR +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) +from hyperion.torch.layers import LinBinCalibrator as Calibrator +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm +from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info +from hyperion.utils.list_utils import ismember + + +class MyModel(nn.Module): + def __init__( + self, + feat_extractor, + xvector_model, + embed_layer=None, + 
calibrator=None, + threshold=0, + ): + super().__init__() + self.feat_extractor = feat_extractor + self.xvector_model = xvector_model + self.x_e = None + self.vad_t = None + self.embed_layer = embed_layer + self.calibrator = calibrator + self.threshold = threshold + + def forward(self, s_t): + f_t, _ = self.feat_extractor(s_t) + if self.vad_t is not None: + n_vad_frames = len(self.vad_t) + n_feat_frames = f_t.shape[1] + if n_vad_frames > n_feat_frames: + self.vad_t = self.vad_t[:n_feat_frames] + elif n_vad_frames < n_feat_frames: + f_t = f_t[:, :n_vad_frames] + + f_t = f_t[:, self.vad_t] + + f_t = f_t.transpose(1, 2).contiguous() + x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer) + x_t = l2_norm(x_t) + x_e = l2_norm(self.x_e) + tar_score = torch.sum(x_e * x_t, dim=-1, keepdim=True) + if self.calibrator is not None: + score = self.calibrator(tar_score) + + non_score = self.threshold + 0 * tar_score + score = torch.cat((non_score, tar_score), dim=-1) # .unsqueeze(0) + return score + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def init_feats(**kwargs): + feat_args = AF.filter_args(**kwargs) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=False, **feat_args) + logging.info("feat-extractor={}".format(feat_extractor)) + feat_extractor.eval() + return feat_extractor + + +def load_model(model_path): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("xvector-model={}".format(model)) + model.freeze() + model.eval() + return model + + +def load_calibrator(cal_file): + logging.info("loading calibration params {}".format(cal_file)) + lr = LR.load(cal_file) + calibrator = Calibrator(lr.A[0, 0], lr.b[0]) + calibrator.eval() + return calibrator + + +def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): + r = DRF.create(v_file) + enroll = Utt2Info.load(enroll_file) + key = TrialKey.load(key_file) + + if num_seg_parts > 1: + key = key.split(1, 1, seg_part_idx, num_seg_parts) + + x_e = r.read(enroll.key, squeeze=True) + + f, idx = ismember(key.model_set, enroll.info) + + assert np.all(f) + x_e = x_e[idx] + + return key, x_e + + +def eval_cosine_scoring( + v_file, + key_file, + enroll_file, + test_wav_file, + vad_spec, + vad_path_prefix, + transfer_v_file, + model_path, + transfer_model_path, + embed_layer, + score_file, + stats_file, + cal_file, + transfer_cal_file, + threshold, + max_test_length, + save_adv_wav, + save_adv_wav_path, + use_gpu, + seg_part_idx, + num_seg_parts, + **kwargs +): + device_type = "gpu" if use_gpu else "cpu" + device = init_device(use_gpu) + # load victim model + feat_extractor = init_feats(**kwargs["feats"]) + xvector_model = load_model(model_path) + calibrator = None + if cal_file is not None: + calibrator = load_calibrator(cal_file) + + model = MyModel( + feat_extractor, xvector_model, embed_layer, calibrator, threshold=threshold + ) + model.to(device) + model.eval() + + # load white-box model + tfeat_extractor = init_feats(**kwargs["transfer_feats"]) + xvector_tmodel = load_model(transfer_model_path) + tcalibrator = None + if transfer_cal_file is not None: + tcalibrator = load_calibrator(transfer_cal_file) + + tmodel = MyModel( + tfeat_extractor, xvector_tmodel, embed_layer, tcalibrator, threshold=threshold + ) + tmodel.to(device) + 
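    # NOTE (added comment): ART wraps this transfer (white-box) model to generate
+    # the adversarial audio below; the victim model above only scores it, so the
+    # run measures black-box transferability of the attack.
+    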
tmodel.eval() + + tar = np.asarray([1], dtype=int) + non = np.asarray([0], dtype=int) + + logging.info("loading key and enrollment x-vectors") + key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) + x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype()) + + _, t_x_e = read_data( + transfer_v_file, key_file, enroll_file, seg_part_idx, num_seg_parts + ) + t_x_e = torch.as_tensor(t_x_e, dtype=torch.get_default_dtype()) + + audio_args = AR.filter_args(**kwargs) + audio_reader = AR(test_wav_file) + wav_scale = audio_reader.wav_scale + + if save_adv_wav: + tar_audio_writer = AW(save_adv_wav_path + "/tar2non") + non_audio_writer = AW(save_adv_wav_path + "/non2tar") + + attack_args = AttackFactory.filter_args(**kwargs["attack"]) + extra_args = {"eps_scale": wav_scale} + attack_args.update(extra_args) + logging.info("attack-args={}".format(attack_args)) + + if vad_spec is not None: + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + scores = np.zeros((key.num_models, key.num_tests), dtype="float32") + attack_stats = pd.DataFrame( + columns=[ + "modelid", + "segmentid", + "snr", + "px", + "pn", + "x_l2", + "x_linf", + "n_l0", + "n_l2", + "n_linf", + "num_frames", + ] + ) + for j in range(key.num_tests): + t1 = time.time() + logging.info("scoring test utt %s" % (key.seg_set[j])) + s, fs = audio_reader.read([key.seg_set[j]]) + s = s[0] + fs = fs[0] + + if max_test_length is not None: + max_samples = int(fs * max_test_length) + if len(s) > max_samples: + s = s[:max_samples] + + s = s[None, :].astype("float32", copy=False) + s_tensor = torch.as_tensor(s, dtype=torch.get_default_dtype()).to(device) + + if vad_spec is not None: + vad = v_reader.read([key.seg_set[j]])[0] + tot_frames = len(vad) + speech_frames = np.sum(vad) + vad = torch.tensor(vad, dtype=torch.bool).to(device) + model.vad_t = vad + tmodel.vad_t = vad + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) + ) + + t2 = time.time() + + trial_time = 0 + num_trials = 0 + model_art = PyTorchClassifier( + model=tmodel, + loss=nn.CrossEntropyLoss(), + optimizer=None, + input_shape=[1, s.shape[1]], + nb_classes=2, + clip_values=(-wav_scale, wav_scale), + device_type=device_type, + ) + + attack_args["num_samples"] = s.shape[-1] + attack = AttackFactory.create(model_art, **attack_args) + for i in range(key.num_models): + if key.tar[i, j] or key.non[i, j]: + t3 = time.time() + model.x_e = x_e[i : i + 1].to(device) + tmodel.x_e = t_x_e[i : i + 1].to(device) + if key.tar[i, j]: + if attack.targeted: + t = non + else: + t = tar + else: + if attack.targeted: + t = tar + else: + t = non + + s_adv = attack.generate(s, t) + s_adv = torch.from_numpy(s_adv).to(device) + with torch.no_grad(): + scores[i, j] = model(s_adv).cpu().numpy()[0, 1] + + t4 = time.time() + trial_time += t4 - t3 + num_trials += 1 + + s_adv = s_adv.detach() + stats_ij = compute_stats_adv_attack(s_tensor, s_adv) + stats_ij = [stat.detach().cpu().numpy()[0] for stat in stats_ij] + attack_stats = attack_stats.append( + { + "modelid": key.model_set[i], + "segmentid": key.seg_set[j], + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": s.shape[-1], + }, + ignore_index=True, + ) + + # logging.info('min-max %f %f %f %f' % (torch.min(s), 
torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) + if save_adv_wav: + s_adv = s_adv.cpu().numpy()[0] + trial_name = "%s-%s" % (key.model_set[i], key.seg_set[j]) + if key.tar[i, j] and scores[i, j] < threshold: + tar_audio_writer.write(trial_name, s_adv, fs) + elif key.non[i, j] and scores[i, j] > threshold: + non_audio_writer.write(trial_name, s_adv, fs) + + del attack + del model_art + trial_time /= num_trials + t7 = time.time() + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.2f" + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) + + if num_seg_parts > 1: + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + stats_file = "%s-%03d-%03d" % (stats_file, 1, seg_part_idx) + logging.info("saving scores to %s" % (score_file)) + s = TrialScores( + key.model_set, key.seg_set, scores, score_mask=np.logical_or(key.tar, key.non) + ) + s.save_txt(score_file) + + logging.info("saving stats to %s" % (stats_file)) + attack_stats.to_csv(stats_file) + + +def main(): + parser = ArgumentParser( + description=( + "Eval cosine-scoring given enroll x-vector and " + "adversarial test wave obtained from a different model" + "using ART" + ) + ) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--test-wav-file", required=True) + + parser.add_argument("--transfer-v-file", required=True) + + AR.add_class_args(parser) + AF.add_class_args(parser, prefix="feats") + AF.add_class_args(parser, prefix="transfer_feats") + + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument("--transfer-model-path", required=True) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + + AttackFactory.add_class_args(parser, prefix="attack") + + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) + parser.add_argument( + "--num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument( + "--save-adv-wav", + default=False, + action="store_true", + help="save adversarial signals to disk", + ) + parser.add_argument( + "--save-adv-wav-path", default=None, help="output path of adv signals" + ) + parser.add_argument( + "--stats-file", default=None, help="output path of to save stats of adv signals" + ) + parser.add_argument("--cal-file", default=None, help="score calibration file") + parser.add_argument( + "--transfer-cal-file", + default=None, + help="score calibration file for transfer model", + ) + parser.add_argument("--threshold", default=0, type=float, help="decision threshold") + parser.add_argument( + "--max-test-length", + default=None, + type=float, + help=( + "maximum length (secs) for the test side, " + "this is to avoid 
GPU memory errors" + ), + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py new file mode 100755 index 00000000..b2e6a665 --- /dev/null +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -0,0 +1,305 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.augment import SpeechAugment +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def init_feats(device, **kwargs): + feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=False, **feat_args) + logging.info("feat-extractor={}".format(feat_extractor)) + feat_extractor.eval() + feat_extractor.to(device) + return feat_extractor + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("xvector-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def augment(key0, x0, augmenter, aug_df, aug_id): + if augmenter is None: + x = x0 + key = key0 + else: + x, aug_info = augmenter(x0) + key = "%s-aug-%02d" % (key0, aug_id) + aug_df_row = { + "key_aug": key, + "key_orig": key0, + "noise_type": aug_info["noise"]["noise_type"], + "snr": aug_info["noise"]["snr"], + "rir_type": aug_info["reverb"]["rir_type"], + "srr": aug_info["reverb"]["srr"], + "sdr": aug_info["sdr"], + } + + aug_df.append(pd.DataFrame(aug_df_row, index=[0])) + + return key, x + + +def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): + utt_length = rng.integers(low=min_utt_length, high=max_utt_length + 1) + if utt_length < x.shape[1]: + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) + x = x[:, first_frame : first_frame + utt_length] + logging.info( + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, + ) + return x + + +def eval_xvec( + input_spec, + output_spec, + vad_spec, + write_num_frames_spec, + vad_path_prefix, + model_path, + chunk_length, + random_utt_length, + min_utt_length, + max_utt_length, + aug_cfg, + num_augs, + aug_info_path, + use_gpu, + **kwargs +): + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) + device = init_device(use_gpu) + feat_extractor = init_feats(device, **kwargs) + model = load_model(model_path, device) + + if write_num_frames_spec is not None: + keys = [] + info = [] + + if aug_cfg is not None: + 
augmenter = SpeechAugment.create(aug_cfg, rng=rng) + aug_df = [] + else: + augmenter = None + aug_df = None + num_augs = 1 + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output stream: %s", output_spec) + with DWF.create(output_spec) as writer: + logging.info( + "opening input stream: {} with args={}".format(input_spec, ar_args) + ) + with AR(input_spec, **ar_args) as reader: + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) + + while not reader.eof(): + t1 = time.time() + key, x0, fs = reader.read(1) + if len(key) == 0: + break + + x0 = x0[0] + key0 = key[0] + t2 = time.time() + + logging.info("processing utt %s" % (key0)) + for aug_id in range(num_augs): + t3 = time.time() + key, x = augment(key0, x0, augmenter, aug_df, aug_id) + t4 = time.time() + with torch.no_grad(): + x = torch.tensor( + x[None, :], dtype=torch.get_default_dtype() + ).to(device) + + x, _ = feat_extractor(x) + t5 = time.time() + tot_frames = x.shape[1] + if vad_spec is not None: + vad = v_reader.read(key0, num_frames=tot_frames)[0] + vad = torch.tensor(vad, dtype=torch.bool).to(device) + x = x[:, vad] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, + ) + + if random_utt_length: + x = select_random_chunk( + key, x, min_utt_length, max_utt_length, rng + ) + + t6 = time.time() + if x.shape[1] == 0: + y = np.zeros((model.embed_dim,), dtype=float_cpu()) + else: + x = x.transpose(1, 2).contiguous() + y = model(x).cpu().numpy()[0] + + t7 = time.time() + writer.write([key], [y]) + if write_num_frames_spec is not None: + keys.append(key) + info.append(str(x.shape[1])) + + t8 = time.time() + read_time = t2 - t1 + tot_time = read_time + t8 - t3 + logging.info( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f", + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, + ) + + if write_num_frames_spec is not None: + logging.info("writing num-frames to %s", write_num_frames_spec) + u2nf = Utt2Info.create(keys, info) + u2nf.save(write_num_frames_spec) + + if aug_info_path is not None: + aug_df = pd.concat(aug_df, ignore_index=True) + aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") + + +def main(): + parser = ArgumentParser( + description=( + "Evaluates x-vectors logits from waveform computing " + "acoustic features on the fly" + ) + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--write-num-frames", dest="write_num_frames_spec", default=None + ) + + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + AR.add_class_args(parser) + + parser.add_argument("--aug-cfg", default=None) + parser.add_argument("--aug-info-path", default=None) + parser.add_argument( + "--num-augs", default=1, type=int, help="number of augmentations per utterance" + ) + + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--chunk-length", + type=int, + default=0, + help=( + "number of frames used in each forward pass " + "of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + + 
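A note on the CLI pattern shared by all of these bin scripts: they build a jsonargparse `ArgumentParser` with a `--cfg` flag bound to `ActionConfigFile`, so every flag in blocks like the one here can also be supplied from a YAML config, and `namespace_to_dict(args)` feeds the parsed namespace straight into the top-level function. A minimal sketch (the flag values are illustrative):

```python
from jsonargparse import ActionConfigFile, ArgumentParser, namespace_to_dict

parser = ArgumentParser(description="toy version of the bin-script CLI pattern")
parser.add_argument("--cfg", action=ActionConfigFile)  # e.g. --cfg conf.yaml
parser.add_argument("--model-path", required=True)
parser.add_argument("--use-gpu", default=False, action="store_true")

# flags may come from the command line, a YAML file, or both
args = parser.parse_args(["--model-path", "exp/xvector.pth"])
print(namespace_to_dict(args))  # dict ready to pass as **kwargs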
parser.add_argument( + "--random-utt-length", + default=False, + action="store_true", + help="calculates x-vector from a random chunk", + ) + parser.add_argument( + "--min-utt-length", + type=int, + default=500, + help=("minimum utterance length when using random utt length"), + ) + parser.add_argument( + "--max-utt-length", + type=int, + default=12000, + help=("maximum utterance length when using random utt length"), + ) + + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="run in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_xvec(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py new file mode 100755 index 00000000..336ec818 --- /dev/null +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -0,0 +1,354 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +import torchaudio.transforms as tat +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.augment import SpeechAugment + +# from hyperion.torch import TorchModelLoader as TML +from hyperion.torch import TorchModel +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info + +resamplers = {} + + +def get_resampler(source_fs, target_fs): + if source_fs in resamplers: + return resamplers[source_fs] + + resampler = tat.Resample( + int(source_fs), + int(target_fs), + lowpass_filter_width=64, + rolloff=0.9475937167399596, + resampling_method="kaiser_window", + beta=14.769656459379492, + ) + resampler_f = lambda x: resampler(torch.from_numpy(x)).numpy() + resamplers[source_fs] = resampler_f + return resampler_f + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + # model = TML.load(model_path) + model = TorchModel.auto_load(model_path) + logging.info("xvector-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def augment(key0, x0, augmenter, aug_df, aug_id): + if augmenter is None: + x = x0 + key = key0 + else: + x, aug_info = augmenter(x0) + key = "%s-aug-%02d" % (key0, aug_id) + aug_df_row = { + "key_aug": key, + "key_orig": key0, + "noise_type": aug_info["noise"]["noise_type"], + "snr": aug_info["noise"]["snr"], + "rir_type": aug_info["reverb"]["rir_type"], + "srr": aug_info["reverb"]["srr"], + "sdr": aug_info["sdr"], + } + + aug_df.append(pd.DataFrame(aug_df_row, index=[0])) + + return key, x + + +def select_random_chunk(key, x, fs, min_utt_length, max_utt_length, rng): + utt_length = rng.integers( + low=int(fs * min_utt_length), high=int(fs * 
max_utt_length + 1) + ) + if utt_length < x.shape[1]: + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) + x = x[:, first_frame : first_frame + utt_length] + logging.info( + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, + ) + return x + + +def extract_xvectors( + recordings_file, + output_spec, + vad_spec, + write_speech_dur, + vad_path_prefix, + model_path, + hf_chunk_length, + xvec_chunk_length, + embed_layer, + random_utt_length, + min_utt_length, + max_utt_length, + aug_cfg, + num_augs, + aug_info_path, + use_gpu, + **kwargs, +): + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) + device = init_device(use_gpu) + model = load_model(model_path, device) + + if write_speech_dur is not None: + keys = [] + info = [] + + if aug_cfg is not None: + augmenter = SpeechAugment.create(aug_cfg, rng=rng) + aug_df = [] + else: + augmenter = None + aug_df = None + num_augs = 1 + + metadata_columns = ["speech_duration"] + + ar_args = AR.filter_args(**kwargs) + ar_args["wav_scale"] = 1.0 + logging.info("opening output stream: %s", output_spec) + with DWF.create(output_spec, metadata_columns=metadata_columns) as writer: + logging.info(f"opening input stream: {recordings_file} with args={ar_args}") + with AR(recordings_file, **ar_args) as reader: + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) + + while not reader.eof(): + t1 = time.time() + key, x0, fs = reader.read(1) + if len(key) == 0: + break + + x0 = x0[0] + key0 = key[0] + fs = fs[0] + t2 = time.time() + if fs != model.sample_frequency: + resampler = get_resampler(fs, model.sample_frequency) + x0 = resampler(x0) + + logging.info("processing utt %s", key0) + for aug_id in range(num_augs): + metadata = {} + t3 = time.time() + key, x = augment(key0, x0, augmenter, aug_df, aug_id) + t4 = time.time() + with torch.no_grad(): + x = torch.tensor( + x[None, :], dtype=torch.get_default_dtype() + ).to(device) + + t5 = time.time() + tot_samples = x.shape[1] + if vad_spec is not None: + vad = v_reader.read(key0)[0] + vad = torch.tensor( + vad[None, None, :], dtype=torch.float + ).to(device) + vad = torch.nn.functional.interpolate( + vad, size=x.size(-1), mode="nearest" + ).bool()[0, 0] + x = x[:, vad] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech samples", + key, + x.shape[1], + tot_samples, + x.shape[1] / tot_samples * 100, + ) + + if random_utt_length: + x = select_random_chunk( + key, x, fs, min_utt_length, max_utt_length, rng + ) + + metadata["speech_duration"] = x.shape[1] / fs + + t6 = time.time() + if x.shape[1] == 0: + y = np.zeros((model.embed_dim,), dtype=float_cpu()) + else: + y = ( + model.extract_embed( + x, + hf_chunk_length=hf_chunk_length, + xvec_chunk_length=xvec_chunk_length, + embed_layer=embed_layer, + ) + .cpu() + .numpy()[0] + ) + + t7 = time.time() + writer.write([key], [y], metadata=metadata) + if write_speech_dur is not None: + keys.append(key) + info.append(str(x.shape[1] / fs)) + + t8 = time.time() + read_time = t2 - t1 + tot_time = read_time + t8 - t3 + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ), + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x.shape[1] / fs / tot_time, + ) + + if write_speech_dur is not None: + logging.info("writing speech duration in secs to %s", 
write_speech_dur) + u2sd = Utt2Info.create(keys, info) + u2sd.save(write_speech_dur) + + if aug_info_path is not None: + aug_df = pd.concat(aug_df, ignore_index=True) + aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") + + +def main(): + parser = ArgumentParser( + description=( + "Extracts x-vectors from waveform computing " "acoustic features on the fly" + ) + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--recordings-file", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument("--write-speech-dur", default=None) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + AR.add_class_args(parser) + + parser.add_argument("--aug-cfg", default=None) + parser.add_argument("--aug-info-path", default=None) + parser.add_argument( + "--num-augs", default=1, type=int, help="number of augmentations per utterance" + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--hf-chunk-length", + type=float, + default=0, + help=( + "max. chunk length used in each forward pass " + "of the hf encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--xvec-chunk-length", + type=float, + default=0, + help=( + "max. chunk length used in each forward pass " + "of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from, " + "if None, it uses layer set in training phase" + ), + ) + + parser.add_argument( + "--random-utt-length", + default=False, + action="store_true", + help="calculates x-vector from a random chunk", + ) + parser.add_argument( + "--min-utt-length", + type=float, + default=5, + help=("minimum utterance length in secs when using random utt length"), + ) + parser.add_argument( + "--max-utt-length", + type=float, + default=120, + help=("maximum utterance length in secs when using random utt length"), + ) + + parser.add_argument("--output-spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_wav2xvectors.py b/hyperion/bin/extract_wav2xvectors.py new file mode 100755 index 00000000..3cca3ede --- /dev/null +++ b/hyperion/bin/extract_wav2xvectors.py @@ -0,0 +1,336 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +import torchaudio.transforms as tat +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.augment import SpeechAugment + +# from hyperion.torch import TorchModelLoader as TML +from hyperion.torch import TorchModel +from hyperion.torch.utils import open_device +from 
hyperion.utils import Utt2Info + +resamplers = {} + + +def get_resampler(source_fs, target_fs): + if source_fs in resamplers: + return resamplers[source_fs] + + resampler = tat.Resample( + int(source_fs), + int(target_fs), + lowpass_filter_width=64, + rolloff=0.9475937167399596, + resampling_method="kaiser_window", + beta=14.769656459379492, + ) + resampler_f = lambda x: resampler(torch.from_numpy(x)).numpy() + resamplers[source_fs] = resampler_f + return resampler_f + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus=%d", num_gpus) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model %s", model_path) + model = TorchModel.auto_load(model_path) + logging.info(f"xvector-model={model}") + model.to(device) + model.eval() + return model + + +def augment(key0, x0, augmenter, aug_df, aug_id): + if augmenter is None: + x = x0 + key = key0 + else: + x, aug_info = augmenter(x0) + key = "%s-aug-%02d" % (key0, aug_id) + aug_df_row = { + "key_aug": key, + "key_orig": key0, + "noise_type": aug_info["noise"]["noise_type"], + "snr": aug_info["noise"]["snr"], + "rir_type": aug_info["reverb"]["rir_type"], + "srr": aug_info["reverb"]["srr"], + "sdr": aug_info["sdr"], + } + + aug_df.append(pd.DataFrame(aug_df_row, index=[0])) + + return key, x + + +def select_random_chunk(key, x, fs, min_utt_length, max_utt_length, rng): + utt_length = rng.integers( + low=int(fs * min_utt_length), high=int(fs * max_utt_length + 1) + ) + if utt_length < x.shape[1]: + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) + x = x[:, first_frame : first_frame + utt_length] + logging.info( + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, + ) + return x + + +def extract_xvectors( + recordings_file, + output_spec, + vad_spec, + write_speech_dur, + vad_path_prefix, + model_path, + chunk_length, + embed_layer, + random_utt_length, + min_utt_length, + max_utt_length, + aug_cfg, + num_augs, + aug_info_path, + use_gpu, + **kwargs, +): + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) + device = init_device(use_gpu) + model = load_model(model_path, device) + + if write_speech_dur is not None: + keys = [] + info = [] + + if aug_cfg is not None: + augmenter = SpeechAugment.create(aug_cfg, rng=rng) + aug_df = [] + else: + augmenter = None + aug_df = None + num_augs = 1 + + metadata_columns = ["speech_duration"] + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output stream: %s with args=%s", output_spec, str(ar_args)) + with DWF.create(output_spec, metadata_columns=metadata_columns) as writer: + logging.info(f"opening input stream: {recordings_file} with args={ar_args}") + with AR(recordings_file, **ar_args) as reader: + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + while not reader.eof(): + t1 = time.time() + key, x0, fs = reader.read(1) + if len(key) == 0: + break + + x0 = x0[0] + key0 = key[0] + fs = fs[0] + t2 = time.time() + if fs != model.sample_frequency: + resampler = get_resampler(fs, model.sample_frequency) + x0 = resampler(x0) + + logging.info("processing utt %s", key0) + for aug_id in range(num_augs): + metadata = {} + t3 = time.time() + key, x = augment(key0, x0, augmenter, aug_df, aug_id) + t4 = time.time() + with torch.no_grad(): + x = torch.tensor( + x[None, :], 
dtype=torch.get_default_dtype() + ).to(device) + t5 = time.time() + tot_samples = x.shape[1] + if vad_spec is not None: + vad = v_reader.read(key0)[0] + vad = torch.tensor( + vad[None, None, :], dtype=torch.float + ).to(device) + vad = torch.nn.functional.interpolate( + vad, size=x.size(-1), mode="nearest" + ).bool()[0, 0] + x = x[:, vad] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech samples", + key, + x.shape[1], + tot_samples, + x.shape[1] / tot_samples * 100, + ) + + if random_utt_length: + x = select_random_chunk( + key, x, fs, min_utt_length, max_utt_length, rng + ) + + metadata["speech_duration"] = ( + x.shape[1] / model.sample_frequency + ) + + t6 = time.time() + if x.shape[1] == 0: + y = np.zeros((model.embed_dim,), dtype=float_cpu()) + else: + y = ( + model.extract_embed( + x, + chunk_length=chunk_length, + embed_layer=embed_layer, + ) + .cpu() + .numpy()[0] + ) + + t7 = time.time() + writer.write([key], [y], metadata=metadata) + if write_speech_dur is not None: + keys.append(key) + info.append(str(x.shape[1] / fs)) + + t8 = time.time() + read_time = t2 - t1 + tot_time = read_time + t8 - t3 + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ), + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x.shape[1] / fs / tot_time, + ) + + if write_speech_dur is not None: + logging.info("writing speech duration in secs to %s", write_speech_dur) + u2sd = Utt2Info.create(keys, info) + u2sd.save(write_speech_dur) + + if aug_info_path is not None: + aug_df = pd.concat(aug_df, ignore_index=True) + aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") + + +def main(): + parser = ArgumentParser( + description="""Extracts x-vectors from waveform computing acoustic features on the fly""" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--recordings-file", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument("--write-speech-dur", default=None) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + AR.add_class_args(parser) + + parser.add_argument("--aug-cfg", default=None) + parser.add_argument("--aug-info-path", default=None) + parser.add_argument( + "--num-augs", default=1, type=int, help="number of augmentations per utterance" + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--chunk-length", + type=float, + default=0, + help=( + "max. 
chunk length used in each forward pass " + "of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from, " + "if None, it uses layer set in training phase" + ), + ) + + parser.add_argument( + "--random-utt-length", + default=False, + action="store_true", + help="calculates x-vector from a random chunk", + ) + parser.add_argument( + "--min-utt-length", + type=float, + default=5, + help=("minimum utterance length in secs when using random utt length"), + ) + parser.add_argument( + "--max-utt-length", + type=float, + default=120, + help=("maximum utterance length in secs when using random utt length"), + ) + + parser.add_argument("--output-spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_from_feats.py b/hyperion/bin/extract_xvectors_from_feats.py new file mode 100755 index 00000000..e70225c2 --- /dev/null +++ b/hyperion/bin/extract_xvectors_from_feats.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialDataReaderFactory as DRF +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.feats import MeanVarianceNorm as MVN +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def init_mvn(device, **kwargs): + mvn_args = MVN.filter_args(**kwargs["mvn"]) + logging.info("mvn args={}".format(mvn_args)) + mvn = MVN(**mvn_args) + if mvn.norm_mean or mvn.norm_var: + return mvn + return None + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("xvector-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): + utt_length = rng.integers(low=min_utt_length, high=max_utt_length + 1) + if utt_length < x.shape[1]: + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) + x = x[:, first_frame : first_frame + utt_length] + logging.info( + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, + ) + return x + + +def extract_xvectors( + input_spec, + output_spec, + vad_spec, + write_num_frames_spec, + vad_path_prefix, + model_path, + chunk_length, + embed_layer, + random_utt_length, + min_utt_length, + max_utt_length, + use_gpu, + **kwargs +): + logging.info("initializing") 
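A note on the tensor layout used by the feature-based extractor that follows: readers return features as `(num_frames, feat_dim)`, while `extract_embed` expects channels-first `(batch, feat_dim, num_frames)`, which is why the code builds `x.T[None, :]` before the forward pass. A small illustration (the shapes are examples, not requirements):

```python
import numpy as np
import torch

x = np.zeros((300, 80), dtype=np.float32)  # 3 s of 80-dim features at 10 ms shift
xx = torch.tensor(x.T[None, :], dtype=torch.get_default_dtype())
assert xx.shape == (1, 80, 300)  # (batch, feat_dim, num_frames)
```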
+ rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) + device = init_device(use_gpu) + mvn = init_mvn(device, **kwargs) + model = load_model(model_path, device) + + if write_num_frames_spec is not None: + keys = [] + info = [] + + dr_args = DRF.filter_args(**kwargs) + logging.info("opening output stream: %s" % (output_spec)) + with DWF.create(output_spec) as writer: + logging.info("opening input stream: %s" % (input_spec)) + with DRF.create(input_spec, **dr_args) as reader: + if vad_spec is not None: + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + while not reader.eof(): + t1 = time.time() + key, data = reader.read(1) + if len(key) == 0: + break + t2 = time.time() + logging.info("processing utt %s" % (key[0])) + x = data[0] + if mvn is not None: + x = mvn.normalize(x) + t3 = time.time() + tot_frames = x.shape[0] + if vad_spec is not None: + vad = v_reader.read(key, num_frames=x.shape[0])[0].astype( + "bool", copy=False + ) + x = x[vad] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % (key[0], x.shape[0], tot_frames, x.shape[0] / tot_frames * 100) + ) + + if random_utt_length: + x = select_random_chunk(key, x, min_utt_length, max_utt_length, rng) + + t4 = time.time() + if x.shape[0] == 0: + y = np.zeros((model.embed_dim,), dtype=float_cpu()) + else: + xx = torch.tensor(x.T[None, :], dtype=torch.get_default_dtype()) + with torch.no_grad(): + y = ( + model.extract_embed( + xx, chunk_length=chunk_length, embed_layer=embed_layer + ) + .detach() + .cpu() + .numpy()[0] + ) + + t5 = time.time() + writer.write(key, [y]) + if write_num_frames_spec is not None: + keys.append(key[0]) + info.append(str(x.shape[0])) + t6 = time.time() + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f mvn-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ) + % ( + key[0], + t6 - t1, + t2 - t1, + t3 - t2, + t4 - t3, + t5 - t4, + t6 - t5, + x.shape[0] * 1e-2 / (t6 - t1), + ) + ) + + if write_num_frames_spec is not None: + logging.info("writing num-frames to %s" % (write_num_frames_spec)) + u2nf = Utt2Info.create(keys, info) + u2nf.save(write_num_frames_spec) + + +def main(): + parser = ArgumentParser(description="Extracts x-vectors from features") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + DRF.add_class_args(parser) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--write-num-frames", dest="write_num_frames_spec", default=None + ) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + MVN.add_class_args(parser, prefix="mvn") + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--chunk-length", + type=int, + default=0, + help=( + "number of frames used in each forward pass of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--random-utt-length", + default=False, + action="store_true", + help="calculates x-vector from a random chunk of the utterance", + ) + parser.add_argument( + "--min-utt-length", + type=int, + default=500, + help=("minimum utterance length when using random utt length"), + ) + parser.add_argument( + "--max-utt-length", + 
type=int, + default=12000, + help=("maximum utterance length when using random utt length"), + ) + + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py new file mode 100755 index 00000000..71a24bd4 --- /dev/null +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -0,0 +1,320 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.augment import SpeechAugment +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def init_feats(device, **kwargs): + feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=False, **feat_args) + logging.info("feat-extractor={}".format(feat_extractor)) + feat_extractor.eval() + feat_extractor.to(device) + return feat_extractor + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("xvector-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def augment(key0, x0, augmenter, aug_df, aug_id): + if augmenter is None: + x = x0 + key = key0 + else: + x, aug_info = augmenter(x0) + key = "%s-aug-%02d" % (key0, aug_id) + aug_df_row = { + "key_aug": key, + "key_orig": key0, + "noise_type": aug_info["noise"]["noise_type"], + "snr": aug_info["noise"]["snr"], + "rir_type": aug_info["reverb"]["rir_type"], + "srr": aug_info["reverb"]["srr"], + "sdr": aug_info["sdr"], + } + + aug_df.append(pd.DataFrame(aug_df_row, index=[0])) + + return key, x + + +def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): + utt_length = rng.integers(low=min_utt_length, high=max_utt_length + 1) + if utt_length < x.shape[1]: + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) + x = x[:, first_frame : first_frame + utt_length] + logging.info( + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, + ) + return x + + +def extract_xvectors( + recordings_file, + output_spec, + vad_spec, + write_num_frames_spec, + vad_path_prefix, + model_path, + chunk_length, + embed_layer, + random_utt_length, + 
min_utt_length, + max_utt_length, + aug_cfg, + num_augs, + aug_info_path, + use_gpu, + **kwargs +): + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) + device = init_device(use_gpu) + feat_extractor = init_feats(device, **kwargs) + model = load_model(model_path, device) + + if write_num_frames_spec is not None: + keys = [] + info = [] + + if aug_cfg is not None: + augmenter = SpeechAugment.create(aug_cfg, rng=rng) + aug_df = [] + else: + augmenter = None + aug_df = None + num_augs = 1 + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output stream: %s", output_spec) + with DWF.create(output_spec) as writer: + logging.info( + "opening input stream: {} with args={}".format(recordings_file, ar_args) + ) + with AR(recordings_file, **ar_args) as reader: + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + while not reader.eof(): + t1 = time.time() + key, x0, fs = reader.read(1) + if len(key) == 0: + break + + x0 = x0[0] + key0 = key[0] + t2 = time.time() + + logging.info("processing utt %s", key0) + for aug_id in range(num_augs): + t3 = time.time() + key, x = augment(key0, x0, augmenter, aug_df, aug_id) + t4 = time.time() + with torch.no_grad(): + x = torch.tensor( + x[None, :], dtype=torch.get_default_dtype() + ).to(device) + + x, _ = feat_extractor(x) + t5 = time.time() + tot_frames = x.shape[1] + if vad_spec is not None: + vad = v_reader.read(key0, num_frames=tot_frames)[0] + vad = torch.tensor(vad, dtype=torch.bool).to(device) + x = x[:, vad] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, + ) + + if random_utt_length: + x = select_random_chunk( + key, x, min_utt_length, max_utt_length, rng + ) + + t6 = time.time() + if x.shape[1] == 0: + y = np.zeros((model.embed_dim,), dtype=float_cpu()) + else: + x = x.transpose(1, 2).contiguous() + y = ( + model.extract_embed( + x, + chunk_length=chunk_length, + embed_layer=embed_layer, + ) + .cpu() + .numpy()[0] + ) + + t7 = time.time() + writer.write([key], [y]) + if write_num_frames_spec is not None: + keys.append(key) + info.append(str(x.shape[-1])) + + t8 = time.time() + read_time = t2 - t1 + tot_time = read_time + t8 - t3 + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ), + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, + ) + + if write_num_frames_spec is not None: + logging.info("writing num-frames to %s", write_num_frames_spec) + u2nf = Utt2Info.create(keys, info) + u2nf.save(write_num_frames_spec) + + if aug_info_path is not None: + aug_df = pd.concat(aug_df, ignore_index=True) + aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") + + +def main(): + parser = ArgumentParser( + description=( + "Extracts x-vectors from waveform computing acoustic features on the fly" + ) + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--recordings-file", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--write-num-frames", dest="write_num_frames_spec", default=None + ) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + AR.add_class_args(parser) + + parser.add_argument("--aug-cfg", default=None) + 
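The `--aug-info-path` flag defined next collects the per-augmentation metadata built by `augment()` above: each augmented utterance contributes a one-row DataFrame, and the rows are concatenated once at the end. That accumulate-then-`pd.concat` idiom is also what pandas 2.0 and later expect now that `DataFrame.append` is gone. A toy version of the bookkeeping (keys and values are illustrative):

```python
import pandas as pd

aug_df = []
for aug_id in range(2):
    row = {"key_aug": f"utt1-aug-{aug_id:02d}", "key_orig": "utt1", "snr": 10 + aug_id}
    aug_df.append(pd.DataFrame(row, index=[0]))
aug_df = pd.concat(aug_df, ignore_index=True)  # one frame, two rows
```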
parser.add_argument("--aug-info-path", default=None) + parser.add_argument( + "--num-augs", default=1, type=int, help="number of augmentations per utterance" + ) + + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--chunk-length", + type=int, + default=0, + help=( + "number of frames used in each forward pass " + "of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from, " + "if None, it uses layer set in training phase" + ), + ) + + parser.add_argument( + "--random-utt-length", + default=False, + action="store_true", + help="calculates x-vector from a random chunk", + ) + parser.add_argument( + "--min-utt-length", + type=int, + default=500, + help=("minimum utterance length when using random utt length"), + ) + parser.add_argument( + "--max-utt-length", + type=int, + default=12000, + help=("maximum utterance length when using random utt length"), + ) + + parser.add_argument("--output-spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py new file mode 100755 index 00000000..a1186ed2 --- /dev/null +++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import torch +import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialDataReaderFactory as DRF +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.feats import MeanVarianceNorm as MVN +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def init_mvn(device, **kwargs): + mvn_args = MVN.filter_args(**kwargs["mvn"]) + logging.info("mvn args={}".format(mvn_args)) + mvn = MVN(**mvn_args) + if mvn.norm_mean or mvn.norm_var: + return mvn + return None + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("xvector-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def extract_xvectors( + input_spec, + output_spec, + vad_spec, + write_timestamps_spec, + slidwin_params_path, + vad_path_prefix, + model_path, + chunk_length, + embed_layer, + win_length, + win_shift, + snip_edges, + feat_frame_length, + feat_frame_shift, + feat_snip_edges, + use_gpu, + **kwargs +): + 
logging.info("initializing") + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) + device = init_device(use_gpu) + mvn = init_mvn(device, **kwargs) + model = load_model(model_path, device) + + if write_timestamps_spec is not None: + time_writer = DWF.create(write_timestamps_spec) + + dr_args = DRF.filter_args(**kwargs) + logging.info("opening output stream: %s" % (output_spec)) + with DWF.create(output_spec) as writer: + logging.info("opening input stream: %s" % (output_spec)) + with DRF.create(input_spec, **dr_args) as reader: + if vad_spec is not None: + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + while not reader.eof(): + t1 = time.time() + key, data = reader.read(1) + if len(key) == 0: + break + t2 = time.time() + logging.info("processing utt %s" % (key[0])) + x = data[0] + if mvn is not None: + x = mvn.normalize(x) + t3 = time.time() + tot_frames = x.shape[0] + if vad_spec is not None: + vad = v_reader.read(key, num_frames=x.shape[0])[0].astype( + "bool", copy=False + ) + x = x[vad] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % (key[0], x.shape[0], tot_frames, x.shape[0] / tot_frames * 100) + ) + + t4 = time.time() + if x.shape[0] == 0: + y = np.zeros( + ( + 1, + model.embed_dim, + ), + dtype=float_cpu(), + ) + else: + xx = torch.tensor(x.T[None, :], dtype=torch.get_default_dtype()) + with torch.no_grad(): + y = ( + model.extract_embed_slidwin( + xx, + win_length, + win_shift, + snip_edges=snip_edges, + feat_frame_length=feat_frame_length, + feat_frame_shift=feat_frame_shift, + chunk_length=chunk_length, + embed_layer=embed_layer, + detach_chunks=True, + ) + .detach() + .cpu() + .numpy()[0] + ) + + t5 = time.time() + y = y.T + writer.write(key, [y]) + + if write_timestamps_spec is not None: + num_wins = y.shape[0] + timestamps = model.compute_slidwin_timestamps( + num_wins, + win_length, + win_shift, + snip_edges, + feat_frame_length, + feat_frame_length, + feat_snip_edges, + ).numpy() + logging.info("{}".format(timestamps)) + time_writer.write(key, [timestamps]) + t6 = time.time() + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f mvn-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ) + % ( + key[0], + t6 - t1, + t2 - t1, + t3 - t2, + t4 - t3, + t5 - t4, + t6 - t5, + x.shape[0] * 1e-2 / (t6 - t1), + ) + ) + + if write_timestamps_spec is not None: + time_writer.close() + + if slidwin_params_path is not None: + params = { + "padding": model.compute_slidwin_left_padding( + win_length, + win_shift, + snip_edges, + feat_frame_length, + feat_frame_length, + feat_snip_edges, + ), + "win_length": win_length, + "win_shift": win_shift, + } + with open(slidwin_params_path, "w") as f: + yaml.dump(params, f) + + +def main(): + parser = ArgumentParser(description="Extract x-vectors over a sliding window") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + DRF.add_class_args(parser) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--write-timestamps", dest="write_timestamps_spec", default=None + ) + parser.add_argument("--slidwin-params-path", default=None) + parser.add_argument( + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + + MVN.add_class_args(parser, prefix="mvn") + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--win-length", + type=float, + 
default=1.5, + help=("window length for x-vector extraction in seconds"), + ) + parser.add_argument( + "--win-shift", + type=float, + default=0.25, + help=("window shift for x-vector extraction in seconds"), + ) + parser.add_argument( + "--snip-edges", + default=False, + action="store_true", + help=( + "If true, end effects will be handled by outputting " + "only windows that completely fit in the file, " + "and the number of windows depends on the window-length. " + "If false, the number of windows depends only on " + "the window-shift, and we reflect the data at the ends." + ), + ) + + parser.add_argument( + "--feat-frame-length", + type=float, + default=25, + help=("frame-length used to compute the acoustic features in msecs"), + ) + parser.add_argument( + "--feat-frame-shift", + type=float, + default=10, + help=("frame-shift used to compute the acoustic features in msecs"), + ) + parser.add_argument( + "--feat-snip-edges", + default=False, + action="store_true", + help=( + "If true, end effects will be handled by outputting only windows " + "that completely fit in the file, and the number of windows " + "depends on the feat-frame-length. " + "If false, the number of feature frames depends only on the " + "feat-frame-shift, and we reflect the waveform at the ends." + ), + ) + + parser.add_argument( + "--chunk-length", + type=int, + default=0, + help=( + "number of frames used in each forward pass of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py new file mode 100755 index 00000000..f973b566 --- /dev/null +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -0,0 +1,359 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.augment import SpeechAugment +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def init_feats(device, **kwargs): + feat_args = 
AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=False, **feat_args) + logging.info("feat-extractor={}".format(feat_extractor)) + feat_extractor.eval() + feat_extractor.to(device) + return feat_extractor + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("xvector-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def augment(key0, x0, augmenter, aug_df, aug_id): + if augmenter is None: + x = x0 + key = key0 + else: + x, aug_info = augmenter(x0) + key = "%s-aug-%02d" % (key0, aug_id) + aug_df_row = { + "key_aug": key, + "key_orig": key0, + "noise_type": aug_info["noise"]["noise_type"], + "snr": aug_info["noise"]["snr"], + "rir_type": aug_info["reverb"]["rir_type"], + "srr": aug_info["reverb"]["srr"], + "sdr": aug_info["sdr"], + } + + aug_df.append(pd.DataFrame(aug_df_row, index=[0])) + + return key, x + + +def extract_xvectors( + input_spec, + output_spec, + vad_spec, + write_timestamps_spec, + slidwin_params_path, + vad_path_prefix, + model_path, + chunk_length, + embed_layer, + win_length, + win_shift, + snip_edges, + aug_cfg, + num_augs, + aug_info_path, + use_gpu, + **kwargs +): + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) + device = init_device(use_gpu) + feat_extractor = init_feats(device, **kwargs) + model = load_model(model_path, device) + + feat_args = kwargs["feats"]["audio_feats"] + feat_frame_length = feat_args["frame_length"] + feat_frame_shift = feat_args["frame_shift"] + feat_snip_edges = feat_args["snip_edges"] + + if write_timestamps_spec is not None: + time_writer = DWF.create(write_timestamps_spec) + + if aug_cfg is not None: + augmenter = SpeechAugment.create(aug_cfg, rng=rng) + aug_df = [] + else: + augmenter = None + aug_df = None + num_augs = 1 + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output stream: %s", output_spec) + with DWF.create(output_spec) as writer: + logging.info( + "opening input stream: {} with args={}".format(input_spec, ar_args) + ) + with AR(input_spec, **ar_args) as reader: + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) + + while not reader.eof(): + t1 = time.time() + key, x0, fs = reader.read(1) + if len(key) == 0: + break + + x0 = x0[0] + key0 = key[0] + t2 = time.time() + + logging.info("processing utt %s", key0) + for aug_id in range(num_augs): + t3 = time.time() + key, x = augment(key0, x0, augmenter, aug_df, aug_id) + t4 = time.time() + with torch.no_grad(): + x = torch.tensor( + x[None, :], dtype=torch.get_default_dtype() + ).to(device) + + x, _ = feat_extractor(x) + t5 = time.time() + tot_frames = x.shape[1] + if vad_spec is not None: + vad = v_reader.read(key0, num_frames=tot_frames)[0] + vad = torch.tensor(vad, dtype=torch.bool).to(device) + x = x[:, vad] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, + ) + + t6 = time.time() + if x.shape[1] == 0: + y = np.zeros( + ( + 1, + model.embed_dim, + ), + dtype=float_cpu(), + ) + else: + x = x.transpose(1, 2).contiguous() + y = ( + model.extract_embed_slidwin( + x, + win_length, + win_shift, + snip_edges=snip_edges, + feat_frame_length=feat_frame_length, + feat_frame_shift=feat_frame_shift, + chunk_length=chunk_length, + 
embed_layer=embed_layer, + detach_chunks=True, + ) + .detach() + .cpu() + .numpy()[0] + ) + + t7 = time.time() + y = y.T + writer.write([key], [y]) + + if write_timestamps_spec is not None: + num_wins = y.shape[0] + # map window indices to start/end times in seconds + timestamps = model.compute_slidwin_timestamps( + num_wins, + win_length, + win_shift, + snip_edges, + feat_frame_length, + feat_frame_shift, + feat_snip_edges, + ).numpy() + logging.info("{}".format(timestamps)) + time_writer.write([key], [timestamps]) + + t8 = time.time() + read_time = t2 - t1 + tot_time = read_time + t8 - t3 + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ), + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, + ) + + if write_timestamps_spec is not None: + time_writer.close() + + if aug_info_path is not None: + aug_df = pd.concat(aug_df, ignore_index=True) + aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") + + if slidwin_params_path is not None: + params = { + "padding": model.compute_slidwin_left_padding( + win_length, + win_shift, + snip_edges, + feat_frame_length, + feat_frame_shift, + feat_snip_edges, + ), + "win_length": win_length, + "win_shift": win_shift, + } + with open(slidwin_params_path, "w") as f: + yaml.dump(params, f) + + +def main(): + parser = ArgumentParser( + description=( + "Extract x-vectors over a sliding window " + "from waveform, computing " + "acoustic features on the fly" + ) + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--write-timestamps", dest="write_timestamps_spec", default=None + ) + parser.add_argument("--slidwin-params-path", default=None) + + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + AR.add_argparse_args(parser) + + parser.add_argument("--aug-cfg", default=None) + parser.add_argument("--aug-info-path", default=None) + parser.add_argument( + "--num-augs", default=1, type=int, help="number of augmentations per utterance" + ) + + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--win-length", + type=float, + default=1.5, + help=("window length for x-vector extraction in seconds"), + ) + parser.add_argument( + "--win-shift", + type=float, + default=0.25, + help=("window shift for x-vector extraction in seconds"), + ) + parser.add_argument( + "--snip-edges", + default=False, + action="store_true", + help=( + "If true, end effects will be handled by outputting " + "only windows that completely fit in the file, " + "and the number of windows depends on the window-length. " + "If false, the number of windows depends only on " + "the window-shift, and we reflect the data at the ends." 
+ ), + ) + + parser.add_argument( + "--chunk-length", + type=int, + default=0, + help=( + "number of frames used in each forward pass " + "of the x-vector encoder; " + "if 0, the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from; " + "if None, it uses the layer set in the training phase" + ), + ) + + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract x-vectors on GPU" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_wav2vec2transducer.py b/hyperion/bin/finetune_wav2vec2transducer.py new file mode 100755 index 00000000..138f18f7 --- /dev/null +++ b/hyperion/bin/finetune_wav2vec2transducer.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import HFWav2Vec2Transducer +from hyperion.torch.trainers import TransducerTrainer as Trainer +from hyperion.torch.utils import ddp + +model_dict = { + "hf_wav2vec2transducer": HFWav2Vec2Transducer, +} + + +def transducer_collate(batch): + # pad waveforms to a common length and pack the transcripts into a k2 ragged tensor + audio = [] + audio_length = [] + target = [] + for record in batch: + wav = torch.as_tensor(record[0]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record[1]) + audio = pad_sequence(audio) + audio_length = torch.as_tensor(audio_length) + target = k2.RaggedTensor(target) + return torch.transpose(audio, 0, 1), audio_length, target + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) 
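+ # batch_size is omitted on purpose: batching is delegated to the segment sampler, and transducer_collate handles the variable-length audio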
+ return data_loader + + +def init_model(in_model_file, rank, model_class, **kwargs): + model_args = model_class.filter_finetune_args(**kwargs["model"]) + # model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network ft args={}".format(model_args)) + model = TML.load(in_model_file) + model.change_config(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model(**kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") + + parser.add_argument("--in-model-file", required=True) + model_class.add_finetune_args(parser, prefix="model") + # model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +def main(): + parser = ArgumentParser( + description="Fine-tune Wav2Vec2Transducer model from audio files" + ) + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + 
args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + # multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py new file mode 100755 index 00000000..7020e32f --- /dev/null +++ b/hyperion/bin/finetune_wav2vec2xvector.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import ( + HFHubert2ResNet1dXVector, + HFWav2Vec2ResNet1dXVector, + HFWavLM2ResNet1dXVector, +) +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp + +model_dict = { + "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, + "hf_hubert2resnet1d": HFHubert2ResNet1dXVector, + "hf_wavlm2resnet1d": HFWavLM2ResNet1dXVector, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_model(num_classes, in_model_file, rank, **kwargs): + model_args = kwargs["model"] + if rank == 0: + logging.info("xvector network ft args={}".format(model_args)) + model_args["xvector"]["num_classes"] = num_classes + model = TML.load(in_model_file) + model.change_config(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + try: + hard_prototype_mining = train_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + if rank == 0: + logging.info("setting 
hard prototypes") + + affinity_matrix = model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + if not val_loader.batch_sampler.hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + +def train_model(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + init_hard_prototype_mining(model, train_loader, val_loader, rank) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + parser.add_argument("--in-model-file", required=True) + model_class.add_finetune_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +def main(): + parser = ArgumentParser( + description="Finetunes Wav2Vec2XVector model from audio files" + ) + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = 
model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_wav2xvector.py b/hyperion/bin/finetune_wav2xvector.py new file mode 100755 index 00000000..97356c01 --- /dev/null +++ b/hyperion/bin/finetune_wav2xvector.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy + +# from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import Wav2ResNet1dXVector as R1dXVec +from hyperion.torch.models import Wav2ResNetXVector as RXVec + +# from hyperion.torch.models import SpineNetXVector as SpineXVec +# from hyperion.torch.models import TDNNXVector as TDXVec +# from hyperion.torch.models import TransformerXVectorV1 as TFXVec +# from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + # "efficientnet": EXVec, + # "tdnn": TDXVec, + # "transformer": TFXVec, + # "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_xvector(num_classes, in_model_file, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_finetune_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["xvector"]["num_classes"] = num_classes + model = TML.load(in_model_file) + model.change_config(**xvec_args) + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model + + +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + try: + hard_prototype_mining = train_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + if rank == 0: + logging.info("setting hard prototypes") + + affinity_matrix = 
model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + try: + hard_prototype_mining = val_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + +def train_xvec(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) + init_hard_prototype_mining(model, train_loader, val_loader, rank) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + xvec_class.add_finetune_args(parser, prefix="model") + parser.add_argument("--in-model-file", required=True) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +def main(): + parser = ArgumentParser(description="Fine-tune x-vector model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + 
pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_dfr_from_feats.py b/hyperion/bin/finetune_xvector_dfr_from_feats.py new file mode 100755 index 00000000..140cc3a2 --- /dev/null +++ b/hyperion/bin/finetune_xvector_dfr_from_feats.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import FeatSeqDataset as SD +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import XVector as XVec +from hyperion.torch.trainers import XVectorTrainerDeepFeatReg as Trainer +from hyperion.torch.utils import ddp, open_device + + +def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): + sd_args = SD.filter_args(**kwargs) + sampler_args = Sampler.filter_args(**kwargs) + if rank == 0: + logging.info("audio dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") + + train_data = SD(data_rspec, train_list, **sd_args) + val_data = SD(data_rspec, val_list, is_val=True, **sd_args) + if rank == 0: + logging.info("init samplers") + train_sampler = Sampler(train_data, **sampler_args) + val_sampler = Sampler(val_data, **sampler_args) + + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + + train_loader = torch.utils.data.DataLoader( + train_data, batch_sampler=train_sampler, **largs + ) + + test_loader = torch.utils.data.DataLoader( + val_data, batch_sampler=val_sampler, **largs + ) + + return train_loader, test_loader + + +def init_xvector( + num_classes, in_model_path, prior_model_path, rank, train_mode, **kwargs +): + xvec_args = XVec.filter_finetune_args(**kwargs) + if rank == 0: + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes + model = TML.load(in_model_path) + model.rebuild_output_layer(**xvec_args) + if prior_model_path: + prior_model = TML.load(prior_model_path) + else: + prior_model = model.copy() + prior_model.freeze() + prior_model.eval() + if train_mode == "ft-embed-affine": + model.freeze_preembed_layers() + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model, prior_model + + +def train_xvec(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + train_mode = kwargs["train_mode"] + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + train_loader, test_loader = init_data(**kwargs) + model, prior_model = init_xvector(train_loader.dataset.num_classes, **kwargs) + + trn_args = 
Trainer.filter_args(**kwargs) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + + trainer = Trainer( + model, + prior_model, + device=device, + metrics=metrics, + ddp=world_size > 1, + train_mode=train_mode, + **trn_args + ) + if args.resume: + trainer.load_last_checkpoint() + trainer.fit(train_loader, test_loader) + + ddp.ddp_cleanup() + + +# def train_xvec(data_rspec, train_list, val_list, in_model_path, +# prior_model_path, +# reg_layers_enc, reg_layers_classif, +# reg_weight_enc, reg_weight_classif, reg_loss, +# num_gpus, resume, num_workers, +# train_mode, **kwargs): + +# set_float_cpu('float32') +# logging.info('initializing devices num_gpus={}'.format(num_gpus)) +# device = open_device(num_gpus=num_gpus) + +# sd_args = SD.filter_args(**kwargs) +# sampler_args = Sampler.filter_args(**kwargs) +# xvec_args = XVec.filter_finetune_args(**kwargs) +# opt_args = OF.filter_args(prefix='opt', **kwargs) +# lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) +# trn_args = Trainer.filter_args(**kwargs) +# logging.info('seq dataset args={}'.format(sd_args)) +# logging.info('sampler args={}'.format(sampler_args)) +# logging.info('xvector finetune args={}'.format(xvec_args)) +# logging.info('optimizer args={}'.format(opt_args)) +# logging.info('lr scheduler args={}'.format(lrsch_args)) +# logging.info('trainer args={}'.format(trn_args)) + +# logging.info('init datasets') +# train_data = SD(data_rspec, train_list, **sd_args) +# val_data = SD(data_rspec, val_list, is_val=True, **sd_args) + +# logging.info('init samplers') +# train_sampler = Sampler(train_data, **sampler_args) +# val_sampler = Sampler(val_data, **sampler_args) + +# largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + +# train_loader = torch.utils.data.DataLoader( +# train_data, batch_sampler = train_sampler, **largs) + +# test_loader = torch.utils.data.DataLoader( +# val_data, batch_sampler = val_sampler, **largs) + +# xvec_args['num_classes'] = train_data.num_classes +# model = TML.load(in_model_path) +# model.rebuild_output_layer(**xvec_args) +# if prior_model_path: +# prior_model = TML.load(prior_model_path) +# else: +# prior_model = model.copy() +# prior_model.freeze() +# prior_model.eval() +# if train_mode == 'ft-embed-affine': +# model.freeze_preembed_layers() +# logging.info(str(model)) + +# optimizer = OF.create(model.parameters(), **opt_args) +# lr_sch = LRSF.create(optimizer, **lrsch_args) +# metrics = { 'acc': CategoricalAccuracy() } + +# if reg_loss == 'l1': +# reg_loss = nn.L1Loss() +# else: +# reg_loss = nn.MSELoss() + +# trainer = Trainer(model, prior_model, optimizer, +# reg_layers_enc=reg_layers_enc, +# reg_layers_classif=reg_layers_classif, +# reg_weight_enc=reg_weight_enc, +# reg_weight_classif=reg_weight_classif, +# reg_loss=reg_loss, +# device=device, metrics=metrics, lr_scheduler=lr_sch, +# data_parallel=(num_gpus>1), train_mode=train_mode, +# **trn_args) +# if resume: +# trainer.load_last_checkpoint() +# trainer.fit(train_loader, test_loader) + + +def main(): + parser = ArgumentParser( + description="Fine-tune x-vector model with deep feature loss regularization" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--data-rspec", dest="data_rspec", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--val-list", dest="val_list", required=True) + + SD.add_argparse_args(parser) + Sampler.add_argparse_args(parser) + 
parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + + # parser.add_argument('--reg-layers-enc', type=int, default=None, nargs='+', + # help='list of layers from the encoder nnet to use for regularization ') + # parser.add_argument('--reg-layers-classif', type=int, default=None, nargs='+', + # help='list of layers from the classif nnet to use for regularization ') + # parser.add_argument('--reg-weight-enc', type=float, default=0.1, + # help='weight for regularization from enc layers') + # parser.add_argument('--reg-weight-classif', type=float, default=0.1, + # help='weight for regularization from classif layers') + # parser.add_argument('--reg-loss', default='l1', + # choices=['l1', 'mse'], + # help=('type of regularization loss')) + + parser.add_argument("--in-model-path", required=True) + parser.add_argument("--prior-model-path") + XVec.add_finetune_args(parser) + Trainer.add_class_args(parser) + ddp.add_ddp_args(parser) + + # parser.add_argument('--num-gpus', type=int, default=1, + # help='number of gpus, if 0 it uses cpu') + parser.add_argument( + "--seed", type=int, default=1123581321, help="random seed" + ) + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "--train-mode", + default="ft-embed-affine", + choices=["ft-full", "ft-embed-affine"], + help=( + "ft-full: adapt full x-vector network, " + "ft-embed-affine: adapt affine transform before embedding" + ), + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument("--local_rank", default=0, type=int) + + args = parser.parse_args() + gpu_id = args.local_rank + del args.local_rank + + if gpu_id == 0: + try: + config_file = Path(args.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args) + + # config_logger(args.verbose) + # del args.verbose + # logging.debug(args) + + # torch.manual_seed(args.seed) + # del args.seed + + # train_xvec(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_dfr_from_wav.py b/hyperion/bin/finetune_xvector_dfr_from_wav.py new file mode 100755 index 00000000..9d745e67 --- /dev/null +++ b/hyperion/bin/finetune_xvector_dfr_from_wav.py @@ -0,0 +1,335 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import XVector as XVec +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorTrainerDeepFeatRegFromWav as Trainer +from hyperion.torch.utils import ddp, open_device + + +def init_data( + audio_path, + train_list, + val_list, + train_aug_cfg, + 
val_aug_cfg, + num_workers, + num_gpus, + rank, + **kwargs +): + ad_args = AD.filter_args(**kwargs) + sampler_args = Sampler.filter_args(**kwargs) + if rank == 0: + logging.info("audio dataset args={}".format(ad_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") + + train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) + val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) + + if rank == 0: + logging.info("init samplers") + train_sampler = Sampler(train_data, **sampler_args) + val_sampler = Sampler(val_data, **sampler_args) + + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + + train_loader = torch.utils.data.DataLoader( + train_data, batch_sampler=train_sampler, **largs + ) + + test_loader = torch.utils.data.DataLoader( + val_data, batch_sampler=val_sampler, **largs + ) + + return train_loader, test_loader + + +def init_feats(rank, **kwargs): + feat_args = AF.filter_args(**kwargs["feats"]) + if rank == 0: + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=True, **feat_args) + if rank == 0: + logging.info("feat-extractor={}".format(feat_extractor)) + return feat_extractor + + +def init_xvector( + num_classes, in_model_path, prior_model_path, rank, train_mode, **kwargs +): + xvec_args = XVec.filter_finetune_args(**kwargs) + if rank == 0: + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes + model = TML.load(in_model_path) + model.rebuild_output_layer(**xvec_args) + if prior_model_path: + prior_model = TML.load(prior_model_path) + else: + prior_model = model.copy() + prior_model.freeze() + prior_model.eval() + if train_mode == "ft-embed-affine": + model.freeze_preembed_layers() + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model, prior_model + + +def train_xvec(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + train_mode = kwargs["train_mode"] + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader, test_loader = init_data(**kwargs) + feat_extractor = init_feats(**kwargs) + model, prior_model = init_xvector(train_loader.dataset.num_classes, **kwargs) + + trn_args = Trainer.filter_args(**kwargs) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + feat_extractor, + prior_model, + device=device, + metrics=metrics, + ddp=world_size > 1, + train_mode=train_mode, + **trn_args + ) + if args.resume: + trainer.load_last_checkpoint() + trainer.fit(train_loader, test_loader) + + ddp.ddp_cleanup() + + +# def train_xvec(audio_path, train_list, val_list, +# train_aug_cfg, val_aug_cfg, +# in_model_path, prior_model_path, +# reg_layers_enc, reg_layers_classif, +# reg_weight_enc, reg_weight_classif, reg_loss, +# num_gpus, resume, num_workers, +# train_mode, **kwargs): + +# set_float_cpu('float32') +# logging.info('initializing devices num_gpus={}'.format(num_gpus)) +# device = open_device(num_gpus=num_gpus) + +# ad_args = AD.filter_args(**kwargs) +# sampler_args = Sampler.filter_args(**kwargs) +# feat_args = 
AFF.filter_args(prefix='feats', **kwargs) +# mvn_args = MVN.filter_args(prefix='mvn', **kwargs) +# xvec_args = XVec.filter_finetune_args(**kwargs) +# opt_args = OF.filter_args(prefix='opt', **kwargs) +# lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) +# trn_args = Trainer.filter_args(**kwargs) +# logging.info('audio dataset args={}'.format(ad_args)) +# logging.info('sampler args={}'.format(sampler_args)) +# logging.info('feat args={}'.format(feat_args)) +# logging.info('mvn args={}'.format(mvn_args)) +# logging.info('xvector finetune args={}'.format(xvec_args)) +# logging.info('optimizer args={}'.format(opt_args)) +# logging.info('lr scheduler args={}'.format(lrsch_args)) +# logging.info('trainer args={}'.format(trn_args)) + +# logging.info('initializing feature extractor args={}'.format(feat_args)) +# feat_extractor = AFF.create(**feat_args) +# mvn = None +# if mvn_args['norm_mean'] or mvn_args['norm_var']: +# logging.info('initializing short-time mvn') +# mvn = MVN(**mvn_args) + +# feat_extractor = FeatExtractor(feat_extractor, mvn) + +# logging.info('init datasets') +# train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) +# val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) + +# logging.info('init samplers') +# train_sampler = Sampler(train_data, **sampler_args) +# val_sampler = Sampler(val_data, **sampler_args) + +# largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + +# train_loader = torch.utils.data.DataLoader( +# train_data, batch_sampler = train_sampler, **largs) + +# test_loader = torch.utils.data.DataLoader( +# val_data, batch_sampler = val_sampler, **largs) + +# xvec_args['num_classes'] = train_data.num_classes +# model = TML.load(in_model_path) +# model.rebuild_output_layer(**xvec_args) +# if prior_model_path: +# prior_model = TML.load(prior_model_path) +# else: +# prior_model = model.copy() +# prior_model.freeze() +# prior_model.eval() +# if train_mode == 'ft-embed-affine': +# model.freeze_preembed_layers() +# logging.info(str(model)) + +# optimizer = OF.create(model.parameters(), **opt_args) +# lr_sch = LRSF.create(optimizer, **lrsch_args) +# metrics = { 'acc': CategoricalAccuracy() } + +# if reg_loss == 'l1': +# reg_loss = nn.L1Loss() +# else: +# reg_loss = nn.MSELoss() + +# trainer = Trainer(model, feat_extractor, prior_model, optimizer, +# reg_layers_enc=reg_layers_enc, reg_layers_classif=reg_layers_classif, +# reg_weight_enc=reg_weight_enc, reg_weight_classif=reg_weight_classif, +# reg_loss=reg_loss, +# device=device, metrics=metrics, lr_scheduler=lr_sch, +# data_parallel=(num_gpus>1), train_mode=train_mode, +# **trn_args) +# if resume: +# trainer.load_last_checkpoint() +# trainer.fit(train_loader, test_loader) + + +def main(): + parser = ArgumentParser( + description=( + "Fine-tune x-vector model with deep feature loss " + "regularization from audio files" + ) + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--audio-path", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--val-list", dest="val_list", required=True) + + AD.add_argparse_args(parser) + Sampler.add_argparse_args(parser) + + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + + parser.add_argument("--train-aug-cfg", default=None) + parser.add_argument("--val-aug-cfg", default=None) + + AF.add_class_args(parser, prefix="feats") + + # AFF.add_argparse_args(parser, prefix='feats') + 
# MVN.add_argparse_args(parser, prefix='mvn') + + # parser.add_argument('--reg-layers-enc', type=int, default=None, nargs='+', + # help='list of layers from the encoder nnet to use for regularization ') + # parser.add_argument('--reg-layers-classif', type=int, default=None, nargs='+', + # help='list of layers from the classif nnet to use for regularization ') + # parser.add_argument('--reg-weight-enc', type=float, default=0.1, + # help='weight for regularization from enc layers') + # parser.add_argument('--reg-weight-classif', type=float, default=0.1, + # help='weight for regularization from classif layers') + # parser.add_argument('--reg-loss', default='l1', + # choices=['l1', 'mse'], + # help=('type of regularization loss')) + + parser.add_argument("--in-model-path", required=True) + parser.add_argument("--prior-model-path") + + XVec.add_finetune_args(parser) + Trainer.add_class_args(parser) + ddp.add_ddp_args(parser) + + # parser.add_argument('--num-gpus', type=int, default=1, + # help='number of gpus, if 0 it uses cpu') + parser.add_argument( + "--seed", type=int, default=1123581321, help="random seed" + ) + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "--train-mode", + default="ft-embed-affine", + choices=["ft-full", "ft-embed-affine"], + help=( + "ft-full: adapt full x-vector network, " + "ft-embed-affine: adapt affine transform before embedding" + ), + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument("--local_rank", default=0, type=int) + + args = parser.parse_args() + gpu_id = args.local_rank + del args.local_rank + + if gpu_id == 0: + try: + config_file = Path(args.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args) + + # config_logger(args.verbose) + # del args.verbose + # logging.debug(args) + + # torch.manual_seed(args.seed) + # del args.seed + + # train_xvec(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_from_feats.py b/hyperion/bin/finetune_xvector_from_feats.py new file mode 100755 index 00000000..01e0c778 --- /dev/null +++ b/hyperion/bin/finetune_xvector_from_feats.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import numpy as np +import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import FeatSeqDataset as SD +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import XVector as XVec +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp, open_device + + +def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): + sd_args = SD.filter_args(**kwargs) + sampler_args = Sampler.filter_args(**kwargs) + if rank == 0: + logging.info("audio dataset 
args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") + + train_data = SD(data_rspec, train_list, **sd_args) + val_data = SD(data_rspec, val_list, is_val=True, **sd_args) + if rank == 0: + logging.info("init samplers") + train_sampler = Sampler(train_data, **sampler_args) + val_sampler = Sampler(val_data, **sampler_args) + + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + + train_loader = torch.utils.data.DataLoader( + train_data, batch_sampler=train_sampler, **largs + ) + + test_loader = torch.utils.data.DataLoader( + val_data, batch_sampler=val_sampler, **largs + ) + + return train_loader, test_loader + + +def init_xvector(num_classes, in_model_path, rank, train_mode, **kwargs): + xvec_args = XVec.filter_finetune_args(**kwargs) + if rank == 0: + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes + model = TML.load(in_model_path) + model.rebuild_output_layer(**xvec_args) + if train_mode == "ft-embed-affine": + model.freeze_preembed_layers() + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model + + +def train_xvec(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + train_mode = kwargs["train_mode"] + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + train_loader, test_loader = init_data(**kwargs) + model = init_xvector(train_loader.dataset.num_classes, **kwargs) + + trn_args = Trainer.filter_args(**kwargs) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + train_mode=train_mode, + **trn_args + ) + if args.resume: + trainer.load_last_checkpoint() + trainer.fit(train_loader, test_loader) + + ddp.ddp_cleanup() + + +# (data_rspec, train_list, val_list, in_model_path, +# num_gpus, resume, num_workers, train_mode, **kwargs): + +# set_float_cpu('float32') +# logging.info('initializing devices num_gpus={}'.format(num_gpus)) +# device = open_device(num_gpus=num_gpus) + +# sd_args = SD.filter_args(**kwargs) +# sampler_args = Sampler.filter_args(**kwargs) +# xvec_args = XVec.filter_finetune_args(**kwargs) +# opt_args = OF.filter_args(prefix='opt', **kwargs) +# lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) +# trn_args = Trainer.filter_args(**kwargs) +# logging.info('seq dataset args={}'.format(sd_args)) +# logging.info('sampler args={}'.format(sampler_args)) +# logging.info('xvector finetune args={}'.format(xvec_args)) +# logging.info('optimizer args={}'.format(opt_args)) +# logging.info('lr scheduler args={}'.format(lrsch_args)) +# logging.info('trainer args={}'.format(trn_args)) + +# logging.info('init datasets') +# train_data = SD(data_rspec, train_list, **sd_args) +# val_data = SD(data_rspec, val_list, is_val=True, **sd_args) + +# logging.info('init samplers') +# train_sampler = Sampler(train_data, **sampler_args) +# val_sampler = Sampler(val_data, **sampler_args) + +# largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + +# train_loader = torch.utils.data.DataLoader( +# train_data, batch_sampler = train_sampler, **largs) + +# test_loader = 
torch.utils.data.DataLoader( +# val_data, batch_sampler = val_sampler, **largs) + +# xvec_args['num_classes'] = train_data.num_classes +# model = TML.load(in_model_path) +# model.rebuild_output_layer(**xvec_args) +# if train_mode == 'ft-embed-affine': +# model.freeze_preembed_layers() +# logging.info(str(model)) + +# optimizer = OF.create(model.parameters(), **opt_args) +# lr_sch = LRSF.create(optimizer, **lrsch_args) +# metrics = { 'acc': CategoricalAccuracy() } + +# trainer = Trainer(model, optimizer, +# device=device, metrics=metrics, lr_scheduler=lr_sch, +# data_parallel=(num_gpus>1), train_mode=train_mode, +# **trn_args) +# if resume: +# trainer.load_last_checkpoint() +# trainer.fit(train_loader, test_loader) + + +def main(): + parser = ArgumentParser(description="Fine-tune x-vector model") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) + + SD.add_argparse_args(parser) + Sampler.add_argparse_args(parser) + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + parser.add_argument("--in-model-path", required=True) + XVec.add_finetune_args(parser) + Trainer.add_class_args(parser) + ddp.add_ddp_args(parser) + + # parser.add_argument('--num-gpus', type=int, default=1, + # help='number of gpus, if 0 it uses cpu') + parser.add_argument( + "--seed", type=int, default=1123581321, help="random seed" + ) + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "--train-mode", + default="ft-embed-affine", + choices=["ft-full", "ft-embed-affine"], + help=( + "ft-full: adapt full x-vector network, " + "ft-embed-affine: adapt affine transform before embedding" + ), + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument("--local_rank", default=0, type=int) + + args = parser.parse_args() + gpu_id = args.local_rank + del args.local_rank + + if gpu_id == 0: + try: + config_file = Path(args.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args) + + # args = parser.parse_args() + # config_logger(args.verbose) + # del args.verbose + # logging.debug(args) + + # torch.manual_seed(args.seed) + # del args.seed + + # train_xvec(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py new file mode 100755 index 00000000..2c884d0b --- /dev/null +++ b/hyperion/bin/finetune_xvector_from_wav.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy 
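+# the x-vector backbones imported below are selected at run time via the CLI subcommand (see xvec_dict)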
+from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import ResNet1dXVector as R1dXVec +from hyperion.torch.models import ResNetXVector as RXVec +from hyperion.torch.models import SpineNetXVector as SpineXVec +from hyperion.torch.models import TDNNXVector as TDXVec +from hyperion.torch.models import TransformerXVectorV1 as TFXVec +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer +from hyperion.torch.utils import ddp + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + "efficientnet": EXVec, + "tdnn": TDXVec, + "transformer": TFXVec, + "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_feats(rank, **kwargs): + feat_args = AF.filter_args(**kwargs["feats"]) + if rank == 0: + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=True, **feat_args) + if rank == 0: + logging.info("feat-extractor={}".format(feat_extractor)) + return feat_extractor + + +def init_xvector(num_classes, in_model_file, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_finetune_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes + model = TML.load(in_model_file) + model.change_config(**xvec_args) + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model + + +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + try: + hard_prototype_mining = train_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + if rank == 0: + logging.info("setting hard prototypes") + + affinity_matrix = model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + try: + hard_prototype_mining = val_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + +def train_xvec(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = 
init_data(partition="val", **kwargs) + feat_extractor = init_feats(**kwargs) + model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) + init_hard_prototype_mining(model, train_loader, val_loader, rank) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + feat_extractor, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + AF.add_class_args(parser, prefix="feats") + xvec_class.add_finetune_args(parser, prefix="model") + parser.add_argument("--in-model-file", required=True) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +def main(): + parser = ArgumentParser(description="Fine-tune x-vector model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py new file mode 100755 index 00000000..6f36e3d3 --- /dev/null +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -0,0 +1,411 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" 
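+# Samples an adversarial attack per utterance from RandomAttackFactory, perturbs the
+# waveform to flip the x-vector classifier decision, and saves the attacked audio
+# together with per-utterance attack statistics.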
+import logging +import os +import sys +import time +from pathlib import Path + +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import AudioWriter as AW +from hyperion.io import RandomAccessAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.adv_attacks import RandomAttackFactory +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm +from hyperion.utils import TrialNdx, Utt2Info + + +def read_utt_list(list_file, class2int_file, part_idx, num_parts): + logging.info("reading utt list %s", list_file) + utt_list = Utt2Info.load(list_file) + utt_list = utt_list.split(part_idx, num_parts) + logging.info("reading class2int-file %s", class2int_file) + class_info = pd.read_csv(class2int_file, header=None, sep=" ") + class2idx = {str(k): i for i, k in enumerate(class_info[0])} + class_idx = np.array([class2idx[k] for k in utt_list.info], dtype=int) + keys = utt_list.key + class_names = utt_list.info + return keys, class_names, class_idx + + +class MyModel(nn.Module): + def __init__(self, feat_extractor, xvector_model): + super().__init__() + self.feat_extractor = feat_extractor + self.xvector_model = xvector_model + self.vad = None + + def forward(self, s): + f, _ = self.feat_extractor(s) + if self.vad is not None: + n_vad_frames = len(self.vad) + n_feat_frames = f.shape[1] + if n_vad_frames > n_feat_frames: + self.vad = self.vad[:n_feat_frames] + elif n_vad_frames < n_feat_frames: + f = f[:, :n_vad_frames] + + f = f[:, self.vad] + + f = f.transpose(1, 2).contiguous() + score = self.xvector_model(f) + return score + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def init_model(model_path, **kwargs): + feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=False, **feat_args) + logging.info("feat-extractor={}".format(feat_extractor)) + + # feat_args = AFF.filter_args(prefix='feats', **kwargs) + # logging.info('initializing feature extractor args={}'.format(feat_args)) + # feat_extractor = AFF.create(**feat_args) + + # mvn_args = MVN.filter_args(prefix='mvn', **kwargs) + # mvn = None + # if mvn_args['norm_mean'] or mvn_args['norm_var']: + # logging.info('initializing short-time mvn args={}'.format(mvn_args)) + # mvn = MVN(**mvn_args) + + logging.info("loading model {}".format(model_path)) + xvector_model = TML.load(model_path) + xvector_model.freeze() + logging.info("xvector-model={}".format(xvector_model)) + + model = MyModel(feat_extractor, xvector_model) + model.eval() + return model + + +def init_attack_factory(wav_scale=1, **kwargs): + attacks_args = RandomAttackFactory.filter_args(**kwargs["attacks"]) + extra_args = { + "eps_scale": wav_scale, + "range_min": -wav_scale, + "range_max": wav_scale, + "loss": nn.functional.cross_entropy, + "time_dim": 1, + } + attacks_args.update(extra_args) + + logging.info("attacks args={}".format(attacks_args)) + attack_factory = 
RandomAttackFactory(**attacks_args) + return attack_factory + + +def select_random_chunk(key, s, fs, min_utt_length, max_utt_length): + utt_length = torch.randint( + low=min_utt_length * fs, high=max_utt_length * fs + 1, size=(1,) + ).item() + if utt_length < len(s): + first_sample = torch.randint(low=0, high=len(s) - utt_length, size=(1,)).item() + s = s[first_sample : first_sample + utt_length] + logging.info( + "extract-random-utt %s of length=%d first-sample=%d" + % (key, len(s), first_sample) + ) + return s + + +def generate_attacks( + wav_file, + list_file, + vad_spec, + vad_path_prefix, + class2int_file, + model_path, + output_wav_dir, + attack_info_file, + attack_tag, + random_utt_length, + min_utt_length, + max_utt_length, + random_seed, + p_attack, + save_failed, + save_benign, + use_gpu, + part_idx, + num_parts, + **kwargs +): + device = init_device(use_gpu) + model = init_model(model_path, **kwargs) + model.to(device) + + logging.info("opening audio read stream: %s", wav_file) + audio_args = AR.filter_args(**kwargs) + audio_reader = AR(wav_file, **audio_args) + wav_scale = audio_reader.wav_scale + + logging.info("opening audio write stream: %s", output_wav_dir) + audio_writer = AW(output_wav_dir, audio_format="flac") + + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + keys, class_names, class_ids = read_utt_list( + list_file, class2int_file, part_idx, num_parts + ) + + attack_factory = init_attack_factory(**kwargs) + attacks_info = {} + + for i in range(len(keys)): + key = keys[i] + class_id = class_ids[i] + + t1 = time.time() + logging.info("reading utt %s" % (key)) + s, fs = audio_reader.read([key]) + s = s[0] + fs = fs[0] + + torch.manual_seed(random_seed + len(s)) # this is to make results reproducible + p = torch.rand(1).item() + if p > p_attack: + logging.info("skipping attack for utt %s", key) + continue + + if random_utt_length: + s = select_random_chunk(key, s, fs, min_utt_length, max_utt_length) + + if save_benign: + s_benign = s + + s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) + target = torch.as_tensor([class_id], dtype=torch.long).to(device) + if vad_spec is not None: + vad = v_reader.read([key])[0] + tot_frames = len(vad) + speech_frames = np.sum(vad) + vad = torch.as_tensor(vad.astype(bool, copy=False), dtype=torch.bool).to( + device + ) + model.vad = vad + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % (key, speech_frames, tot_frames, speech_frames / tot_frames * 100,) + ) + + t2 = time.time() + with torch.no_grad(): + score_benign = model(s) + + _, pred = torch.max(score_benign, dim=1) + if pred[0] != class_id: + logging.info("utt %s failed benign classification, skipping...", key) + continue + + t3 = time.time() + attack = attack_factory.sample_attack(model) + attack_info = attack.attack_info + s_adv = attack.generate(s, target).detach() + t4 = time.time() + with torch.no_grad(): + score_adv = model(s_adv) + t5 = time.time() + + _, pred = torch.max(score_adv, dim=1) + success = False + if pred[0] != class_id: + success = True + + if success or save_failed: + key_attack = "%s-%s" % (key, attack_tag) + logging.info("utt %s attack successful" % (key)) + + stats_ij = compute_stats_adv_attack(s, s_adv) + stats_ij = [float(stat.detach().cpu().numpy()[0]) for stat in stats_ij] + + s_adv = s_adv.cpu().numpy()[0] + wav_attack = audio_writer.write(key_attack, s_adv, fs)[0] + if save_benign: + key_benign = "%s-benign" % 
(key_attack)
+                wav_benign = audio_writer.write(key_benign, s_benign, fs)[0]
+            else:
+                key_benign = key
+                wav_benign = ""
+
+            attack_info.update(
+                {
+                    "attack_tag": attack_tag,
+                    "wav_path": wav_attack,
+                    "class_name": class_names[i],
+                    "class_id": int(class_id),
+                    "key_benign": key_benign,
+                    "wav_benign": wav_benign,
+                    "snr": stats_ij[0],
+                    "px": stats_ij[1],
+                    "pn": stats_ij[2],
+                    "x_l2": stats_ij[3],
+                    "x_linf": stats_ij[4],
+                    "n_l0": stats_ij[5],
+                    "n_l2": stats_ij[6],
+                    "n_linf": stats_ij[7],
+                    "num_samples": s.shape[-1],
+                    "success": success,
+                }
+            )
+            attacks_info[key_attack] = attack_info
+
+        else:
+            logging.info("utt %s attack failed, skipping..." % (key))
+
+        t6 = time.time()
+        logging.info(
+            (
+                "utt %s total-time=%.3f read-time=%.3f "
+                "eval-benign-time=%.3f attack-time=%.3f eval-attack-time=%.3f "
+                "rt-factor=%.4f"
+            )
+            % (
+                key,
+                t6 - t1,
+                t2 - t1,
+                t3 - t2,
+                t4 - t3,
+                t5 - t4,
+                s.shape[1] / fs / (t6 - t1),
+            )
+        )
+
+    logging.info("saving attack info to %s" % (attack_info_file))
+    Path(attack_info_file).parent.mkdir(parents=True, exist_ok=True)
+
+    with open(attack_info_file, "w") as f:
+        # only dump the yaml if we recorded any attacks
+        if attacks_info:
+            yaml.dump(attacks_info, f, sort_keys=True)
+
+
+def main():
+    parser = ArgumentParser(
+        description="Generate attacks for speaker classification with x-vectors"
+    )
+
+    parser.add_argument("--cfg", action=ActionConfigFile)
+    parser.add_argument("--wav-file", required=True)
+    parser.add_argument("--list-file", required=True)
+    parser.add_argument("--class2int-file", required=True)
+    parser.add_argument("--attack-tag", required=True)
+
+    AR.add_class_args(parser)
+    AF.add_class_args(parser, prefix="feats")
+
+    parser.add_argument("--vad", dest="vad_spec", default=None)
+    parser.add_argument(
+        "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"),
+    )
+
+    parser.add_argument("--model-path", required=True)
+    parser.add_argument(
+        "--use-gpu", default=False, action="store_true", help="run the attack on gpu"
+    )
+
+    RandomAttackFactory.add_class_args(parser, prefix="attacks")
+
+    parser.add_argument("--part-idx", default=1, type=int, help=("part index"))
+    parser.add_argument(
+        "--num-parts",
+        default=1,
+        type=int,
+        help=(
+            "number of parts in which we divide the list "
+            "to run evaluation in parallel"
+        ),
+    )
+
+    parser.add_argument(
+        "--output-wav-dir", default=None, help="output path of adv signals"
+    )
+    parser.add_argument(
+        "--attack-info-file",
+        default=None,
+        help="output file to save information about the generated attacks",
+    )
+    parser.add_argument(
+        "--random-seed", default=1234, type=int, help="random seed for pytorch"
+    )
+
+    parser.add_argument(
+        "--random-utt-length",
+        default=False,
+        action="store_true",
+        help="attacks a random chunk of the utterance instead of the full signal",
+    )
+    parser.add_argument(
+        "--min-utt-length",
+        type=int,
+        default=5,
+        help=("minimum utterance length (in secs) when using random utt length"),
+    )
+    parser.add_argument(
+        "--max-utt-length",
+        type=int,
+        default=120,
+        help=("maximum utterance length (in secs) when using random utt length"),
+    )
+
+    parser.add_argument(
+        "--p-attack",
+        type=float,
+        default=1,
+        help=("probability of generating an attack for a given utterance"),
+    )
+    parser.add_argument(
+        "--save-failed",
+        default=False,
+        action="store_true",
+        help=("also save failed attacks"),
+    )
+    parser.add_argument(
+        "--save-benign",
+        default=False,
+        action="store_true",
+        help=("save a copy of the benign sample"),
+    )
+
+    parser.add_argument(
+ "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + generate_attacks(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/generate_adv_attacks_xvector_verif.py b/hyperion/bin/generate_adv_attacks_xvector_verif.py new file mode 100755 index 00000000..ae78ea5b --- /dev/null +++ b/hyperion/bin/generate_adv_attacks_xvector_verif.py @@ -0,0 +1,448 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +import sys +import time +from pathlib import Path + +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import AudioWriter as AW +from hyperion.io import RandomAccessAudioReader as AR +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.adv_attacks import RandomAttackFactory +from hyperion.torch.layers import LinBinCalibrator as Calibrator +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm +from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info +from hyperion.utils.list_utils import ismember + + +class MyModel(nn.Module): + def __init__( + self, feat_extractor, xvector_model, embed_layer=None, calibrator=None, sigma=0 + ): + super().__init__() + self.feat_extractor = feat_extractor + self.xvector_model = xvector_model + self.x_e = None + self.vad_t = None + self.embed_layer = embed_layer + self.calibrator = calibrator + self.sigma = sigma + + def forward(self, s_t): + # print('sigma0=', self.sigma) + if self.sigma > 0: + s_t = s_t + self.sigma * torch.randn_like(s_t) + # print('sigma1=', self.sigma) + f_t = self.feat_extractor(s_t) + if self.vad_t is not None: + n_vad_frames = len(self.vad_t) + n_feat_frames = f_t.shape[1] + if n_vad_frames > n_feat_frames: + self.vad_t = self.vad_t[:n_feat_frames] + elif n_vad_frames < n_feat_frames: + f_t = f_t[:, :n_vad_frames] + + f_t = f_t[:, self.vad_t] + + f_t = f_t.transpose(1, 2).contiguous() + x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer) + x_t = l2_norm(x_t) + x_e = l2_norm(self.x_e) + score = torch.sum(x_e * x_t, dim=-1) + if self.calibrator is not None: + score = self.calibrator(score) + + return score + + +def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): + r = DRF.create(v_file) + enroll = Utt2Info.load(enroll_file) + key = TrialKey.load(key_file) + if num_seg_parts > 1: + key = key.split(1, 1, seg_part_idx, num_seg_parts) + + x_e = r.read(enroll.key, squeeze=True) + f, idx = ismember(key.model_set, enroll.info) + + assert np.all(f) + x_e = x_e[idx] + + return key, x_e + + +def init_model(model_path, embed_layer, cal_file, threshold, **kwargs): + feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=False, 
**feat_args)
+    logging.info("feat-extractor={}".format(feat_extractor))
+
+    logging.info("loading model {}".format(model_path))
+    xvector_model = TML.load(model_path)
+    xvector_model.freeze()
+    logging.info("xvector-model={}".format(xvector_model))
+
+    calibrator = None
+    if cal_file is not None:
+        logging.info("loading calibration params {}".format(cal_file))
+        lr = LR.load(cal_file)
+        # subtracting the threshold here will put the decision threshold at 0
+        # some attacks use thr=0 to decide if the attack is successful
+        calibrator = Calibrator(lr.A[0, 0], lr.b[0] - threshold)
+
+    model = MyModel(feat_extractor, xvector_model, embed_layer, calibrator)
+    model.eval()
+    return model
+
+
+def init_attack_factory(wav_scale=1, **kwargs):
+    attacks_args = RandomAttackFactory.filter_args(**kwargs["attacks"])
+    extra_args = {
+        "eps_scale": wav_scale,
+        "range_min": -wav_scale,
+        "range_max": wav_scale,
+        "loss": nn.functional.binary_cross_entropy_with_logits,
+        "time_dim": 1,
+    }
+    attacks_args.update(extra_args)
+
+    logging.info("attacks args={}".format(attacks_args))
+    attack_factory = RandomAttackFactory(**attacks_args)
+    return attack_factory
+
+
+def init_device(use_gpu):
+    set_float_cpu("float32")
+    num_gpus = 1 if use_gpu else 0
+    logging.info("initializing devices num_gpus={}".format(num_gpus))
+    device = open_device(num_gpus=num_gpus)
+    return device
+
+
+def skip_attack(is_target, p_tar_attack, p_non_attack):
+    p = torch.rand(1).item()
+    if is_target:
+        if p > p_tar_attack:
+            return True
+    else:
+        if p > p_non_attack:
+            return True
+
+    return False
+
+
+def generate_attacks(
+    v_file,
+    key_file,
+    enroll_file,
+    test_wav_file,
+    vad_spec,
+    vad_path_prefix,
+    model_path,
+    embed_layer,
+    cal_file,
+    threshold,
+    output_wav_dir,
+    attack_info_file,
+    attack_tag,
+    p_tar_attack,
+    p_non_attack,
+    save_failed,
+    use_gpu,
+    seg_part_idx,
+    num_seg_parts,
+    random_seed,
+    **kwargs
+):
+    device = init_device(use_gpu)
+    model = init_model(model_path, embed_layer, cal_file, threshold, **kwargs)
+    model.to(device)
+
+    tar = torch.as_tensor([1], dtype=torch.float).to(device)
+    non = torch.as_tensor([0], dtype=torch.float).to(device)
+
+    logging.info("loading key and enrollment x-vectors")
+    key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts)
+    x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype())
+
+    logging.info("opening audio read stream: %s", test_wav_file)
+    audio_args = AR.filter_args(**kwargs)
+    audio_reader = AR(test_wav_file, **audio_args)
+    wav_scale = audio_reader.wav_scale
+    kwargs["wav_scale"] = wav_scale
+
+    logging.info("opening audio write stream: %s", output_wav_dir)
+    audio_writer = AW(output_wav_dir, audio_format="flac")
+
+    if vad_spec is not None:
+        logging.info("opening VAD stream: %s", vad_spec)
+        v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix)
+
+    attack_factory = init_attack_factory(**kwargs)
+    attacks_info = {}
+
+    for j in range(key.num_tests):
+        t1 = time.time()
+        logging.info("scoring test utt %s", key.seg_set[j])
+        s, fs = audio_reader.read([key.seg_set[j]])
+        s = s[0]
+        fs = fs[0]
+        torch.manual_seed(random_seed + len(s))  # this is to make results reproducible
+        s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device)
+
+        if vad_spec is not None:
+            vad = v_reader.read([key.seg_set[j]])[0]
+            tot_frames = len(vad)
+            speech_frames = np.sum(vad)
+            vad = torch.as_tensor(vad.astype(bool, copy=False), dtype=torch.bool).to(
+                device
+            )
+            model.vad_t = vad
+            logging.info(
+                "utt %s detected %d/%d (%.2f %%) speech
frames", + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) + + t2 = time.time() + + trial_time = 0 + num_trials = 0 + for i in range(key.num_models): + trial_id = "%s-%s" % (key.model_set[i], key.seg_set[j]) + if key.tar[i, j] or key.non[i, j]: + t3 = time.time() + if skip_attack(key.tar[i, j], p_tar_attack, p_non_attack): + logging.info("skipping attack for tar trial %s", trial_id) + continue + + model.x_e = x_e[i : i + 1].to(device) + with torch.no_grad(): + score_benign = model(s) + + if key.tar[i, j] and score_benign < 0: + logging.info( + "target trial %s failed benign classification, skipping...", + trial_id, + ) + continue + elif key.non[i, j] and score_benign > 0: + logging.info( + "non-target trial %s failed benign classification, skipping...", + trial_id, + ) + continue + + attack = attack_factory.sample_attack(model) + if key.tar[i, j]: + t = non if attack.targeted else tar + else: + t = tar if attack.targeted else non + + attack_info = attack.attack_info + s_adv = attack.generate(s, t).detach() + with torch.no_grad(): + # we add the threshold back here to make sure the scores are well calibrated + score_adv = model(s_adv) + + t4 = time.time() + trial_time += t4 - t3 + num_trials += 1 + success = True + if key.tar[i, j] and score_adv > 0: + success = False + if not save_failed: + logging.info( + "attack on target trial %s failed, skipping...", trial_id + ) + continue + elif key.non[i, j] and score_adv < 0: + success = False + if not save_failed: + logging.info( + "attack on non-target trial %s failed benign classification, skipping...", + trial_id, + ) + continue + if success: + logging.info("attack on trial %s successful", trial_id) + + stats_ij = compute_stats_adv_attack(s, s_adv) + stats_ij = [float(stat.detach().cpu().numpy()[0]) for stat in stats_ij] + + s_adv = s_adv.cpu().numpy()[0] + key_attack = "%s-%s" % (trial_id, attack_tag) + output_wav = audio_writer.write(key_attack, s_adv, fs) + + attack_info.update( + { + "attack_tag": attack_tag, + "wav_path": output_wav[0], + "class_name": "target" if key.tar[i, j] else "non-target", + "class_id": int(key.tar[i, j]), + "key_benign": trial_id, + "enroll": str(key.model_set[i]), + "test_benign": str(key.seg_set[j]), + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": s.shape[-1], + "success": success, + } + ) + attacks_info[key_attack] = attack_info + + if num_trials > 0: + trial_time /= num_trials + t7 = time.time() + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.4f" + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + num_trials * len(s) / fs / (t7 - t1), + ) + + logging.info("saving attack info to %s", attack_info_file) + Path(attack_info_file).parent.mkdir(parents=True, exist_ok=True) + + with open(attack_info_file, "w") as f: + # only save if we have successful attacks + if attacks_info: + yaml.dump(attacks_info, f, sort_keys=True) + + +def main(): + parser = ArgumentParser( + description="Generate Attacks for speaker verification with x-vectors+cos+calibration" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--test-wav-file", required=True) + 
parser.add_argument("--attack-tag", required=True) + + AR.add_class_args(parser) + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + + parser.add_argument("--cal-file", default=None, help="score calibration file") + parser.add_argument("--threshold", default=0, type=float, help="decision threshold") + + RandomAttackFactory.add_class_args(parser, prefix="attacks") + + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) + parser.add_argument( + "--num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument( + "--output-wav-dir", default=None, help="output path of adv signals" + ) + parser.add_argument( + "--attack-info-file", + default=None, + help="output path of to save information about the generated attacks", + ) + parser.add_argument( + "--random-seed", default=1234, type=int, help="random seed for pytorch" + ) + + parser.add_argument( + "--p-tar-attack", + type=float, + default=1, + help=("probability of generating an attack for a target trial"), + ) + parser.add_argument( + "--p-non-attack", + type=float, + default=1, + help=("probability of generating an attack for a non-target trial"), + ) + parser.add_argument( + "--save-failed", + default=False, + action="store_true", + help=("save failed attacks also"), + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + generate_attacks(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py new file mode 100755 index 00000000..f5db8ada --- /dev/null +++ b/hyperion/bin/hyperion_dataset.py @@ -0,0 +1,629 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path +from typing import List, Optional, Union + +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.utils import ( + ClassInfo, + EnrollmentMap, + FeatureSet, + HypDataset, + InfoTable, + PathLike, + RecordingSet, + SegmentSet, +) + +subcommand_list = [ + "add_features", + "set_recordings", + "make_from_recordings", + "remove_short_segments", + "rebuild_class_idx", + "remove_classes_few_segments", + "remove_classes_few_toomany_segments", + "split_train_val", + "copy", + "add_cols_to_segments", + "merge", + "from_lhotse", + "from_kaldi", +] + + +def add_common_args(parser): + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + ) + + +def make_add_features_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", 
action=ActionConfigFile)
+    parser.add_argument(
+        "--dataset", required=True, help="""dataset dir or .yaml file"""
+    )
+    parser.add_argument(
+        "--features-name", required=True, help="""name of the feature"""
+    )
+    parser.add_argument("--features-file", required=True, help="""feature set file""")
+    parser.add_argument(
+        "--output-dataset",
+        default=None,
+        help="""output dataset dir, if None, we use the same as input""",
+    )
+
+    add_common_args(parser)
+    return parser
+
+
+def add_features(
+    dataset: PathLike,
+    features_name: str,
+    features_file: PathLike,
+    output_dataset: PathLike,
+):
+    if output_dataset is None:
+        output_dataset = dataset
+
+    dataset = HypDataset.load(dataset, lazy=True)
+    dataset.add_features(features_name, features_file)
+    dataset.save(output_dataset)
+
+
+def make_set_recordings_parser():
+    parser = ArgumentParser()
+    parser.add_argument("--cfg", action=ActionConfigFile)
+    parser.add_argument(
+        "--dataset", required=True, help="""dataset dir or .yaml file"""
+    )
+    parser.add_argument(
+        "--recordings-file", required=True, help="""recordings set file"""
+    )
+    parser.add_argument(
+        "--output-dataset",
+        default=None,
+        help="""output dataset dir, if None, we use the same as input""",
+    )
+    parser.add_argument(
+        "--remove-features",
+        default=None,
+        nargs="+",
+        help="""removes feature files from the dataset,
+        since they may be obsolete after modifying the recordings""",
+    )
+    parser.add_argument(
+        "--update-seg-durs",
+        default=False,
+        action=ActionYesNo,
+        help="""updates the durations in the segment table""",
+    )
+
+    add_common_args(parser)
+    return parser
+
+
+def set_recordings(
+    dataset: PathLike,
+    recordings_file: PathLike,
+    output_dataset: PathLike,
+    remove_features: List[str],
+    update_seg_durs: bool,
+):
+    if output_dataset is None:
+        output_dataset = dataset
+
+    dataset = HypDataset.load(dataset, lazy=True)
+    dataset.set_recordings(recordings_file, update_seg_durs)
+    if remove_features is not None:
+        for features_name in remove_features:
+            dataset.remove_features(features_name)
+
+    dataset.save(output_dataset)
+
+
+def make_make_from_recordings_parser():
+    parser = ArgumentParser()
+    parser.add_argument("--cfg", action=ActionConfigFile)
+    parser.add_argument(
+        "--dataset", required=True, help="""dataset dir or .yaml file"""
+    )
+    parser.add_argument(
+        "--recordings-file", required=True, help="""recordings set file"""
+    )
+
+    add_common_args(parser)
+    return parser
+
+
+def make_from_recordings(
+    dataset: PathLike,
+    recordings_file: PathLike,
+):
+    output_dataset = dataset
+    import pandas as pd
+
+    rec_df = pd.read_csv(recordings_file)
+    seg_df = rec_df[["id"]]
+    segments = SegmentSet(seg_df)
+    dataset = HypDataset(segments, recordings=recordings_file)
+    dataset.save(output_dataset)
+
+
+def make_remove_short_segments_parser():
+    parser = ArgumentParser()
+    parser.add_argument("--cfg", action=ActionConfigFile)
+    parser.add_argument(
+        "--dataset", required=True, help="""dataset dir or .yaml file"""
+    )
+    parser.add_argument(
+        "--min-length",
+        required=True,
+        type=float,
+        help="""minimum required length of the segment""",
+    )
+
+    parser.add_argument(
+        "--length-name",
+        default="duration",
+        help="""name of the column indicating the length of the segment""",
+    )
+    parser.add_argument(
+        "--output-dataset",
+        default=None,
+        help="""output dataset dir, if None, we use the same as input""",
+    )
+
+    add_common_args(parser)
+    return parser
+
+
+def remove_short_segments(
+    dataset: PathLike,
+    min_length: float,
+    length_name: str,
output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = HypDataset.load(dataset, lazy=True) + dataset.remove_short_segments(min_length, length_name) + dataset.save(output_dataset) + + +def make_rebuild_class_idx_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--class-name", required=True, help="""name of the class type e.g.: speaker""" + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def rebuild_class_idx( + dataset: PathLike, + class_name: str, + output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = HypDataset.load(dataset, lazy=True) + dataset.rebuild_class_idx(class_name) + dataset.save(output_dataset) + + +def make_remove_classes_few_segments_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--class-name", required=True, help="""name of the class type e.g.: speaker""" + ) + parser.add_argument( + "--min-segs", default=1, type=int, help="""min. num. of segments/class""" + ) + parser.add_argument( + "--rebuild-idx", + default=False, + action=ActionYesNo, + help="""regenerate class indexes from 0 to new_num_classes-1""", + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def remove_classes_few_segments( + dataset: PathLike, + class_name: str, + min_segs: int, + rebuild_idx: bool, + output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = HypDataset.load(dataset, lazy=True) + dataset.remove_classes_few_segments(class_name, min_segs, rebuild_idx) + dataset.save(output_dataset) + + +def make_remove_classes_few_toomany_segments_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--class-name", required=True, help="""name of the class type e.g.: speaker""" + ) + parser.add_argument( + "--min-segs", default=1, type=int, help="""min. num. of segments/class""" + ) + parser.add_argument( + "--max-segs", default=None, type=int, help="""max. num. 
of segments/class""" + ) + parser.add_argument( + "--rebuild-idx", + default=False, + action=ActionYesNo, + help="""regenerate class indexes from 0 to new_num_classes-1""", + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def remove_classes_few_toomany_segments( + dataset: PathLike, + class_name: str, + min_segs: int, + max_segs: Union[int, None], + rebuild_idx: bool, + output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = HypDataset.load(dataset, lazy=True) + dataset.remove_classes_few_toomany_segments( + class_name, min_segs, max_segs, rebuild_idx + ) + dataset.save(output_dataset) + + +def make_split_train_val_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""input dataset dir or .yaml file""" + ) + parser.add_argument( + "--val-prob", + default=0.05, + type=float, + help="""proportion of segments used for val""", + ) + parser.add_argument( + "--min-train-samples", + default=1, + type=int, + help="""min. number of training samples / class""", + ) + + parser.add_argument( + "--joint-classes", + default=None, + nargs="+", + help="""types of classes that need to have same classes in train and val""", + ) + parser.add_argument( + "--disjoint-classes", + default=None, + nargs="+", + help="""types of classes that need to have different classes in train and val""", + ) + parser.add_argument( + "--seed", + default=11235813, + type=int, + help="""random seed""", + ) + + parser.add_argument( + "--train-dataset", + required=True, + help="""output train dataset dir""", + ) + parser.add_argument( + "--val-dataset", + required=True, + help="""output val dataset dir""", + ) + + add_common_args(parser) + return parser + + +def split_train_val( + dataset: PathLike, + val_prob: float, + joint_classes: List[str], + disjoint_classes: List[str], + min_train_samples: int, + seed: int, + train_dataset: PathLike, + val_dataset: PathLike, +): + dataset = HypDataset.load(dataset, lazy=True) + train_ds, val_ds = dataset.split_train_val( + val_prob, joint_classes, disjoint_classes, min_train_samples, seed + ) + train_ds.save(train_dataset) + val_ds.save(val_dataset) + + num_total = len(dataset) + num_train = len(train_ds) + num_val = len(val_ds) + logging.info( + "train: %d (%.2f%%) segments, val: %d (%.2f%%) segments", + num_train, + num_train / num_total * 100, + num_val, + num_val / num_total * 100, + ) + + +def make_copy_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--output-dataset", + required=True, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def copy( + dataset: PathLike, + output_dataset: PathLike, +): + dataset = HypDataset.load(dataset, lazy=True) + dataset.save(output_dataset) + + +def make_add_cols_to_segments_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--right-table", required=True, help="table where the new data is" + ) + parser.add_argument( + "--column-names", + required=True, + nargs="+", + help="""columns 
to copy to segments table""", + ) + parser.add_argument( + "--on", + default=["id"], + nargs="+", + help="""columns to match both tables rows""", + ) + parser.add_argument( + "--right-on", + default=None, + nargs="+", + help="""columns to match both tables rows""", + ) + + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + parser.add_argument( + "--remove-missing", + default=False, + action=ActionYesNo, + help="remove dataset entries that don't have a value in the right table", + ) + + parser.add_argument( + "--create-class-info", + default=False, + action=ActionYesNo, + help="creates class-info tables for the new columns added to the dataset", + ) + + add_common_args(parser) + return parser + + +def add_cols_to_segments( + dataset: PathLike, + right_table: PathLike, + column_names: List[str], + on: List[str], + right_on: List[str], + output_dataset: PathLike, + remove_missing: bool = False, + create_class_info: bool = False, +): + if output_dataset is None: + output_dataset = dataset + + dataset = HypDataset.load(dataset, lazy=True) + dataset.add_cols_to_segments( + right_table, + column_names, + on, + right_on, + remove_missing=remove_missing, + create_class_info=create_class_info, + ) + dataset.save(output_dataset) + + +def make_merge_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--input-datasets", required=True, nargs="+", help="input datasets" + ) + add_common_args(parser) + return parser + + +def merge(dataset: PathLike, input_datasets: List[PathLike]): + input_dataset_paths = input_datasets + dataset_path = dataset + input_datasets = [] + for dset_file in input_dataset_paths: + input_datasets.append(HypDataset.load(dset_file)) + + dataset = HypDataset.merge(input_datasets) + dataset.save(dataset_path) + + +def make_from_lhotse_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--cuts-file", + default=None, + help="lhotse cuts file", + ) + parser.add_argument( + "--recordings-file", + default=None, + help="lhotse recordings set file", + ) + parser.add_argument( + "--supervisions-file", + default=None, + help="lhotse supervisions file", + ) + add_common_args(parser) + return parser + + +def from_lhotse( + dataset: PathLike, + cuts_file: Optional[PathLike] = None, + recordings_file: Optional[PathLike] = None, + supervisions_file: Optional[PathLike] = None, +): + + assert cuts_file is not None or supervisions_file is not None + dataset_path = dataset + dataset = HypDataset.from_lhotse( + cuts=cuts_file, recordings=recordings_file, supervisions=supervisions_file + ) + dataset.save(dataset_path) + + +def make_from_kaldi_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--kaldi-data-dir", + required=True, + help="Kaldi data directory", + ) + add_common_args(parser) + return parser + + +def from_kaldi( + dataset: PathLike, + kaldi_data_dir: PathLike, +): + + dataset_path = dataset + dataset = HypDataset.from_kaldi(kaldi_data_dir) + dataset.save(dataset_path) + + +def main(): + parser = ArgumentParser(description="Tool to 
manipulates the Hyperion dataset") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for subcommand in subcommand_list: + parser_func = f"make_{subcommand}_parser" + subparser = globals()[parser_func]() + subcommands.add_subcommand(subcommand, subparser) + + args = parser.parse_args() + subcommand = args.subcommand + kwargs = namespace_to_dict(args)[args.subcommand] + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + del kwargs["cfg"] + globals()[subcommand](**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/hyperion_tables.py b/hyperion/bin/hyperion_tables.py new file mode 100755 index 00000000..3f847d29 --- /dev/null +++ b/hyperion/bin/hyperion_tables.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path +from typing import List, Optional, Union + +import numpy as np +import pandas as pd +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.utils import ( + ClassInfo, + EnrollmentMap, + FeatureSet, + InfoTable, + PathLike, + RecordingSet, + SegmentSet, +) + +subcommand_list = ["cat", "filter", "make_class_file_from_column"] +table_dict = { + "segments": SegmentSet, + "recordings": RecordingSet, + "features": FeatureSet, + "classes": ClassInfo, + "enrollments": EnrollmentMap, + "generic": InfoTable, +} + + +def add_common_args(parser): + parser.add_argument( + "--table-type", + default="generic", + choices=list(table_dict.keys()), + help=f"Type of table in {list(table_dict.keys())}", + ) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + ) + + +def make_cat_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--input-files", default=None, nargs="+", help="optional list of input files" + ) + parser.add_argument( + "--output-file", + required=True, + help="""output file, if input-files is None, input files names are derived from it""", + ) + parser.add_argument( + "--num-tables", + default=0, + type=int, + help="""number of jobs we used to create the individual tables""", + ) + parser.add_argument( + "--base-idx", + default=1, + type=int, + help="""index of the first job, typically 0 or 1""", + ) + + add_common_args(parser) + return parser + + +def cat( + table_type: str, + input_files: Union[List[PathLike], None], + output_file: PathLike, + num_tables: int, + base_idx: int = 1, +): + assert input_files is not None or num_tables != 0 + output_file = Path(output_file) + if input_files is None: + ext = output_file.suffix + input_file_base = output_file.with_suffix("") + input_files = [] + for i in range(num_tables): + idx = base_idx + i + input_file_i = input_file_base.with_suffix(f".{idx}{ext}") + input_files.append(input_file_i) + + table_class = table_dict[table_type] + tables = [] + for file_path in input_files: + tables.append(table_class.load(file_path)) + + output_table = table_class.cat(tables) + output_table.save(output_file) + + +def make_filter_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input-file", required=True, help="input table file") + parser.add_argument( + "--filter-file", required=True, 
help="table file that we use as filter" + ) + parser.add_argument( + "--filter-by", default="id", help="column that we use to filter " + ) + parser.add_argument( + "--output-file", + required=True, + help="""output table file""", + ) + parser.add_argument( + "--raise-if-missing", + default=True, + action=ActionYesNo, + help="raise exception if filter values are not in input file", + ) + add_common_args(parser) + return parser + + +def filter( + table_type: str, + input_file: PathLike, + filter_file: PathLike, + output_file: PathLike, + filter_by: str, + raise_if_missing: bool, +): + + input_file = Path(input_file) + filter_file = Path(filter_file) + output_file = Path(output_file) + + table_class = table_dict[table_type] + input_table = table_class.load(input_file) + filter_table = table_class.load(filter_file) + output_table = input_table.filter( + items=filter_table[filter_by], by=filter_by, raise_if_missing=raise_if_missing + ) + output_table.save(output_file) + + +def make_make_class_file_from_column_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input-file", required=True, help="input table file") + + parser.add_argument( + "--column", + required=True, + help="column that we want to use to create a class-file", + ) + parser.add_argument( + "--output-file", + required=True, + help="""output class-file table""", + ) + + add_common_args(parser) + return parser + + +def make_class_file_from_column( + table_type: str, + input_file: PathLike, + output_file: PathLike, + column: str, +): + + input_file = Path(input_file) + output_file = Path(output_file) + + table_class = table_dict[table_type] + input_table = table_class.load(input_file) + class_ids = np.unique(input_table[column]) + df = pd.DataFrame({"id": class_ids}) + output_table = ClassInfo(df) + output_table.save(output_file) + + +def main(): + parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for subcommand in subcommand_list: + parser_func = f"make_{subcommand}_parser" + subparser = globals()[parser_func]() + subcommands.add_subcommand(subcommand, subparser) + + args = parser.parse_args() + subcommand = args.subcommand + kwargs = namespace_to_dict(args)[args.subcommand] + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + del kwargs["cfg"] + globals()[subcommand](**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/make-babble-noise-audio-files.py b/hyperion/bin/make-babble-noise-audio-files.py deleted file mode 100755 index 460f4044..00000000 --- a/hyperion/bin/make-babble-noise-audio-files.py +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2020 Jesus Villalba (Johns Hopkins University) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import sys -import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time -import logging - -import math -import numpy as np -from scipy import signal, ndimage - -from hyperion.hyp_defs import config_logger -from hyperion.utils import Utt2Info -from hyperion.io import RandomAccessAudioReader as AR -from hyperion.io import AudioWriter as Writer -from hyperion.io import VADReaderFactory as VRF - - -def make_noise(xs): - - lens = np.array([x.shape[0] for x in xs]) - max_len = np.max(lens) - num_tiles = np.ceil(max_len / lens) - for i in range(len(xs)): - xs[i] = 
np.tile(xs[i], int(num_tiles[i]))[:max_len] - - for i in range(1, len(xs)): - xs[0] += xs[i] - xs[i].mean() - - return xs[0] - - -def make_babble_noise_audio_files( - input_path, - output_path, - output_script, - write_time_durs_spec, - min_spks=3, - max_spks=7, - num_reuses=5, - random_seed=112358, - **kwargs -): - - input_args = AR.filter_args(**kwargs) - output_args = Writer.filter_args(**kwargs) - logging.info("input_args={}".format(input_args)) - logging.info("output_args={}".format(output_args)) - - rng = np.random.RandomState(seed=random_seed) - - if write_time_durs_spec is not None: - okeys = [] - info = [] - - count = 0 - t1 = time.time() - with AR(input_path, **input_args) as reader: - keys = reader.keys - with Writer(output_path, output_script, **output_args) as writer: - - for iters in range(num_reuses): - keys = rng.permutation(keys) - - cur_spks = min_spks - utt_list = [] - for utt_idx in range(len(keys)): - if len(utt_list) < cur_spks: - utt_list.append(keys[utt_idx]) - continue - - x, fs = reader.read(utt_list) - fs = fs[0] - y = make_noise(x) - babble_id = "babble-%05d" % (count) - logging.info("writing file % s" % (babble_id)) - writer.write([babble_id], [y], [fs]) - if write_time_durs_spec is not None: - okeys.append(babble_id) - info.append(y.shape[0] / fs) - - count += 1 - utt_list = [] - cur_spks += 1 - if cur_spks > max_spks: - cur_spks = min_spks - - if write_time_durs_spec is not None: - logging.info("writing time durations to %s" % (write_time_durs_spec)) - u2td = Utt2Info.create(okeys, info) - u2td.save(write_time_durs_spec) - - logging.info("finished making babble files, elapsed-time=%f" % (time.time() - t1)) - - -if __name__ == "__main__": - - parser = ArgumentParser(description="Creates babble noise by adding speech files") - - parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_path", required=True) - parser.add_argument("--output-path", required=True) - parser.add_argument("--output-script", required=True) - parser.add_argument("--write-time-durs", dest="write_time_durs_spec", default=None) - - AR.add_class_args(parser) - Writer.add_class_args(parser) - - parser.add_argument("--min-spks", default=3, type=int) - parser.add_argument("--max-spks", default=10, type=int) - parser.add_argument("--num-reuses", default=5, type=int) - parser.add_argument("--random-seed", default=112358, type=int) - parser.add_argument( - "-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int, - help="Verbose level", - ) - args = parser.parse_args() - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - make_babble_noise_audio_files(**namespace_to_dict(args)) diff --git a/hyperion/bin/make_babble_noise_audio_files.py b/hyperion/bin/make_babble_noise_audio_files.py new file mode 100755 index 00000000..43d6ab91 --- /dev/null +++ b/hyperion/bin/make_babble_noise_audio_files.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python +""" + Copyright 2020 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import math +import os +import sys +import time + +import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.io import AudioWriter as Writer +from hyperion.io import RandomAccessAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.utils import Utt2Info + + +def make_noise(xs, 
max_value): + lens = np.array([x.shape[0] for x in xs]) + max_len = np.max(lens) + num_tiles = np.ceil(max_len / lens) + for i in range(len(xs)): + xs[i] = np.tile(xs[i], int(num_tiles[i]))[:max_len] + + xs[0] -= xs[0].mean() + for i in range(1, len(xs)): + xs[0] += xs[i] - xs[i].mean() + + max_x = np.max(np.abs(xs[0])) + if max_x > max_value: + xs[0] *= max_value / max_x + + return xs[0] + + +def make_babble_noise_audio_files( + recordings_file, + output_path, + output_recordings_file, + write_time_durs, + min_spks=3, + max_spks=7, + num_reuses=5, + random_seed=112358, + **kwargs, +): + input_args = AR.filter_args(**kwargs) + output_args = Writer.filter_args(**kwargs) + logging.info(f"input_args={input_args}") + logging.info(f"output_args={output_args}") + + rng = np.random.default_rng(seed=random_seed) + + if write_time_durs is not None: + okeys = [] + info = [] + + count = 0 + t1 = time.time() + with AR(recordings_file, **input_args) as reader, Writer( + output_path, output_recordings_file, **output_args + ) as writer: + keys = reader.keys + for iters in range(num_reuses): + keys = rng.permutation(keys) + + cur_spks = min_spks + utt_list = [] + for utt_idx in range(len(keys)): + if len(utt_list) < cur_spks: + utt_list.append(keys[utt_idx]) + continue + + x, fs = reader.read(utt_list) + fs = fs[0] + y = make_noise(x, reader.wav_scale) + babble_id = "babble-%05d" % (count) + logging.info("writing file %s", babble_id) + writer.write([babble_id], [y], [fs]) + if write_time_durs is not None: + okeys.append(babble_id) + info.append(y.shape[0] / fs) + + count += 1 + utt_list = [] + cur_spks += 1 + if cur_spks > max_spks: + cur_spks = min_spks + + if write_time_durs is not None: + logging.info("writing time durations to %s", write_time_durs) + u2td = Utt2Info.create(okeys, info) + u2td.save(write_time_durs) + + logging.info("finished making babble files, elapsed-time=%f", time.time() - t1) + + +def main(): + parser = ArgumentParser(description="Creates babble noise by adding speech files") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--recordings-file", required=True) + parser.add_argument("--output-path", required=True) + parser.add_argument("--output-recordings-file", required=True) + parser.add_argument("--write-time-durs", default=None) + + AR.add_class_args(parser) + Writer.add_class_args(parser) + + parser.add_argument("--min-spks", default=3, type=int) + parser.add_argument("--max-spks", default=10, type=int) + parser.add_argument("--num-reuses", default=5, type=int) + parser.add_argument("--random-seed", default=112358, type=int) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + help="Verbose level", + ) + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + make_babble_noise_audio_files(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/make_wav2xvector.py b/hyperion/bin/make_wav2xvector.py new file mode 100755 index 00000000..b3a1a2d5 --- /dev/null +++ b/hyperion/bin/make_wav2xvector.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python +""" + Copyright 2023 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import 
config_logger
+
+# from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch import TorchModel
+
+# from hyperion.torch.models import SpineNetXVector as SpineXVec
+# from hyperion.torch.models import TDNNXVector as TDXVec
+# from hyperion.torch.models import TransformerXVectorV1 as TFXVec
+# from hyperion.torch.models import EfficientNetXVector as EXVec
+from hyperion.torch.models import ResNet1dXVector as R1dXVec
+from hyperion.torch.models import ResNetXVector as RXVec
+from hyperion.torch.models import Wav2ResNet1dXVector as W2R1dXVec
+from hyperion.torch.models import Wav2ResNetXVector as W2RXVec
+from hyperion.torch.narchs import AudioFeatsMVN as AF
+
+
+def init_feats(feats):
+    feat_args = AF.filter_args(**feats)
+    logging.info(f"feat args={feat_args}")
+    logging.info("initializing feature extractor")
+    feat_extractor = AF(trans=True, **feat_args)
+    logging.info(f"feat-extractor={feat_extractor}")
+    return feat_extractor
+
+
+def load_model(model_path):
+    logging.info("loading model %s", model_path)
+    model = TorchModel.auto_load(model_path)
+    logging.info(f"xvector-model={model}")
+    return model
+
+
+def make_wav2xvector(feats, xvector_path, output_path):
+    feats = init_feats(feats)
+    xvector_model = load_model(xvector_path)
+    if isinstance(xvector_model, RXVec):
+        model = W2RXVec(feats, xvector_model)
+    elif isinstance(xvector_model, R1dXVec):
+        model = W2R1dXVec(feats, xvector_model)
+    else:
+        raise TypeError(
+            f"Conversion of xvector class={xvector_model.__class__} not available"
+        )
+
+    logging.info("saving model of class %s to %s", model.__class__, output_path)
+    model.save(output_path)
+
+
+def main():
+    parser = ArgumentParser(
+        description="""Combines the feature extractor config with XVector model
+        to produce a Wav2XVector model with integrated feature extraction"""
+    )
+
+    parser.add_argument("--cfg", action=ActionConfigFile)
+    AF.add_class_args(parser, prefix="feats")
+    parser.add_argument("--xvector-path", required=True)
+    parser.add_argument("--output-path", required=True)
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+
+    args = parser.parse_args()
+    config_logger(args.verbose)
+    del args.verbose
+    del args.cfg
+    logging.debug(args)
+
+    make_wav2xvector(**namespace_to_dict(args))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hyperion/bin/merge_scores.py b/hyperion/bin/merge_scores.py
new file mode 100755
index 00000000..72ab6010
--- /dev/null
+++ b/hyperion/bin/merge_scores.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+from pathlib import Path
+
+from jsonargparse import (
+    ActionConfigFile,
+    ActionParser,
+    ArgumentParser,
+    namespace_to_dict,
+)
+
+from hyperion.hyp_defs import config_logger
+from hyperion.utils import TrialScores
+
+
+def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, base_idx):
+    output_file = Path(output_file)
+    output_file.parent.mkdir(exist_ok=True, parents=True)
+
+    ext = output_file.suffix
+
+    if input_files is None:
+        if ext in [".h5", ".csv", ".tsv"]:
+            input_file_base = output_file
+        else:
+            input_file_base = output_file.parent / (output_file.name + ".txt")
+            ext = ""
+
+        logging.info("merging %s* -> %s", input_file_base.with_suffix(""), output_file)
+        input_files = []
+        for i in range(num_enroll_parts):
+            idx_i = base_idx + i
+            for j in range(num_test_parts):
+                idx_j = base_idx + j
input_file_i = input_file_base.with_suffix(f".{idx_i}.{idx_j}{ext}")
+                input_files.append(input_file_i)
+    else:
+        logging.info("merging %s -> %s", " + ".join(input_files), output_file)
+
+    if ext == ".h5":
+        # if files are h5 we need to load everything in RAM
+        score_list = []
+        for score_file in input_files:
+            scores = TrialScores.load(score_file)
+            score_list.append(scores)
+
+        scores = TrialScores.merge(score_list)
+        scores.save(output_file)
+    else:
+        has_header = ext in [".csv", ".tsv"]
+        write_header = True
+        with open(output_file, "w", encoding="utf-8") as f_out:
+            for score_file in input_files:
+                with open(score_file) as f_in:
+                    for i, line in enumerate(f_in):
+                        if i == 0 and has_header and not write_header:
+                            continue
+                        f_out.write(line)
+                write_header = False
+
+
+def main():
+    parser = ArgumentParser(description="Tool to merge score files produced by parallel jobs")
+    parser.add_argument("--cfg", action=ActionConfigFile)
+    parser.add_argument(
+        "--input-files", default=None, nargs="+", help="optional list of input files"
+    )
+    parser.add_argument(
+        "--output-file",
+        required=True,
+        help="""output file, if input-files is None, input file names are derived from it""",
+    )
+    parser.add_argument(
+        "--num-enroll-parts",
+        default=1,
+        type=int,
+        help="""number of parts into which we divided the enrollment set""",
+    )
+    parser.add_argument(
+        "--num-test-parts",
+        default=1,
+        type=int,
+        help="""number of parts into which we divided the test set""",
+    )
+
+    parser.add_argument(
+        "--base-idx",
+        default=1,
+        type=int,
+        help="""index of the first job, typically 0 or 1""",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        dest="verbose",
+        default=1,
+        choices=[0, 1, 2, 3],
+        type=int,
+    )
+
+    args = parser.parse_args()
+    kwargs = namespace_to_dict(args)
+    config_logger(kwargs["verbose"])
+    del kwargs["verbose"]
+    del kwargs["cfg"]
+    merge_scores(**kwargs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hyperion/bin/pack-wav-rirs.py b/hyperion/bin/pack-wav-rirs.py
deleted file mode 100755
index 00177988..00000000
--- a/hyperion/bin/pack-wav-rirs.py
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/usr/bin/env python
-"""
- Copyright 2020 Jesus Villalba (Johns Hopkins University)
- Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-"""
-import sys
-import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
-import logging
-
-import math
-import numpy as np
-
-from hyperion.hyp_defs import config_logger
-from hyperion.io import SequentialAudioReader as AR
-from hyperion.io import DataWriterFactory as DWF
-
-
-def pack_wav_rirs(input_path, output_spec, **kwargs):
-
-    writer = DWF.create(output_spec, scp_sep=" ", compress=False)
-    t1 = time.time()
-    with AR(input_path, wav_scale=1) as reader:
-        for data in reader:
-            key, h, fs = data
-            if h.ndim == 2:
-                h = h[:, 0]
-            h_delay = np.argmax(np.abs(h))
-            h_max = h[h_delay]
-            h /= h_max
-            h[h < 1e-3] = 0
-            h = np.trim_zeros(h)
-            logging.info(
-                "Packing rir %s h_max=%f h_delay=%d h-length=%d"
-                % (key, h_max, h_delay, len(h))
-            )
-            writer.write([key], [h])
-
-    logging.info("Packed RIRS elapsed-time=%.f" % (time.time() - t1))
-
-
-if __name__ == "__main__":
-
-    parser = ArgumentParser(description="Packs RIRs in wave format to h5/ark files")
-
-    parser.add_argument("--cfg", action=ActionConfigFile)
-    parser.add_argument("--input", dest="input_path", required=True)
-    parser.add_argument("--output", dest="output_spec", required=True)
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        dest="verbose",
default=1, - choices=[0, 1, 2, 3], - type=int, - help="Verbose level", - ) - args = parser.parse_args() - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - pack_wav_rirs(**namespace_to_dict(args)) diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py new file mode 100755 index 00000000..bf88d674 --- /dev/null +++ b/hyperion/bin/pack_wav_rirs.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +""" + Copyright 2020 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import math +import os +import sys +import time + +import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR + + +def pack_wav_rirs(input_path, output_spec, **kwargs): + writer = DWF.create(output_spec, compress=False) + t1 = time.time() + with AR(input_path, wav_scale=1) as reader: + for data in reader: + key, h, fs = data + if h.ndim == 2: + h = h[:, 0] + h_delay = np.argmax(np.abs(h)) + h_max = h[h_delay] + h /= h_max + h[h < 1e-3] = 0 + h = np.trim_zeros(h) + logging.info( + "Packing rir %s h_max=%f h_delay=%d h-length=%d", + key, + h_max, + h_delay, + len(h), + ) + writer.write([key], [h]) + + logging.info("Packed RIRS elapsed-time=%.f", time.time() - t1) + + +def main(): + parser = ArgumentParser(description="Packs RIRs in wave format to h5/ark files") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + help="Verbose level", + ) + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + pack_wav_rirs(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/plot-vector-tsne.py b/hyperion/bin/plot-vector-tsne.py deleted file mode 100755 index 030d7e39..00000000 --- a/hyperion/bin/plot-vector-tsne.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import sys -import os -import argparse -import time -import logging - -import numpy as np -import matplotlib - -matplotlib.use("Agg") -import matplotlib.pyplot as plt -from mpl_toolkits.mplot3d import Axes3D as plt3d - -from sklearn.manifold import TSNE - -from hyperion.hyp_defs import config_logger -from hyperion.io import DataWriterFactory as DWF -from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList, PCA - -colors = ["b", "g", "r", "c", "m", "y", "k"] -markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] - - -def plot_vector_tsne( - iv_file, - v_list, - preproc_file, - output_path, - save_embed, - output_dim, - perplexity, - exag, - lr, - num_iter, - init_method, - rng_seed, - verbose, - pca_dim, - max_classes, - **kwargs -): - - if preproc_file is not None: - preproc = TransformList.load(preproc_file) - else: - preproc = None - - vr_args = VCR.filter_args(**kwargs) - vcr = VCR(iv_file, v_list, preproc, **vr_args) - - x, class_ids = vcr.read() - - t1 = time.time() - - if pca_dim > 0: - pca = PCA(pca_dim=pca_dim) - pca.fit(x) - x = 
pca.predict(x) - - if not os.path.exists(output_path): - os.makedirs(ouput_path) - - tsne_obj = lambda n: TSNE( - n_components=n, - perplexity=perplexity, - early_exaggeration=exag, - learning_rate=lr, - n_iter=num_iter, - init=init_method, - random_state=rng_seed, - verbose=verbose, - ) - - if max_classes > 0: - index = class_ids < max_classes - x = x[index] - class_ids = class_ids[index] - - if output_dim > 3: - tsne = tsne_obj(output_dim) - y = tsne.fit_transform(x) - - if save_embed: - h5_file = "%s/embed_%dd.h5" % (output_path, ouput_dim) - hw = DWF.create(h5_file) - hw.write(vcr.u2c.key, y) - - tsne = tsne_obj(2) - y = tsne.fit_transform(x) - if save_embed: - h5_file = "%s/embed_2d.h5" % output_path - hw = DWF.create(h5_file) - hw.write(vcr.u2c.key, y) - - fig_file = "%s/tsne_2d.pdf" % (output_path) - # plt.scatter(y[:,0], y[:,1], c=class_ids, marker='x') - - color_marker = [(c, m) for m in markers for c in colors] - for c in np.unique(class_ids): - idx = class_ids == c - plt.scatter( - y[idx, 0], - y[idx, 1], - c=color_marker[c][0], - marker=color_marker[c][1], - label=vcr.class_names[c], - ) - - plt.legend() - plt.grid(True) - plt.show() - plt.savefig(fig_file) - plt.clf() - - # if max_classes > 0: - # fig_file = '%s/tsne_2d_n%d.pdf' % (output_path, max_classes) - # index = class_ids < max_classes - # plt.scatter(y[index,0], y[index,1], c=class_ids[index], marker='x') - # plt.grid(True) - # plt.show() - # plt.savefig(fig_file) - # plt.clf() - - tsne = tsne_obj(3) - y = tsne.fit_transform(x) - if save_embed: - h5_file = "%s/embed_3d.h5" % output_path - hw = DWF.create(h5_file) - hw.write(vcr.u2c.key, y) - - fig_file = "%s/tsne_3d.pdf" % (output_path) - fig = plt.figure() - ax = fig.add_subplot(111, projection="3d") - # ax.scatter(y[:,0], y[:,1], y[:,2], c=class_ids, marker='x') - for c in np.unique(class_ids): - idx = class_ids == c - ax.scatter( - y[idx, 0], - y[idx, 1], - y[idx, 2], - c=color_marker[c][0], - marker=color_marker[c][1], - label=vcr.class_names[c], - ) - - plt.grid(True) - plt.show() - plt.savefig(fig_file) - plt.clf() - - # if max_classes > 0: - # fig_file = '%s/tsne_3d_n%d.pdf' % (output_path, max_classes) - # index = class_ids < max_classes - # ax = fig.add_subplot(111, projection='3d') - # ax.scatter(y[index,0], y[index,1], y[index,2], c=class_ids[index], marker='x') - # plt.grid(True) - # plt.show() - # plt.savefig(fig_file) - # plt.clf() - - logging.info("Elapsed time: %.2f s." 
% (time.time() - t1)) - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars="@", - description="Plots TSNE embeddings", - ) - - parser.add_argument("--iv-file", dest="iv_file", required=True) - parser.add_argument("--v-list", dest="v_list", required=True) - parser.add_argument("--preproc-file", dest="preproc_file", default=None) - - VCR.add_argparse_args(parser) - - parser.add_argument("--output-path", dest="output_path", required=True) - parser.add_argument( - "--save-embed", dest="save_embed", default=False, action="store_true" - ) - - parser.add_argument("--output-dim", dest="output_dim", type=int, default=3) - parser.add_argument("--perplexity", dest="perplexity", type=float, default=30) - parser.add_argument("--exag", dest="exag", type=float, default=12) - parser.add_argument("--lr", dest="lr", type=float, default=200) - parser.add_argument("--num-iter", dest="num_iter", type=int, default=1000) - parser.add_argument( - "--init-method", dest="init_method", default="pca", choices=["random", "pca"] - ) - parser.add_argument("--rng-seed", dest="rng_seed", type=int, default=1024) - parser.add_argument("--pca-dim", dest="pca_dim", type=int, default=50) - parser.add_argument("--max-classes", dest="max_classes", type=int, default=10) - parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) - - args = parser.parse_args() - config_logger(args.verbose) - logging.debug(args) - - plot_vector_tsne(**vars(args)) diff --git a/hyperion/bin/plot_embedding_tsne.py b/hyperion/bin/plot_embedding_tsne.py new file mode 100755 index 00000000..60d7ac5c --- /dev/null +++ b/hyperion/bin/plot_embedding_tsne.py @@ -0,0 +1,345 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +import sys +import time +from pathlib import Path + +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.transforms import PCA, LNorm, SklTSNE +from hyperion.utils import SegmentSet + +matplotlib.use("Agg") +colors = ["b", "g", "r", "c", "m", "y", "k"] +markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] + +color_marker = [(c, m) for m in markers for c in colors] + + +def plot_embedding_tsne( + train_v_file, + train_list, + pca_var_r, + prob_plot, + lnorm, + title, + max_classes, + unlabeled, + plot_class_names, + output_dir, + **kwargs, +): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + train_reader = DRF.create(train_v_file) + x_trn = train_reader.read(train_segs["id"], squeeze=True) + del train_reader + logging.info("loaded %d samples", x_trn.shape[0]) + if lnorm: + x_trn = LNorm().predict(x_trn) + + if pca_var_r < 1: + pca = PCA(pca_var_r=pca_var_r) + pca.fit(x_trn) + x_pca = pca.predict(x_trn) + logging.info("pca-dim=%d", x_pca.shape[1]) + else: + x_pca = x_trn + + tsne_args = SklTSNE.filter_args(**kwargs["tsne"]) + tsne = SklTSNE(**tsne_args) + x_tsne = tsne.fit(x_pca) + p = np.random.rand(x_tsne.shape[0]) <= prob_plot + x_tsne = x_tsne[p] + 
logging.info("plots %d samples", x_tsne.shape[0]) + + if unlabeled: + plot_class_names = ["none"] + + for col in plot_class_names: + fig_file = f"{output_dir}/train_tsne_{col}.png" + if not unlabeled: + classes = train_segs.loc[p, col] + classes, class_ids = np.unique(classes, return_inverse=True) + if max_classes is not None: + index = class_ids < max_classes + x_tsne_filtered = x_tsne[index] + class_ids = class_ids[index] + else: + x_tsne_filtered = x_tsne + + else: + class_ids = np.zeros((len(x_tsne.shape[0]),), dtype=np.int) + classes = [None] + + for c in range(np.max(class_ids) + 1): + idx = class_ids == c + if not unlabeled: + logging.info("plot class %s with %d samples", classes[c], np.sum(idx)) + plt.scatter( + x_tsne_filtered[idx, 0], + x_tsne_filtered[idx, 1], + c=color_marker[c][0], + marker=color_marker[c][1], + label=classes[c], + ) + + if not unlabeled: + plt.legend() + plt.grid(True) + plt.title(title) + plt.savefig(fig_file) + plt.clf() + + # fig_file = "%s/tsne_3d.pdf" % (output_dir) + # fig = plt.figure() + # ax = fig.add_subplot(111, projection="3d") + # # ax.scatter(y[:,0], y[:,1], y[:,2], c=class_ids, marker='x') + # for c in np.unique(class_ids): + # idx = class_ids == c + # ax.scatter( + # y[idx, 0], + # y[idx, 1], + # y[idx, 2], + # c=color_marker[c][0], + # marker=color_marker[c][1], + # label=vcr.class_names[c], + # ) + + # plt.grid(True) + # plt.show() + # plt.savefig(fig_file) + # plt.clf() + + +def main(): + parser = ArgumentParser(description="Projects embeddings using TSNE") + + parser.add_argument("--train-v-file", required=True) + parser.add_argument("--train-list", required=True) + + parser.add_argument("--pca-var-r", default=0.95, type=float) + parser.add_argument("--prob-plot", default=0.1, type=float) + parser.add_argument("--lnorm", default=False, action=ActionYesNo) + parser.add_argument("--unlabeled", default=False, action=ActionYesNo) + parser.add_argument( + "--plot-class-names", + default=["class_id"], + nargs="+", + help="names of the class columns we plot", + ) + parser.add_argument("--title", default="") + SklTSNE.add_class_args(parser, prefix="tsne") + + parser.add_argument( + "--max-classes", default=None, type=int, help="max number of clases to plot" + ) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + plot_embedding_tsne(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() + +# #!/usr/bin/env python +# """ +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# """ + +# import sys +# import os +# from jsonargparse import ( +# ArgumentParser, +# ActionConfigFile, +# ActionParser, +# namespace_to_dict, +# ) +# import time +# import logging + +# import numpy as np +# import pandas as pd +# import matplotlib + +# import matplotlib.pyplot as plt +# from mpl_toolkits.mplot3d import Axes3D as plt3d + +# from sklearn.manifold import TSNE + +# from hyperion.hyp_defs import config_logger +# from hyperion.io import DataWriterFactory as DWF +# from hyperion.helpers import VectorClassReader as VCR +# from hyperion.np.transforms import TransformList, PCA + +# matplotlib.use("Agg") +# colors = ["b", "g", "r", "c", "m", "y", "k"] +# markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] + + +# def plot_embedding_tsne( +# v_file, +# v_list, +# 
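For reference, a hypothetical call into the new plotting tool from Python; the paths are illustrative and the empty `tsne` dict relies on the `SklTSNE` defaults after `SklTSNE.filter_args`:

```python
# Sketch: t-SNE plot of training embeddings, colored by class_id.
from hyperion.bin.plot_embedding_tsne import plot_embedding_tsne

plot_embedding_tsne(
    train_v_file="scp:exp/xvectors/train/xvector.scp",  # hypothetical path
    train_list="data/train/segments.csv",               # hypothetical path
    pca_var_r=0.95,   # keep 95% of the variance in the PCA pre-projection
    prob_plot=0.1,    # plot a random 10% of the points
    lnorm=True,
    title="train embeddings",
    max_classes=10,
    unlabeled=False,
    plot_class_names=["class_id"],
    output_dir="exp/tsne",
    tsne={},
)
```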
diff --git a/hyperion/bin/plot_embedding_tsne_per_class.py b/hyperion/bin/plot_embedding_tsne_per_class.py new file mode 100755 index 00000000..08e4ef70 --- /dev/null +++ b/hyperion/bin/plot_embedding_tsne_per_class.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +import sys +import time +from pathlib import Path + +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.clustering import AHC +from hyperion.np.transforms import PCA, LNorm, SklTSNE +from hyperion.utils import SegmentSet +from hyperion.utils.math_funcs import cosine_scoring + +matplotlib.use("Agg") +colors = ["b", "g", "r", "c", "m", "y", "k"] +markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] + +color_marker = [(c, m) for m in markers for c in colors] + + +def plot_embedding_tsne( + train_v_file, + train_list, + pca_var_r, + prob_plot, + lnorm, + title, + max_classes, + plot_class_name, + do_ahc, + cluster_tsne, + num_clusters, + ahc_thr, + output_dir, + **kwargs, +): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + train_reader = DRF.create(train_v_file) + x_trn = train_reader.read(train_segs["id"], squeeze=True) + del train_reader + logging.info("loaded %d samples", x_trn.shape[0]) + if lnorm: + x_trn = LNorm().predict(x_trn) + + if pca_var_r < 1: + pca = PCA(pca_var_r=pca_var_r) + pca.fit(x_trn) + x_pca = pca.predict(x_trn) + logging.info("pca-dim=%d", x_pca.shape[1]) + else: + x_pca = x_trn + + class_ids = train_segs[plot_class_name] + classes, class_idx = np.unique(class_ids, return_inverse=True) + if max_classes is not None: + index = class_idx < max_classes + x_pca = x_pca[index] + class_idx = class_idx[index] + + tsne_args = SklTSNE.filter_args(**kwargs["tsne"]) + tsne = SklTSNE(**tsne_args) + if do_ahc: + ahc = AHC() + global_subclass_idx = np.zeros_like(class_idx) + + for c in range(np.max(class_idx) + 1): + fig_file = f"{output_dir}/train_tsne_{plot_class_name}_{classes[c]}.png" + idx = class_idx == c + logging.info("plot class %s with %d samples", classes[c], np.sum(idx)) + x_c = x_pca[idx] + x_tsne = tsne.fit(x_c) + if do_ahc: + if cluster_tsne: + # in the low dim space, we cannot use cosine scoring + x2 = np.sum(x_tsne**2, axis=1)[:, None] + d2 = x2 - 2 * np.dot(x_tsne, x_tsne.T) + x2.T + d2 = np.clip(d2, a_min=0, a_max=None) + scores = -np.sqrt(d2) + else: + scores = cosine_scoring(x_c, x_c) + ahc.fit(scores) + if num_clusters is None: + subclass_idx_c = ahc.get_flat_clusters(ahc_thr) + else: + subclass_idx_c = ahc.get_flat_clusters(num_clusters, "num_clusters") + global_subclass_idx[idx] = subclass_idx_c + + p = np.random.rand(x_tsne.shape[0]) <= prob_plot + x_tsne = x_tsne[p] + logging.info("plots %d samples", x_tsne.shape[0]) + if do_ahc: + subclass_idx_c = subclass_idx_c[p] + for sc in range(min(np.max(subclass_idx_c) + 1, len(color_marker))): + idx_sc = subclass_idx_c == sc + plt.scatter( + x_tsne[idx_sc, 0], + x_tsne[idx_sc, 1], + c=color_marker[sc][0], + marker=color_marker[sc][1], + ) + else: + plt.scatter( + x_tsne[:, 0], + x_tsne[:, 1], + c=color_marker[0][0], + marker=color_marker[0][1], + ) + + # plt.legend() + plt.grid(True) + plt.title(f"{title} {classes[c]}") + plt.savefig(fig_file) + plt.clf() + + if do_ahc: + # subclass_ids = [f"{a}-{b}" for a, b in zip(class_ids, global_subclass_idx)] + # _, subclass_idx = np.unique(subclass_ids, return_inverse=True) + # train_segs["subclass_id"] = subclass_ids + train_segs["subclass_idx"] = global_subclass_idx + train_segs.save(output_dir / "segments.csv") + + +def main(): + parser = ArgumentParser( + description=( + "Projects embeddings using TSNE, " + "plots a TSNE per class to discover subclusters inside the classes" + ) + ) + + parser.add_argument("--train-v-file", required=True) + parser.add_argument("--train-list", required=True) + + parser.add_argument("--pca-var-r", default=0.95, type=float) + parser.add_argument("--prob-plot", default=0.1, type=float) + parser.add_argument("--lnorm", default=False, action=ActionYesNo) + parser.add_argument( + "--plot-class-name", + default="class_id", + help="name of the class column we plot", + ) + parser.add_argument("--title", default="") + SklTSNE.add_class_args(parser, prefix="tsne") + + parser.add_argument( + "--max-classes", default=None, type=int, help="max number of classes to plot" + ) + parser.add_argument( + "--do-ahc", default=False, action=ActionYesNo, help="Do AHC on each class" + ) + parser.add_argument( + "--cluster-tsne", + default=False, + action=ActionYesNo, + help="if true, clustering is done after TSNE, otherwise after PCA", + ) + + parser.add_argument( + "--num-clusters", + default=None, + type=int, + help="if not None, number of clusters for AHC, discards ahc-threshold", + ) + parser.add_argument("--ahc-thr", default=0.7, type=float, help="AHC threshold") + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + plot_embedding_tsne(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main()
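When `--cluster-tsne` is set, the script above scores pairs by negative Euclidean distance in the 2-D t-SNE space, since cosine scoring is only meaningful on the original embeddings. A standalone sketch of that pairwise computation:

```python
# Pairwise negative Euclidean distance via ||a-b||^2 = ||a||^2 - 2*a.b + ||b||^2.
import numpy as np

x = np.random.randn(5, 2)              # 5 points in t-SNE space
x2 = np.sum(x**2, axis=1)[:, None]     # squared norms as a column vector
d2 = x2 - 2 * np.dot(x, x.T) + x2.T    # squared distances for all pairs at once
d2 = np.clip(d2, a_min=0, a_max=None)  # guard against negative round-off
scores = -np.sqrt(d2)                  # higher score = closer pair, as AHC expects
```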
diff --git a/hyperion/bin/prepare_data.py b/hyperion/bin/prepare_data.py new file mode 100755 index 00000000..dd1bde27 --- /dev/null +++ b/hyperion/bin/prepare_data.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path + +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.data_prep import DataPrep +from hyperion.hyp_defs import config_logger + + +def make_parser(data_prep_class): + parser = ArgumentParser() + data_prep_class.add_class_args(parser) + return parser + + +def main(): + parser = ArgumentParser( + description="""Prepares a dataset into relational database tables""" + ) + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in DataPrep.registry.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + config_logger(1) + data_prep_class = DataPrep.registry[args.subcommand] + args = namespace_to_dict(args)[args.subcommand] + data_prep = data_prep_class(**args) + data_prep.prepare() + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/preprocess-audio-files.py b/hyperion/bin/preprocess-audio-files.py deleted file mode 100755 index
67b1cf61..00000000 --- a/hyperion/bin/preprocess-audio-files.py +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2020 Jesus Villalba (Johns Hopkins University) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import sys -import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time -import logging - -import math -import numpy as np -from scipy import signal, ndimage - -from hyperion.hyp_defs import config_logger -from hyperion.utils import Utt2Info -from hyperion.io import SequentialAudioReader as AR -from hyperion.io import AudioWriter as Writer -from hyperion.io import VADReaderFactory as VRF - - -def process_vad(vad, length, fs, dilation, erosion): - vad = signal.resample(vad, length) > 0.5 - if dilation > 0: - iters = int(dilation * fs) - vad = ndimage.binary_dilation(vad, iterations=iters) - - if erosion > 0: - iters = int(erosion * fs) - vad = ndimage.binary_erosion(vad, iterations=iters, border_value=True) - - return vad - - -def process_audio_files( - input_path, - output_path, - output_script, - write_time_durs_spec, - vad_spec, - vad_path_prefix, - vad_fs=100, - vad_dilation=0, - vad_erosion=0, - remove_dc_offset=False, - **kwargs -): - - input_args = AR.filter_args(**kwargs) - output_args = Writer.filter_args(**kwargs) - logging.info("input_args={}".format(input_args)) - logging.info("output_args={}".format(output_args)) - - if write_time_durs_spec is not None: - keys = [] - info = [] - - with AR(input_path, **input_args) as reader: - with Writer(output_path, output_script, **output_args) as writer: - - if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) - - t1 = time.time() - for data in reader: - key, x, fs = data - logging.info("Processing audio %s" % (key)) - t2 = time.time() - - tot_samples = x.shape[0] - if vad_spec is not None: - num_vad_frames = int(round(tot_samples * vad_fs / fs)) - vad = v_reader.read(key, num_frames=num_vad_frames)[0].astype( - "bool", copy=False - ) - logging.info("vad=%d/%d" % (np.sum(vad == 1), len(vad))) - vad = process_vad(vad, tot_samples, fs, vad_dilation, vad_erosion) - logging.info("vad=%d/%d" % (np.sum(vad == 1), len(vad))) - x = x[vad] - - logging.info( - "utt %s detected %f/%f secs (%.2f %%) speech " - % ( - key[0], - x.shape[0] / fs, - tot_samples / fs, - x.shape[0] / tot_samples * 100, - ) - ) - - if x.shape[0] > 0: - if remove_dc_offset: - x -= np.mean(x) - - writer.write([key], [x], [fs]) - if write_time_durs_spec is not None: - keys.append(key) - info.append(x.shape[0] / fs) - - xmax = np.max(x) - xmin = np.min(x) - else: - xmax = 0 - xmin = 0 - - t3 = time.time() - dt2 = (t2 - t1) * 1000 - dt3 = (t3 - t1) * 1000 - time_dur = len(x) / fs - rtf = (time_dur * 1000) / dt3 - logging.info( - ( - "Packed audio %s length=%0.3f secs " - "elapsed-time=%.2f ms. " - "read-time=%.2f ms. write-time=%.2f ms. 
" - "real-time-factor=%.2f" - "x-range=[%f-%f]" - ) - % (key, time_dur, dt3, dt2, dt3 - dt2, rtf, xmin, xmax) - ) - t1 = time.time() - - if write_time_durs_spec is not None: - logging.info("writing time durations to %s" % (write_time_durs_spec)) - u2td = Utt2Info.create(keys, info) - u2td.save(write_time_durs_spec) - - -if __name__ == "__main__": - - parser = ArgumentParser( - description="Process pipes in wav.scp file, optionally applies vad and save all audios in the same format" - ) - - parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_path", required=True) - parser.add_argument("--output-path", required=True) - parser.add_argument("--output-script", required=True) - parser.add_argument("--write-time-durs", dest="write_time_durs_spec", default=None) - parser.add_argument("--vad", dest="vad_spec", default=None) - parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") - ) - - parser.add_argument( - "--vad-fs", default=100, type=float, help=("vad sampling frequency") - ) - - parser.add_argument( - "--vad-dilation", - default=0, - type=float, - help=("applies dilation operation to vad, in secs"), - ) - - parser.add_argument( - "--vad-erosion", - default=0, - type=float, - help=("applies erosion operation to vad (after dilation), in secs"), - ) - - AR.add_class_args(parser) - Writer.add_class_args(parser) - parser.add_argument( - "--remove-dc-offset", - default=False, - action="store_true", - help="removes dc offset from file", - ) - parser.add_argument( - "-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int, - help="Verbose level", - ) - args = parser.parse_args() - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - process_audio_files(**namespace_to_dict(args)) diff --git a/hyperion/bin/preprocess_audio_files.py b/hyperion/bin/preprocess_audio_files.py new file mode 100755 index 00000000..5e98a477 --- /dev/null +++ b/hyperion/bin/preprocess_audio_files.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python +""" + Copyright 2020 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import math +import os +import sys +import time + +import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from scipy import ndimage, signal + +from hyperion.hyp_defs import config_logger +from hyperion.io import AudioWriter as Writer +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.utils import Utt2Info + + +def resample_vad(vad, length): + step = (len(vad) - 1) / length + assert step < 1 + idx = step * np.arange(length, dtype=float) + idx = np.round(idx).astype(int) + return vad[idx] + + +def process_vad(vad, length, fs, dilation, erosion): + # vad = signal.resample(vad, length) > 0.5 + vad = resample_vad(vad, length) + if dilation > 0: + iters = int(dilation * fs) + vad = ndimage.binary_dilation(vad, iterations=iters) + + if erosion > 0: + iters = int(erosion * fs) + vad = ndimage.binary_erosion(vad, iterations=iters, border_value=True) + + return vad + + +def process_audio_files( + recordings_file, + output_path, + output_recordings_file, + write_time_durs_spec, + vad_spec, + vad_path_prefix, + vad_fs=100, + vad_dilation=0, + vad_erosion=0, + remove_dc_offset=False, + **kwargs, +): + input_args = AR.filter_args(**kwargs) + output_args = Writer.filter_args(**kwargs) + 
logging.info(f"input_args={input_args}") + logging.info(f"output_args={output_args}") + + if write_time_durs_spec is not None: + keys = [] + info = [] + + with AR(recordings_file, **input_args) as reader, Writer( + output_path, output_recordings_file, **output_args + ) as writer: + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + t1 = time.time() + for data in reader: + key, x, fs = data + logging.info("Processing audio %s", key) + t2 = time.time() + + tot_samples = x.shape[0] + if vad_spec is not None: + num_vad_frames = int(round(tot_samples * vad_fs / fs)) + vad = v_reader.read(key, num_frames=num_vad_frames)[0].astype( + "bool", copy=False + ) + logging.info("vad=%d/%d", np.sum(vad == 1), len(vad)) + vad = process_vad(vad, tot_samples, fs, vad_dilation, vad_erosion) + logging.info("vad=%d/%d", np.sum(vad == 1), len(vad)) + x = x[vad] + + logging.info( + "utt %s detected %f/%f secs (%.2f %%) speech ", + key[0], + x.shape[0] / fs, + tot_samples / fs, + x.shape[0] / tot_samples * 100, + ) + + if x.shape[0] > 0: + if remove_dc_offset: + x -= np.mean(x) + + writer.write([key], [x], [fs]) + if write_time_durs_spec is not None: + keys.append(key) + info.append(x.shape[0] / fs) + + xmax = np.max(x) + xmin = np.min(x) + else: + xmax = 0 + xmin = 0 + + t3 = time.time() + dt2 = (t2 - t1) * 1000 + dt3 = (t3 - t1) * 1000 + time_dur = len(x) / fs + rtf = (time_dur * 1000) / dt3 + logging.info( + ( + "Packed audio %s length=%0.3f secs " + "elapsed-time=%.2f ms. " + "read-time=%.2f ms. write-time=%.2f ms. " + "real-time-factor=%.2f " + "x-range=[%f - %f]" + ), + key, + time_dur, + dt3, + dt2, + dt3 - dt2, + rtf, + xmin, + xmax, + ) + t1 = time.time() + + if write_time_durs_spec is not None: + logging.info("writing time durations to %s", write_time_durs_spec) + u2td = Utt2Info.create(keys, info) + u2td.save(write_time_durs_spec) + + +def main(): + parser = ArgumentParser( + description="Process pipes in wav.scp file, optionally applies vad and save all audios in the same format" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--recordings-file", required=True) + parser.add_argument("--output-path", required=True) + parser.add_argument("--output-recordings-file", required=True) + parser.add_argument("--write-time-durs", dest="write_time_durs_spec", default=None) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + parser.add_argument( + "--vad-fs", default=100, type=float, help=("vad sampling frequency") + ) + + parser.add_argument( + "--vad-dilation", + default=0, + type=float, + help=("applies dilation operation to vad, in secs"), + ) + + parser.add_argument( + "--vad-erosion", + default=0, + type=float, + help=("applies erosion operation to vad (after dilation), in secs"), + ) + + AR.add_class_args(parser) + Writer.add_class_args(parser) + parser.add_argument( + "--remove-dc-offset", + default=False, + action="store_true", + help="removes dc offset from file", + ) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + help="Verbose level", + ) + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + process_audio_files(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/split_dataset_into_trials_and_cohort.py 
diff --git a/hyperion/bin/split_dataset_into_trials_and_cohort.py b/hyperion/bin/split_dataset_into_trials_and_cohort.py new file mode 100755 index 00000000..a5935910 --- /dev/null +++ b/hyperion/bin/split_dataset_into_trials_and_cohort.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path + +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.utils import HypDataset + + +def main(): + parser = ArgumentParser( + description=( + """Splits the speakers in a dataset into test speakers to create ASV trials and + cohort speakers for S-Norm""" + ) + ) + + parser.add_argument("--data-dir", required=True, help="Path to dataset") + parser.add_argument( + "--num-1k-tar-trials", type=int, default=30, help="thousands of target trials" + ) + parser.add_argument( + "--num-trial-speakers", + type=int, + default=1000, + help="number of speakers to create trials", + ) + parser.add_argument( + "--intra-gender", + default=True, + action=ActionYesNo, + help="Whether we create intra gender trials or not", + ) + parser.add_argument("--seed", type=int, default=1123, help="random seed") + parser.add_argument( + "--trials-dir", default=None, help="Path to output trials dataset" + ) + parser.add_argument( + "--cohort-dir", default=None, help="Path to output cohort dataset" + ) + + args = parser.parse_args() + config_logger(1) + data_dir = args.data_dir + cohort_dir = args.cohort_dir + cohort_dir = f"{data_dir}_cohort" if cohort_dir is None else cohort_dir + trials_dir = args.trials_dir + trials_dir = f"{data_dir}_trials" if trials_dir is None else trials_dir + + del args.data_dir + del args.cohort_dir + del args.trials_dir + args = namespace_to_dict(args) + + dataset = HypDataset.load(data_dir) + trials_dataset, cohort_dataset = dataset.split_into_trials_and_cohort(**args) + trials_dataset.save(trials_dir) + cohort_dataset.save(cohort_dir) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/torch-eval-xvec-logits-from-wav.py b/hyperion/bin/torch-eval-xvec-logits-from-wav.py deleted file mode 100755 index 58cc9005..00000000 --- a/hyperion/bin/torch-eval-xvec-logits-from-wav.py +++ /dev/null @@ -1,311 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2019 Jesus Villalba (Johns Hopkins University) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import sys -import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time -import logging - -import numpy as np -import pandas as pd - -import torch - -from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu -from hyperion.utils import Utt2Info -from hyperion.io import DataWriterFactory as DWF -from hyperion.io import SequentialAudioReader as AR -from hyperion.io import VADReaderFactory as VRF -from hyperion.augment import SpeechAugment - -from hyperion.torch.utils import open_device -from hyperion.torch.narchs import AudioFeatsMVN as AF -from hyperion.torch import TorchModelLoader as TML - - -def init_device(use_gpu): - set_float_cpu("float32") - num_gpus = 1 if use_gpu else 0 - logging.info("initializing devices num_gpus={}".format(num_gpus)) - device = open_device(num_gpus=num_gpus) - return device - - -def init_feats(device, **kwargs): - feat_args = AF.filter_args(**kwargs["feats"]) - logging.info("feat args={}".format(feat_args)) -
logging.info("initializing feature extractor") - feat_extractor = AF(trans=False, **feat_args) - logging.info("feat-extractor={}".format(feat_extractor)) - feat_extractor.eval() - feat_extractor.to(device) - return feat_extractor - - -def load_model(model_path, device): - logging.info("loading model {}".format(model_path)) - model = TML.load(model_path) - logging.info("xvector-model={}".format(model)) - model.to(device) - model.eval() - return model - - -def augment(key0, x0, augmenter, aug_df, aug_id): - if augmenter is None: - x = x0 - key = key0 - else: - x, aug_info = augmenter(x0) - key = "%s-aug-%02d" % (key0, aug_id) - aug_df_row = { - "key_aug": key, - "key_orig": key0, - "noise_type": aug_info["noise"]["noise_type"], - "snr": aug_info["noise"]["snr"], - "rir_type": aug_info["reverb"]["rir_type"], - "srr": aug_info["reverb"]["srr"], - "sdr": aug_info["sdr"], - } - - aug_df.append(pd.DataFrame(aug_df_row, index=[0])) - - return key, x - - -def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) - if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) - x = x[:, first_frame : first_frame + utt_length] - logging.info( - "extract-random-utt %s of length=%d first-frame=%d" - % (key, x.shape[1], first_frame) - ) - return x - - -def eval_xvec( - input_spec, - output_spec, - vad_spec, - write_num_frames_spec, - scp_sep, - vad_path_prefix, - model_path, - chunk_length, - random_utt_length, - min_utt_length, - max_utt_length, - aug_cfg, - num_augs, - aug_info_path, - use_gpu, - **kwargs -): - - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) - device = init_device(use_gpu) - feat_extractor = init_feats(device, **kwargs) - model = load_model(model_path, device) - - if write_num_frames_spec is not None: - keys = [] - info = [] - - if aug_cfg is not None: - augmenter = SpeechAugment.create(aug_cfg, rng=rng) - aug_df = [] - else: - augmenter = None - aug_df = None - num_augs = 1 - - ar_args = AR.filter_args(**kwargs) - logging.info("opening output stream: %s" % (output_spec)) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: - - logging.info( - "opening input stream: {} with args={}".format(input_spec, ar_args) - ) - with AR(input_spec, **ar_args) as reader: - - if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) - - while not reader.eof(): - t1 = time.time() - key, x0, fs = reader.read(1) - if len(key) == 0: - break - - x0 = x0[0] - key0 = key[0] - t2 = time.time() - - logging.info("processing utt %s" % (key0)) - for aug_id in range(num_augs): - t3 = time.time() - key, x = augment(key0, x0, augmenter, aug_df, aug_id) - t4 = time.time() - with torch.no_grad(): - x = torch.tensor( - x[None, :], dtype=torch.get_default_dtype() - ).to(device) - - x = feat_extractor(x) - t5 = time.time() - tot_frames = x.shape[1] - if vad_spec is not None: - vad = v_reader.read(key0, num_frames=tot_frames)[0] - vad = torch.tensor(vad, dtype=torch.bool).to(device) - x = x[:, vad] - - logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key, - x.shape[1], - tot_frames, - x.shape[1] / tot_frames * 100, - ) - ) - - if random_utt_length: - x = select_random_chunk( - key, x, min_utt_length, max_utt_length, rng - ) - - t6 = time.time() - if x.shape[1] == 0: - y = np.zeros((model.embed_dim,), dtype=float_cpu()) - else: - x = x.transpose(1, 
2).contiguous() - y = model(x).cpu().numpy()[0] - - t7 = time.time() - writer.write([key], [y]) - if write_num_frames_spec is not None: - keys.append(key) - info.append(str(x.shape[1])) - - t8 = time.time() - read_time = t2 - t1 - tot_time = read_time + t8 - t3 - logging.info( - ( - "utt %s total-time=%.3f read-time=%.3f " - "aug-time=%.3f feat-time=%.3f " - "vad-time=%.3f embed-time=%.3f write-time=%.3f " - "rt-factor=%.2f" - ) - % ( - key, - tot_time, - read_time, - t4 - t3, - t5 - t4, - t6 - t5, - t7 - t6, - t8 - t7, - x0.shape[0] / fs[0] / tot_time, - ) - ) - - if write_num_frames_spec is not None: - logging.info("writing num-frames to %s" % (write_num_frames_spec)) - u2nf = Utt2Info.create(keys, info) - u2nf.save(write_num_frames_spec) - - if aug_info_path is not None: - aug_df = pd.concat(aug_df, ignore_index=True) - aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") - - -if __name__ == "__main__": - - parser = ArgumentParser( - description=( - "Evaluates x-vectors logits from waveform computing " - "acoustic features on the fly" - ) - ) - - parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_spec", required=True) - parser.add_argument("--vad", dest="vad_spec", default=None) - parser.add_argument( - "--write-num-frames", dest="write_num_frames_spec", default=None - ) - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) - parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") - ) - - AR.add_class_args(parser) - - parser.add_argument("--aug-cfg", default=None) - parser.add_argument("--aug-info-path", default=None) - parser.add_argument( - "--num-augs", default=1, type=int, help="number of augmentations per utterance" - ) - - AF.add_class_args(parser, prefix="feats") - - parser.add_argument("--model-path", required=True) - parser.add_argument( - "--chunk-length", - type=int, - default=0, - help=( - "number of frames used in each forward pass " - "of the x-vector encoder," - "if 0 the full utterance is used" - ), - ) - - parser.add_argument( - "--random-utt-length", - default=False, - action="store_true", - help="calculates x-vector from a random chunk", - ) - parser.add_argument( - "--min-utt-length", - type=int, - default=500, - help=("minimum utterance length when using random utt length"), - ) - parser.add_argument( - "--max-utt-length", - type=int, - default=12000, - help=("maximum utterance length when using random utt length"), - ) - - parser.add_argument("--output", dest="output_spec", required=True) - parser.add_argument( - "--use-gpu", default=False, action="store_true", help="run in gpu" - ) - parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) - - args = parser.parse_args() - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - eval_xvec(**namespace_to_dict(args)) diff --git a/hyperion/bin/torch-extract-xvectors-from-wav.py b/hyperion/bin/torch-extract-xvectors-from-wav.py deleted file mode 100755 index 0aea084e..00000000 --- a/hyperion/bin/torch-extract-xvectors-from-wav.py +++ /dev/null @@ -1,328 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2019 Jesus Villalba (Johns Hopkins University) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import sys -import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time -import logging - -import numpy as np -import pandas as pd - -import torch - -from 
hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu -from hyperion.utils import Utt2Info -from hyperion.io import DataWriterFactory as DWF -from hyperion.io import SequentialAudioReader as AR -from hyperion.io import VADReaderFactory as VRF -from hyperion.augment import SpeechAugment - -from hyperion.torch.utils import open_device -from hyperion.torch.narchs import AudioFeatsMVN as AF -from hyperion.torch import TorchModelLoader as TML - - -def init_device(use_gpu): - set_float_cpu("float32") - num_gpus = 1 if use_gpu else 0 - logging.info("initializing devices num_gpus={}".format(num_gpus)) - device = open_device(num_gpus=num_gpus) - return device - - -def init_feats(device, **kwargs): - feat_args = AF.filter_args(**kwargs["feats"]) - logging.info("feat args={}".format(feat_args)) - logging.info("initializing feature extractor") - feat_extractor = AF(trans=False, **feat_args) - logging.info("feat-extractor={}".format(feat_extractor)) - feat_extractor.eval() - feat_extractor.to(device) - return feat_extractor - - -def load_model(model_path, device): - logging.info("loading model {}".format(model_path)) - model = TML.load(model_path) - logging.info("xvector-model={}".format(model)) - model.to(device) - model.eval() - return model - - -def augment(key0, x0, augmenter, aug_df, aug_id): - if augmenter is None: - x = x0 - key = key0 - else: - x, aug_info = augmenter(x0) - key = "%s-aug-%02d" % (key0, aug_id) - aug_df_row = { - "key_aug": key, - "key_orig": key0, - "noise_type": aug_info["noise"]["noise_type"], - "snr": aug_info["noise"]["snr"], - "rir_type": aug_info["reverb"]["rir_type"], - "srr": aug_info["reverb"]["srr"], - "sdr": aug_info["sdr"], - } - - aug_df.append(pd.DataFrame(aug_df_row, index=[0])) - - return key, x - - -def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) - if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) - x = x[:, first_frame : first_frame + utt_length] - logging.info( - "extract-random-utt %s of length=%d first-frame=%d" - % (key, x.shape[1], first_frame) - ) - return x - - -def extract_xvectors( - input_spec, - output_spec, - vad_spec, - write_num_frames_spec, - scp_sep, - vad_path_prefix, - model_path, - chunk_length, - embed_layer, - random_utt_length, - min_utt_length, - max_utt_length, - aug_cfg, - num_augs, - aug_info_path, - use_gpu, - **kwargs -): - - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) - device = init_device(use_gpu) - feat_extractor = init_feats(device, **kwargs) - model = load_model(model_path, device) - - if write_num_frames_spec is not None: - keys = [] - info = [] - - if aug_cfg is not None: - augmenter = SpeechAugment.create(aug_cfg, rng=rng) - aug_df = [] - else: - augmenter = None - aug_df = None - num_augs = 1 - - ar_args = AR.filter_args(**kwargs) - logging.info("opening output stream: %s" % (output_spec)) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: - - logging.info( - "opening input stream: {} with args={}".format(input_spec, ar_args) - ) - with AR(input_spec, **ar_args) as reader: - - if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) - - while not reader.eof(): - t1 = time.time() - key, x0, fs = reader.read(1) - if len(key) == 0: - break - - x0 = x0[0] - key0 = key[0] - t2 = time.time() - - logging.info("processing utt %s" % (key0)) - for 
aug_id in range(num_augs): - t3 = time.time() - key, x = augment(key0, x0, augmenter, aug_df, aug_id) - t4 = time.time() - with torch.no_grad(): - x = torch.tensor( - x[None, :], dtype=torch.get_default_dtype() - ).to(device) - - x = feat_extractor(x) - t5 = time.time() - tot_frames = x.shape[1] - if vad_spec is not None: - vad = v_reader.read(key0, num_frames=tot_frames)[0] - vad = torch.tensor(vad, dtype=torch.bool).to(device) - x = x[:, vad] - - logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key, - x.shape[1], - tot_frames, - x.shape[1] / tot_frames * 100, - ) - ) - - if random_utt_length: - x = select_random_chunk( - key, x, min_utt_length, max_utt_length, rng - ) - - t6 = time.time() - if x.shape[1] == 0: - y = np.zeros((model.embed_dim,), dtype=float_cpu()) - else: - x = x.transpose(1, 2).contiguous() - y = ( - model.extract_embed( - x, - chunk_length=chunk_length, - embed_layer=embed_layer, - ) - .cpu() - .numpy()[0] - ) - - t7 = time.time() - writer.write([key], [y]) - if write_num_frames_spec is not None: - keys.append(key) - info.append(str(x.shape[1])) - - t8 = time.time() - read_time = t2 - t1 - tot_time = read_time + t8 - t3 - logging.info( - ( - "utt %s total-time=%.3f read-time=%.3f " - "aug-time=%.3f feat-time=%.3f " - "vad-time=%.3f embed-time=%.3f write-time=%.3f " - "rt-factor=%.2f" - ) - % ( - key, - tot_time, - read_time, - t4 - t3, - t5 - t4, - t6 - t5, - t7 - t6, - t8 - t7, - x0.shape[0] / fs[0] / tot_time, - ) - ) - - if write_num_frames_spec is not None: - logging.info("writing num-frames to %s" % (write_num_frames_spec)) - u2nf = Utt2Info.create(keys, info) - u2nf.save(write_num_frames_spec) - - if aug_info_path is not None: - aug_df = pd.concat(aug_df, ignore_index=True) - aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") - - -if __name__ == "__main__": - - parser = ArgumentParser( - description=( - "Extracts x-vectors from waveform computing " "acoustic features on the fly" - ) - ) - - parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_spec", required=True) - parser.add_argument("--vad", dest="vad_spec", default=None) - parser.add_argument( - "--write-num-frames", dest="write_num_frames_spec", default=None - ) - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) - parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") - ) - - AR.add_class_args(parser) - - parser.add_argument("--aug-cfg", default=None) - parser.add_argument("--aug-info-path", default=None) - parser.add_argument( - "--num-augs", default=1, type=int, help="number of augmentations per utterance" - ) - - AF.add_class_args(parser, prefix="feats") - - parser.add_argument("--model-path", required=True) - parser.add_argument( - "--chunk-length", - type=int, - default=0, - help=( - "number of frames used in each forward pass " - "of the x-vector encoder," - "if 0 the full utterance is used" - ), - ) - parser.add_argument( - "--embed-layer", - type=int, - default=None, - help=( - "classifier layer to get the embedding from, " - "if None, it uses layer set in training phase" - ), - ) - - parser.add_argument( - "--random-utt-length", - default=False, - action="store_true", - help="calculates x-vector from a random chunk", - ) - parser.add_argument( - "--min-utt-length", - type=int, - default=500, - help=("minimum utterance length when using random utt length"), - ) - parser.add_argument( - "--max-utt-length", - type=int, - default=12000, - help=("maximum 
utterance length when using random utt length"), - ) - - parser.add_argument("--output", dest="output_spec", required=True) - parser.add_argument( - "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" - ) - parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) - - args = parser.parse_args() - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - extract_xvectors(**namespace_to_dict(args)) diff --git a/hyperion/bin/torch-finetune-xvec-dfr-from-wav.py b/hyperion/bin/torch-finetune-xvec-dfr-from-wav.py deleted file mode 100755 index 437c76f0..00000000 --- a/hyperion/bin/torch-finetune-xvec-dfr-from-wav.py +++ /dev/null @@ -1,335 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import sys -import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time -import logging -import multiprocessing - -import numpy as np - -import torch -import torch.nn as nn - -from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device, ddp -from hyperion.torch.models import XVector as XVec -from hyperion.torch.trainers import XVectorTrainerDeepFeatRegFromWav as Trainer -from hyperion.torch.data import AudioDataset as AD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.narchs import AudioFeatsMVN as AF -from hyperion.torch import TorchModelLoader as TML - - -def init_data( - audio_path, - train_list, - val_list, - train_aug_cfg, - val_aug_cfg, - num_workers, - num_gpus, - rank, - **kwargs -): - - ad_args = AD.filter_args(**kwargs) - sampler_args = Sampler.filter_args(**kwargs) - if rank == 0: - logging.info("audio dataset args={}".format(ad_args)) - logging.info("sampler args={}".format(sampler_args)) - logging.info("init datasets") - - train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) - val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) - - if rank == 0: - logging.info("init samplers") - train_sampler = Sampler(train_data, **sampler_args) - val_sampler = Sampler(val_data, **sampler_args) - - num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ( - {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} - ) - - train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler=train_sampler, **largs - ) - - test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler=val_sampler, **largs - ) - - return train_loader, test_loader - - -def init_feats(rank, **kwargs): - feat_args = AF.filter_args(**kwargs["feats"]) - if rank == 0: - logging.info("feat args={}".format(feat_args)) - logging.info("initializing feature extractor") - feat_extractor = AF(trans=True, **feat_args) - if rank == 0: - logging.info("feat-extractor={}".format(feat_extractor)) - return feat_extractor - - -def init_xvector( - num_classes, in_model_path, prior_model_path, rank, train_mode, **kwargs -): - - xvec_args = XVec.filter_finetune_args(**kwargs) - if rank == 0: - logging.info("xvector network ft args={}".format(xvec_args)) - xvec_args["num_classes"] = num_classes - model = TML.load(in_model_path) - model.rebuild_output_layer(**xvec_args) - if prior_model_path: - prior_model = TML.load(prior_model_path) - else: - 
prior_model = model.copy() - prior_model.freeze() - prior_model.eval() - if train_mode == "ft-embed-affine": - model.freeze_preembed_layers() - if rank == 0: - logging.info("x-vector-model={}".format(model)) - return model, prior_model - - -def train_xvec(gpu_id, args): - - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - kwargs = namespace_to_dict(args) - torch.manual_seed(args.seed) - set_float_cpu("float32") - - train_mode = kwargs["train_mode"] - - ddp_args = ddp.filter_ddp_args(**kwargs) - device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs["rank"] = rank - - train_loader, test_loader = init_data(**kwargs) - feat_extractor = init_feats(**kwargs) - model, prior_model = init_xvector(train_loader.dataset.num_classes, **kwargs) - - trn_args = Trainer.filter_args(**kwargs) - if rank == 0: - logging.info("trainer args={}".format(trn_args)) - metrics = {"acc": CategoricalAccuracy()} - trainer = Trainer( - model, - feat_extractor, - prior_model, - device=device, - metrics=metrics, - ddp=world_size > 1, - train_mode=train_mode, - **trn_args - ) - if args.resume: - trainer.load_last_checkpoint() - trainer.fit(train_loader, test_loader) - - ddp.ddp_cleanup() - - -# def train_xvec(audio_path, train_list, val_list, -# train_aug_cfg, val_aug_cfg, -# in_model_path, prior_model_path, -# reg_layers_enc, reg_layers_classif, -# reg_weight_enc, reg_weight_classif, reg_loss, -# num_gpus, resume, num_workers, -# train_mode, **kwargs): - -# set_float_cpu('float32') -# logging.info('initializing devices num_gpus={}'.format(num_gpus)) -# device = open_device(num_gpus=num_gpus) - -# ad_args = AD.filter_args(**kwargs) -# sampler_args = Sampler.filter_args(**kwargs) -# feat_args = AFF.filter_args(prefix='feats', **kwargs) -# mvn_args = MVN.filter_args(prefix='mvn', **kwargs) -# xvec_args = XVec.filter_finetune_args(**kwargs) -# opt_args = OF.filter_args(prefix='opt', **kwargs) -# lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) -# trn_args = Trainer.filter_args(**kwargs) -# logging.info('audio dataset args={}'.format(ad_args)) -# logging.info('sampler args={}'.format(sampler_args)) -# logging.info('feat args={}'.format(feat_args)) -# logging.info('mvn args={}'.format(mvn_args)) -# logging.info('xvector finetune args={}'.format(xvec_args)) -# logging.info('optimizer args={}'.format(opt_args)) -# logging.info('lr scheduler args={}'.format(lrsch_args)) -# logging.info('trainer args={}'.format(trn_args)) - -# logging.info('initializing feature extractor args={}'.format(feat_args)) -# feat_extractor = AFF.create(**feat_args) -# mvn = None -# if mvn_args['norm_mean'] or mvn_args['norm_var']: -# logging.info('initializing short-time mvn') -# mvn = MVN(**mvn_args) - -# feat_extractor = FeatExtractor(feat_extractor, mvn) - -# logging.info('init datasets') -# train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) -# val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) - -# logging.info('init samplers') -# train_sampler = Sampler(train_data, **sampler_args) -# val_sampler = Sampler(val_data, **sampler_args) - -# largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} - -# train_loader = torch.utils.data.DataLoader( -# train_data, batch_sampler = train_sampler, **largs) - -# test_loader = torch.utils.data.DataLoader( -# val_data, batch_sampler = val_sampler, **largs) - -# xvec_args['num_classes'] = train_data.num_classes -# model = TML.load(in_model_path) -# model.rebuild_output_layer(**xvec_args) -# if 
prior_model_path: -# prior_model = TML.load(prior_model_path) -# else: -# prior_model = model.copy() -# prior_model.freeze() -# prior_model.eval() -# if train_mode == 'ft-embed-affine': -# model.freeze_preembed_layers() -# logging.info(str(model)) - -# optimizer = OF.create(model.parameters(), **opt_args) -# lr_sch = LRSF.create(optimizer, **lrsch_args) -# metrics = { 'acc': CategoricalAccuracy() } - -# if reg_loss == 'l1': -# reg_loss = nn.L1Loss() -# else: -# reg_loss = nn.MSELoss() - -# trainer = Trainer(model, feat_extractor, prior_model, optimizer, -# reg_layers_enc=reg_layers_enc, reg_layers_classif=reg_layers_classif, -# reg_weight_enc=reg_weight_enc, reg_weight_classif=reg_weight_classif, -# reg_loss=reg_loss, -# device=device, metrics=metrics, lr_scheduler=lr_sch, -# data_parallel=(num_gpus>1), train_mode=train_mode, -# **trn_args) -# if resume: -# trainer.load_last_checkpoint() -# trainer.fit(train_loader, test_loader) - - -if __name__ == "__main__": - - parser = ArgumentParser( - description=( - "Fine-tune x-vector model with deep feature loss " - "regularization from audio files" - ) - ) - - parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--audio-path", required=True) - parser.add_argument("--train-list", dest="train_list", required=True) - parser.add_argument("--val-list", dest="val_list", required=True) - - AD.add_argparse_args(parser) - Sampler.add_argparse_args(parser) - - parser.add_argument( - "--num-workers", type=int, default=5, help="num_workers of data loader" - ) - - parser.add_argument("--train-aug-cfg", default=None) - parser.add_argument("--val-aug-cfg", default=None) - - AF.add_class_args(parser, prefix="feats") - - # AFF.add_argparse_args(parser, prefix='feats') - # MVN.add_argparse_args(parser, prefix='mvn') - - # parser.add_argument('--reg-layers-enc', type=int, default=None, nargs='+', - # help='list of layers from the encoder nnet to use for regularization ') - # parser.add_argument('--reg-layers-classif', type=int, default=None, nargs='+', - # help='list of layers from the classif nnet to use for regularization ') - # parser.add_argument('--reg-weight-enc', type=float, default=0.1, - # help='weight for regularization from enc layers') - # parser.add_argument('--reg-weight-classif', type=float, default=0.1, - # help='weight for regularization from classif layers') - # parser.add_argument('--reg-loss', default='l1', - # choices=['l1', 'mse'], - # help=('type of regularization loss')) - - parser.add_argument("--in-model-path", required=True) - parser.add_argument("--prior-model-path") - - XVec.add_finetune_args(parser) - Trainer.add_class_args(parser) - ddp.add_ddp_args(parser) - - # parser.add_argument('--num-gpus', type=int, default=1, - # help='number of gpus, if 0 it uses cpu') - parser.add_argument( - "--seed", type=int, default=1123581321, help="random seed (default: 1)" - ) - parser.add_argument( - "--resume", - action="store_true", - default=False, - help="resume training from checkpoint", - ) - parser.add_argument( - "--train-mode", - default="ft-embed-affine", - choices=["ft-full", "ft-embed-affine"], - help=( - "ft-full: adapt full x-vector network" - "ft-embed-affine: adapt affine transform before embedding" - ), - ) - parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) - parser.add_argument("--local_rank", default=0, type=int) - - args = parser.parse_args() - gpu_id = args.local_rank - del args.local_rank - - if gpu_id == 0: - try: - config_file = Path(args.exp_path) 
/ "config.yaml" - parser.save(args, str(config_file), format="yaml", overwrite=True) - except: - pass - - # torch docs recommend using forkserver - multiprocessing.set_start_method("forkserver") - train_xvec(gpu_id, args) - - # config_logger(args.verbose) - # del args.verbose - # logging.debug(args) - - # torch.manual_seed(args.seed) - # del args.seed - - # train_xvec(**vars(args)) diff --git a/hyperion/bin/torch-finetune-xvec-dfr.py b/hyperion/bin/torch-finetune-xvec-dfr.py deleted file mode 100755 index a26c14fb..00000000 --- a/hyperion/bin/torch-finetune-xvec-dfr.py +++ /dev/null @@ -1,286 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time -import logging -import multiprocessing - -import numpy as np - -import torch -import torch.nn as nn - -from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device, ddp -from hyperion.torch.models import XVector as XVec -from hyperion.torch.trainers import XVectorTrainerDeepFeatReg as Trainer -from hyperion.torch.data import FeatSeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch import TorchModelLoader as TML - - -def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): - sd_args = SD.filter_args(**kwargs) - sampler_args = Sampler.filter_args(**kwargs) - if rank == 0: - logging.info("audio dataset args={}".format(sd_args)) - logging.info("sampler args={}".format(sampler_args)) - logging.info("init datasets") - - train_data = SD(data_rspec, train_list, **sd_args) - val_data = SD(data_rspec, val_list, is_val=True, **sd_args) - if rank == 0: - logging.info("init samplers") - train_sampler = Sampler(train_data, **sampler_args) - val_sampler = Sampler(val_data, **sampler_args) - - num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ( - {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} - ) - - train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler=train_sampler, **largs - ) - - test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler=val_sampler, **largs - ) - - return train_loader, test_loader - - -def init_xvector( - num_classes, in_model_path, prior_model_path, rank, train_mode, **kwargs -): - - xvec_args = XVec.filter_finetune_args(**kwargs) - if rank == 0: - logging.info("xvector network ft args={}".format(xvec_args)) - xvec_args["num_classes"] = num_classes - model = TML.load(in_model_path) - model.rebuild_output_layer(**xvec_args) - if prior_model_path: - prior_model = TML.load(prior_model_path) - else: - prior_model = model.copy() - prior_model.freeze() - prior_model.eval() - if train_mode == "ft-embed-affine": - model.freeze_preembed_layers() - if rank == 0: - logging.info("x-vector-model={}".format(model)) - return model, prior_model - - -def train_xvec(gpu_id, args): - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - kwargs = namespace_to_dict(args) - torch.manual_seed(args.seed) - set_float_cpu("float32") - - train_mode = kwargs["train_mode"] - - ddp_args = ddp.filter_ddp_args(**kwargs) - device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs["rank"] = rank - 
train_loader, test_loader = init_data(**kwargs) - model, prior_model = init_xvector(train_loader.dataset.num_classes, **kwargs) - - trn_args = Trainer.filter_args(**kwargs) - if rank == 0: - logging.info("trainer args={}".format(trn_args)) - metrics = {"acc": CategoricalAccuracy()} - - trainer = Trainer( - model, - prior_model, - device=device, - metrics=metrics, - ddp=world_size > 1, - train_mode=train_mode, - **trn_args - ) - if args.resume: - trainer.load_last_checkpoint() - trainer.fit(train_loader, test_loader) - - ddp.ddp_cleanup() - - -# def train_xvec(data_rspec, train_list, val_list, in_model_path, -# prior_model_path, -# reg_layers_enc, reg_layers_classif, -# reg_weight_enc, reg_weight_classif, reg_loss, -# num_gpus, resume, num_workers, -# train_mode, **kwargs): - -# set_float_cpu('float32') -# logging.info('initializing devices num_gpus={}'.format(num_gpus)) -# device = open_device(num_gpus=num_gpus) - -# sd_args = SD.filter_args(**kwargs) -# sampler_args = Sampler.filter_args(**kwargs) -# xvec_args = XVec.filter_finetune_args(**kwargs) -# opt_args = OF.filter_args(prefix='opt', **kwargs) -# lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) -# trn_args = Trainer.filter_args(**kwargs) -# logging.info('seq dataset args={}'.format(sd_args)) -# logging.info('sampler args={}'.format(sampler_args)) -# logging.info('xvector finetune args={}'.format(xvec_args)) -# logging.info('optimizer args={}'.format(opt_args)) -# logging.info('lr scheduler args={}'.format(lrsch_args)) -# logging.info('trainer args={}'.format(trn_args)) - -# logging.info('init datasets') -# train_data = SD(data_rspec, train_list, **sd_args) -# val_data = SD(data_rspec, val_list, is_val=True, **sd_args) - -# logging.info('init samplers') -# train_sampler = Sampler(train_data, **sampler_args) -# val_sampler = Sampler(val_data, **sampler_args) - -# largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} - -# train_loader = torch.utils.data.DataLoader( -# train_data, batch_sampler = train_sampler, **largs) - -# test_loader = torch.utils.data.DataLoader( -# val_data, batch_sampler = val_sampler, **largs) - -# xvec_args['num_classes'] = train_data.num_classes -# model = TML.load(in_model_path) -# model.rebuild_output_layer(**xvec_args) -# if prior_model_path: -# prior_model = TML.load(prior_model_path) -# else: -# prior_model = model.copy() -# prior_model.freeze() -# prior_model.eval() -# if train_mode == 'ft-embed-affine': -# model.freeze_preembed_layers() -# logging.info(str(model)) - -# optimizer = OF.create(model.parameters(), **opt_args) -# lr_sch = LRSF.create(optimizer, **lrsch_args) -# metrics = { 'acc': CategoricalAccuracy() } - -# if reg_loss == 'l1': -# reg_loss = nn.L1Loss() -# else: -# reg_loss = nn.MSELoss() - -# trainer = Trainer(model, prior_model, optimizer, -# reg_layers_enc=reg_layers_enc, -# reg_layers_classif=reg_layers_classif, -# reg_weight_enc=reg_weight_enc, -# reg_weight_classif=reg_weight_classif, -# reg_loss=reg_loss, -# device=device, metrics=metrics, lr_scheduler=lr_sch, -# data_parallel=(num_gpus>1), train_mode=train_mode, -# **trn_args) -# if resume: -# trainer.load_last_checkpoint() -# trainer.fit(train_loader, test_loader) - - -if __name__ == "__main__": - - parser = ArgumentParser( - description="Fine-tune x-vector model with deep feature loss regularization" - ) - - parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--data-rspec", dest="data_rspec", required=True) - parser.add_argument("--train-list", dest="train_list", 
required=True) - parser.add_argument("--val-list", dest="val_list", required=True) - - SD.add_argparse_args(parser) - Sampler.add_argparse_args(parser) - parser.add_argument( - "--num-workers", type=int, default=5, help="num_workers of data loader" - ) - - # parser.add_argument('--reg-layers-enc', type=int, default=None, nargs='+', - # help='list of layers from the encoder nnet to use for regularization ') - # parser.add_argument('--reg-layers-classif', type=int, default=None, nargs='+', - # help='list of layers from the classif nnet to use for regularization ') - # parser.add_argument('--reg-weight-enc', type=float, default=0.1, - # help='weight for regularization from enc layers') - # parser.add_argument('--reg-weight-classif', type=float, default=0.1, - # help='weight for regularization from classif layers') - # parser.add_argument('--reg-loss', default='l1', - # choices=['l1', 'mse'], - # help=('type of regularization loss')) - - parser.add_argument("--in-model-path", required=True) - parser.add_argument("--prior-model-path") - XVec.add_finetune_args(parser) - Trainer.add_class_args(parser) - ddp.add_ddp_args(parser) - - # parser.add_argument('--num-gpus', type=int, default=1, - # help='number of gpus, if 0 it uses cpu') - parser.add_argument( - "--seed", type=int, default=1123581321, help="random seed (default: 1)" - ) - parser.add_argument( - "--resume", - action="store_true", - default=False, - help="resume training from checkpoint", - ) - parser.add_argument( - "--train-mode", - default="ft-embed-affine", - choices=["ft-full", "ft-embed-affine"], - help=( - "ft-full: adapt full x-vector network" - "ft-embed-affine: adapt affine transform before embedding" - ), - ) - - parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) - parser.add_argument("--local_rank", default=0, type=int) - - args = parser.parse_args() - gpu_id = args.local_rank - del args.local_rank - - if gpu_id == 0: - try: - config_file = Path(args.exp_path) / "config.yaml" - parser.save(args, str(config_file), format="yaml", overwrite=True) - except: - pass - - # torch docs recommend using forkserver - multiprocessing.set_start_method("forkserver") - train_xvec(gpu_id, args) - - # config_logger(args.verbose) - # del args.verbose - # logging.debug(args) - - # torch.manual_seed(args.seed) - # del args.seed - - # train_xvec(**vars(args)) diff --git a/hyperion/bin/torch-finetune-xvec-from-wav.py b/hyperion/bin/torch-finetune-xvec-from-wav.py deleted file mode 100755 index e33d9b8e..00000000 --- a/hyperion/bin/torch-finetune-xvec-from-wav.py +++ /dev/null @@ -1,287 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time -import logging -import multiprocessing - -import numpy as np - -import torch -import torch.nn as nn - -from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device, ddp -from hyperion.torch.models import XVector as XVec -from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer -from hyperion.torch.data import AudioDataset as AD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.narchs import AudioFeatsMVN as AF -from hyperion.torch import 
TorchModelLoader as TML - - -def init_data( - audio_path, - train_list, - val_list, - train_aug_cfg, - val_aug_cfg, - num_workers, - num_gpus, - rank, - **kwargs -): - - ad_args = AD.filter_args(**kwargs) - sampler_args = Sampler.filter_args(**kwargs) - if rank == 0: - logging.info("audio dataset args={}".format(ad_args)) - logging.info("sampler args={}".format(sampler_args)) - logging.info("init datasets") - - train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) - val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) - - if rank == 0: - logging.info("init samplers") - train_sampler = Sampler(train_data, **sampler_args) - val_sampler = Sampler(val_data, **sampler_args) - - num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ( - {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} - ) - - train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler=train_sampler, **largs - ) - - test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler=val_sampler, **largs - ) - - return train_loader, test_loader - - -def init_feats(rank, **kwargs): - feat_args = AF.filter_args(**kwargs["feats"]) - if rank == 0: - logging.info("feat args={}".format(feat_args)) - logging.info("initializing feature extractor") - feat_extractor = AF(trans=True, **feat_args) - if rank == 0: - logging.info("feat-extractor={}".format(feat_extractor)) - return feat_extractor - - -def init_xvector(num_classes, in_model_path, rank, train_mode, **kwargs): - xvec_args = XVec.filter_finetune_args(**kwargs) - if rank == 0: - logging.info("xvector network ft args={}".format(xvec_args)) - xvec_args["num_classes"] = num_classes - model = TML.load(in_model_path) - model.rebuild_output_layer(**xvec_args) - if train_mode == "ft-embed-affine": - model.freeze_preembed_layers() - if rank == 0: - logging.info("x-vector-model={}".format(model)) - return model - - -def train_xvec(gpu_id, args): - - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - kwargs = namespace_to_dict(args) - torch.manual_seed(args.seed) - set_float_cpu("float32") - - train_mode = kwargs["train_mode"] - - ddp_args = ddp.filter_ddp_args(**kwargs) - device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs["rank"] = rank - - train_loader, test_loader = init_data(**kwargs) - feat_extractor = init_feats(**kwargs) - model = init_xvector(train_loader.dataset.num_classes, **kwargs) - - trn_args = Trainer.filter_args(**kwargs) - if rank == 0: - logging.info("trainer args={}".format(trn_args)) - metrics = {"acc": CategoricalAccuracy()} - trainer = Trainer( - model, - feat_extractor, - device=device, - metrics=metrics, - ddp=world_size > 1, - train_mode=train_mode, - **trn_args - ) - if args.resume: - trainer.load_last_checkpoint() - trainer.fit(train_loader, test_loader) - - ddp.ddp_cleanup() - - -# (audio_path, train_list, val_list, -# train_aug_cfg, val_aug_cfg, -# in_model_path, num_gpus, resume, num_workers, -# train_mode, **kwargs): - -# set_float_cpu('float32') -# logging.info('initializing devices num_gpus={}'.format(num_gpus)) -# device = open_device(num_gpus=num_gpus) - -# ad_args = AD.filter_args(**kwargs) -# sampler_args = Sampler.filter_args(**kwargs) -# feat_args = AFF.filter_args(prefix='feats', **kwargs) -# mvn_args = MVN.filter_args(prefix='mvn', **kwargs) -# xvec_args = XVec.filter_finetune_args(**kwargs) -# opt_args = OF.filter_args(prefix='opt', **kwargs) -# lrsch_args = LRSF.filter_args(prefix='lrsch', 
**kwargs) -# trn_args = Trainer.filter_args(**kwargs) -# logging.info('audio dataset args={}'.format(ad_args)) -# logging.info('sampler args={}'.format(sampler_args)) -# logging.info('feat args={}'.format(feat_args)) -# logging.info('mvn args={}'.format(mvn_args)) -# logging.info('xvector finetune args={}'.format(xvec_args)) -# logging.info('optimizer args={}'.format(opt_args)) -# logging.info('lr scheduler args={}'.format(lrsch_args)) -# logging.info('trainer args={}'.format(trn_args)) - -# logging.info('initializing feature extractor args={}'.format(feat_args)) -# feat_extractor = AFF.create(**feat_args) -# mvn = None -# if mvn_args['norm_mean'] or mvn_args['norm_var']: -# logging.info('initializing short-time mvn') -# mvn = MVN(**mvn_args) - -# feat_extractor = FeatExtractor(feat_extractor, mvn) - -# logging.info('init datasets') -# train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) -# val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) - -# logging.info('init samplers') -# train_sampler = Sampler(train_data, **sampler_args) -# val_sampler = Sampler(val_data, **sampler_args) - -# largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} - -# train_loader = torch.utils.data.DataLoader( -# train_data, batch_sampler = train_sampler, **largs) - -# test_loader = torch.utils.data.DataLoader( -# val_data, batch_sampler = val_sampler, **largs) - -# xvec_args['num_classes'] = train_data.num_classes -# model = TML.load(in_model_path) -# model.rebuild_output_layer(**xvec_args) -# if train_mode == 'ft-embed-affine': -# model.freeze_preembed_layers() - -# logging.info('feat-extractor={}'.format(feat_extractor)) -# logging.info('x-vector-model={}'.format(model)) - -# optimizer = OF.create(model.parameters(), **opt_args) -# lr_sch = LRSF.create(optimizer, **lrsch_args) -# metrics = { 'acc': CategoricalAccuracy() } - -# trainer = Trainer(model, feat_extractor, optimizer, -# device=device, metrics=metrics, lr_scheduler=lr_sch, -# data_parallel=(num_gpus>1), train_mode=train_mode, -# **trn_args) -# if resume: -# trainer.load_last_checkpoint() -# trainer.fit(train_loader, test_loader) - - -if __name__ == "__main__": - - parser = ArgumentParser(description="Fine-tune x-vector model from audio files") - - parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--audio-path", required=True) - parser.add_argument("--train-list", required=True) - parser.add_argument("--val-list", required=True) - - AD.add_argparse_args(parser) - Sampler.add_argparse_args(parser) - - parser.add_argument("--train-aug-cfg", default=None) - parser.add_argument("--val-aug-cfg", default=None) - parser.add_argument( - "--num-workers", type=int, default=5, help="num_workers of data loader" - ) - - AF.add_class_args(parser, prefix="feats") - parser.add_argument("--in-model-path", required=True) - - XVec.add_finetune_args(parser) - Trainer.add_class_args(parser) - ddp.add_ddp_args(parser) - - # parser.add_argument('--num-gpus', type=int, default=1, - # help='number of gpus, if 0 it uses cpu') - parser.add_argument("--seed", type=int, default=1123581321, help="random seed") - parser.add_argument( - "--resume", - action="store_true", - default=False, - help="resume training from checkpoint", - ) - parser.add_argument( - "--train-mode", - default="ft-embed-affine", - choices=["ft-full", "ft-embed-affine"], - help=( - "ft-full: adapt full x-vector network" - "ft-embed-affine: adapt affine transform before embedding" - ), - ) - parser.add_argument( 
- "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) - parser.add_argument("--local_rank", default=0, type=int) - - args = parser.parse_args() - gpu_id = args.local_rank - del args.local_rank - - if gpu_id == 0: - try: - config_file = Path(args.exp_path) / "config.yaml" - parser.save(args, str(config_file), format="yaml", overwrite=True) - except: - pass - - # torch docs recommend using forkserver - multiprocessing.set_start_method("forkserver") - train_xvec(gpu_id, args) - - # args = parser.parse_args() - # config_logger(args.verbose) - # del args.verbose - # logging.debug(args) - - # torch.manual_seed(args.seed) - # del args.seed - - # train_xvec(**vars(args)) diff --git a/hyperion/bin/torch-finetune-xvec.py b/hyperion/bin/torch-finetune-xvec.py deleted file mode 100755 index ec6386c8..00000000 --- a/hyperion/bin/torch-finetune-xvec.py +++ /dev/null @@ -1,238 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time -import logging -import multiprocessing - -import numpy as np - -import torch - -from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device, ddp -from hyperion.torch.models import XVector as XVec -from hyperion.torch.trainers import XVectorTrainer as Trainer -from hyperion.torch.data import FeatSeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch import TorchModelLoader as TML - - -def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): - sd_args = SD.filter_args(**kwargs) - sampler_args = Sampler.filter_args(**kwargs) - if rank == 0: - logging.info("audio dataset args={}".format(sd_args)) - logging.info("sampler args={}".format(sampler_args)) - logging.info("init datasets") - - train_data = SD(data_rspec, train_list, **sd_args) - val_data = SD(data_rspec, val_list, is_val=True, **sd_args) - if rank == 0: - logging.info("init samplers") - train_sampler = Sampler(train_data, **sampler_args) - val_sampler = Sampler(val_data, **sampler_args) - - num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ( - {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} - ) - - train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler=train_sampler, **largs - ) - - test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler=val_sampler, **largs - ) - - return train_loader, test_loader - - -def init_xvector(num_classes, in_model_path, rank, train_mode, **kwargs): - xvec_args = XVec.filter_finetune_args(**kwargs) - if rank == 0: - logging.info("xvector network ft args={}".format(xvec_args)) - xvec_args["num_classes"] = num_classes - model = TML.load(in_model_path) - model.rebuild_output_layer(**xvec_args) - if train_mode == "ft-embed-affine": - model.freeze_preembed_layers() - if rank == 0: - logging.info("x-vector-model={}".format(model)) - return model - - -def train_xvec(gpu_id, args): - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - kwargs = namespace_to_dict(args) - torch.manual_seed(args.seed) - set_float_cpu("float32") - - train_mode = kwargs["train_mode"] - - ddp_args = ddp.filter_ddp_args(**kwargs) - 
device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs["rank"] = rank - train_loader, test_loader = init_data(**kwargs) - model = init_xvector(train_loader.dataset.num_classes, **kwargs) - - trn_args = Trainer.filter_args(**kwargs) - if rank == 0: - logging.info("trainer args={}".format(trn_args)) - metrics = {"acc": CategoricalAccuracy()} - trainer = Trainer( - model, - device=device, - metrics=metrics, - ddp=world_size > 1, - train_mode=train_mode, - **trn_args - ) - if args.resume: - trainer.load_last_checkpoint() - trainer.fit(train_loader, test_loader) - - ddp.ddp_cleanup() - - -# (data_rspec, train_list, val_list, in_model_path, -# num_gpus, resume, num_workers, train_mode, **kwargs): - -# set_float_cpu('float32') -# logging.info('initializing devices num_gpus={}'.format(num_gpus)) -# device = open_device(num_gpus=num_gpus) - -# sd_args = SD.filter_args(**kwargs) -# sampler_args = Sampler.filter_args(**kwargs) -# xvec_args = XVec.filter_finetune_args(**kwargs) -# opt_args = OF.filter_args(prefix='opt', **kwargs) -# lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) -# trn_args = Trainer.filter_args(**kwargs) -# logging.info('seq dataset args={}'.format(sd_args)) -# logging.info('sampler args={}'.format(sampler_args)) -# logging.info('xvector finetune args={}'.format(xvec_args)) -# logging.info('optimizer args={}'.format(opt_args)) -# logging.info('lr scheduler args={}'.format(lrsch_args)) -# logging.info('trainer args={}'.format(trn_args)) - -# logging.info('init datasets') -# train_data = SD(data_rspec, train_list, **sd_args) -# val_data = SD(data_rspec, val_list, is_val=True, **sd_args) - -# logging.info('init samplers') -# train_sampler = Sampler(train_data, **sampler_args) -# val_sampler = Sampler(val_data, **sampler_args) - -# largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} - -# train_loader = torch.utils.data.DataLoader( -# train_data, batch_sampler = train_sampler, **largs) - -# test_loader = torch.utils.data.DataLoader( -# val_data, batch_sampler = val_sampler, **largs) - -# xvec_args['num_classes'] = train_data.num_classes -# model = TML.load(in_model_path) -# model.rebuild_output_layer(**xvec_args) -# if train_mode == 'ft-embed-affine': -# model.freeze_preembed_layers() -# logging.info(str(model)) - -# optimizer = OF.create(model.parameters(), **opt_args) -# lr_sch = LRSF.create(optimizer, **lrsch_args) -# metrics = { 'acc': CategoricalAccuracy() } - -# trainer = Trainer(model, optimizer, -# device=device, metrics=metrics, lr_scheduler=lr_sch, -# data_parallel=(num_gpus>1), train_mode=train_mode, -# **trn_args) -# if resume: -# trainer.load_last_checkpoint() -# trainer.fit(train_loader, test_loader) - - -if __name__ == "__main__": - - parser = ArgumentParser(description="Fine-tune x-vector model") - - parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--data-rspec", required=True) - parser.add_argument("--train-list", required=True) - parser.add_argument("--val-list", required=True) - - SD.add_argparse_args(parser) - Sampler.add_argparse_args(parser) - parser.add_argument( - "--num-workers", type=int, default=5, help="num_workers of data loader" - ) - parser.add_argument("--in-model-path", required=True) - XVec.add_finetune_args(parser) - Trainer.add_class_args(parser) - ddp.add_ddp_args(parser) - - # parser.add_argument('--num-gpus', type=int, default=1, - # help='number of gpus, if 0 it uses cpu') - parser.add_argument( - "--seed", type=int, default=1123581321, help="random seed (default: 1)" - ) 
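    # NOTE (editor): --train-mode (declared below) controls how much of the
    # pretrained model is adapted; the pattern used by init_xvector above is:
    #
    #   model = TML.load(in_model_path)              # pretrained x-vector net
    #   model.rebuild_output_layer(num_classes=...)  # new head for new classes
    #   if train_mode == "ft-embed-affine":
    #       model.freeze_preembed_layers()           # adapt only the affine
    #                                                # transform before the
    #                                                # embedding
    #   # "ft-full" leaves every parameter trainable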
- parser.add_argument( - "--resume", - action="store_true", - default=False, - help="resume training from checkpoint", - ) - parser.add_argument( - "--train-mode", - default="ft-embed-affine", - choices=["ft-full", "ft-embed-affine"], - help=( - "ft-full: adapt full x-vector network" - "ft-embed-affine: adapt affine transform before embedding" - ), - ) - - parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) - parser.add_argument("--local_rank", default=0, type=int) - - args = parser.parse_args() - gpu_id = args.local_rank - del args.local_rank - - if gpu_id == 0: - try: - config_file = Path(args.exp_path) / "config.yaml" - parser.save(args, str(config_file), format="yaml", overwrite=True) - except: - pass - - # torch docs recommend using forkserver - multiprocessing.set_start_method("forkserver") - train_xvec(gpu_id, args) - - # args = parser.parse_args() - # config_logger(args.verbose) - # del args.verbose - # logging.debug(args) - - # torch.manual_seed(args.seed) - # del args.seed - - # train_xvec(**vars(args)) diff --git a/hyperion/bin/torch-generate-adv-attacks-xvector-classif.py b/hyperion/bin/torch-generate-adv-attacks-xvector-classif.py deleted file mode 100755 index 274bdf32..00000000 --- a/hyperion/bin/torch-generate-adv-attacks-xvector-classif.py +++ /dev/null @@ -1,422 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import sys -import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time -import logging -from pathlib import Path - -import numpy as np -import pandas as pd -import yaml - -import torch -import torch.nn as nn - -from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu -from hyperion.io import RandomAccessAudioReader as AR -from hyperion.io import AudioWriter as AW -from hyperion.utils import Utt2Info, TrialNdx -from hyperion.io import VADReaderFactory as VRF - -from hyperion.torch.utils import open_device -from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.narchs import AudioFeatsMVN as AF -from hyperion.torch.utils.misc import l2_norm, compute_stats_adv_attack - -from hyperion.torch.adv_attacks import RandomAttackFactory - - -def read_utt_list(list_file, class2int_file, part_idx, num_parts): - logging.info("reading utt list %s" % (list_file)) - utt_list = Utt2Info.load(list_file) - utt_list = utt_list.split(part_idx, num_parts) - logging.info("reading class2int-file %s" % (class2int_file)) - class_info = pd.read_csv(class2int_file, header=None, sep=" ") - class2idx = {str(k): i for i, k in enumerate(class_info[0])} - class_idx = np.array([class2idx[k] for k in utt_list.info], dtype=int) - keys = utt_list.key - class_names = utt_list.info - return keys, class_names, class_idx - - -class MyModel(nn.Module): - def __init__(self, feat_extractor, xvector_model): - super().__init__() - self.feat_extractor = feat_extractor - self.xvector_model = xvector_model - self.vad = None - - def forward(self, s): - f = self.feat_extractor(s) - if self.vad is not None: - n_vad_frames = len(self.vad) - n_feat_frames = f.shape[1] - if n_vad_frames > n_feat_frames: - self.vad = self.vad[:n_feat_frames] - elif n_vad_frames < n_feat_frames: - f = f[:, :n_vad_frames] - - f = f[:, self.vad] - - f = f.transpose(1, 2).contiguous() - score = self.xvector_model(f) - return score - - -def init_device(use_gpu): - 
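    # NOTE (editor): attacks run on a single device; open_device is expected
    # to return the first GPU when use_gpu is set and fall back to CPU
    # otherwise (num_gpus=0), so no DDP setup is needed in this script.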
set_float_cpu("float32") - num_gpus = 1 if use_gpu else 0 - logging.info("initializing devices num_gpus={}".format(num_gpus)) - device = open_device(num_gpus=num_gpus) - return device - - -def init_model(model_path, **kwargs): - feat_args = AF.filter_args(**kwargs["feats"]) - logging.info("feat args={}".format(feat_args)) - logging.info("initializing feature extractor") - feat_extractor = AF(trans=False, **feat_args) - logging.info("feat-extractor={}".format(feat_extractor)) - - # feat_args = AFF.filter_args(prefix='feats', **kwargs) - # logging.info('initializing feature extractor args={}'.format(feat_args)) - # feat_extractor = AFF.create(**feat_args) - - # mvn_args = MVN.filter_args(prefix='mvn', **kwargs) - # mvn = None - # if mvn_args['norm_mean'] or mvn_args['norm_var']: - # logging.info('initializing short-time mvn args={}'.format(mvn_args)) - # mvn = MVN(**mvn_args) - - logging.info("loading model {}".format(model_path)) - xvector_model = TML.load(model_path) - xvector_model.freeze() - logging.info("xvector-model={}".format(xvector_model)) - - model = MyModel(feat_extractor, xvector_model) - model.eval() - return model - - -def init_attack_factory(wav_scale=1, **kwargs): - attacks_args = RandomAttackFactory.filter_args(**kwargs["attacks"]) - extra_args = { - "eps_scale": wav_scale, - "range_min": -wav_scale, - "range_max": wav_scale, - "loss": nn.functional.cross_entropy, - "time_dim": 1, - } - attacks_args.update(extra_args) - - logging.info("attacks args={}".format(attacks_args)) - attack_factory = RandomAttackFactory(**attacks_args) - return attack_factory - - -def select_random_chunk(key, s, fs, min_utt_length, max_utt_length): - utt_length = torch.randint( - low=min_utt_length * fs, high=max_utt_length * fs + 1, size=(1,) - ).item() - if utt_length < len(s): - first_sample = torch.randint(low=0, high=len(s) - utt_length, size=(1,)).item() - s = s[first_sample : first_sample + utt_length] - logging.info( - "extract-random-utt %s of length=%d first-sample=%d" - % (key, len(s), first_sample) - ) - return s - - -def generate_attacks( - wav_file, - list_file, - vad_spec, - vad_path_prefix, - class2int_file, - model_path, - output_wav_dir, - attack_info_file, - attack_tag, - random_utt_length, - min_utt_length, - max_utt_length, - random_seed, - p_attack, - save_failed, - save_benign, - use_gpu, - part_idx, - num_parts, - **kwargs -): - - device = init_device(use_gpu) - model = init_model(model_path, **kwargs) - model.to(device) - - logging.info("opening audio read stream: %s" % (wav_file)) - audio_args = AR.filter_args(**kwargs) - audio_reader = AR(wav_file) - wav_scale = audio_reader.wav_scale - - logging.info("opening audio write stream: %s" % (output_wav_dir)) - audio_writer = AW(output_wav_dir, audio_format="flac") - - if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") - - keys, class_names, class_ids = read_utt_list( - list_file, class2int_file, part_idx, num_parts - ) - - attack_factory = init_attack_factory(**kwargs) - attacks_info = {} - - for i in range(len(keys)): - key = keys[i] - class_id = class_ids[i] - - t1 = time.time() - logging.info("reading utt %s" % (key)) - s, fs = audio_reader.read([key]) - s = s[0] - fs = fs[0] - - torch.manual_seed( - random_seed + int(s[0]) - ) # this is to make results reproducible - p = torch.rand(1).item() - if p > p_attack: - logging.info("skipping attack for utt %s" % (key)) - continue - - if random_utt_length: - s = 
select_random_chunk(key, s, fs, min_utt_length, max_utt_length) - - if save_benign: - s_benign = s - - s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) - target = torch.as_tensor([class_id], dtype=torch.long).to(device) - if vad_spec is not None: - vad = v_reader.read([key.seg_set[j]])[0] - tot_frames = len(vad) - speech_frames = np.sum(vad) - vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( - device - ) - model.vad = vad - logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key.seg_set[j], - speech_frames, - tot_frames, - speech_frames / tot_frames * 100, - ) - ) - - t2 = time.time() - with torch.no_grad(): - score_benign = model(s) - - _, pred = torch.max(score_benign, dim=1) - if pred[0] != class_id: - logging.info("utt %s failed benign classification, skipping..." % (key)) - continue - - t3 = time.time() - attack = attack_factory.sample_attack(model) - attack_info = attack.attack_info - s_adv = attack.generate(s, target).detach() - t4 = time.time() - with torch.no_grad(): - score_adv = model(s_adv) - t5 = time.time() - - _, pred = torch.max(score_adv, dim=1) - success = False - if pred[0] != class_id: - success = True - - if success or save_failed: - key_attack = "%s-%s" % (key, attack_tag) - logging.info("utt %s attack successful" % (key)) - - stats_ij = compute_stats_adv_attack(s, s_adv) - stats_ij = [float(stat.detach().cpu().numpy()[0]) for stat in stats_ij] - - s_adv = s_adv.cpu().numpy()[0] - wav_attack = audio_writer.write(key_attack, s_adv, fs)[0] - if save_benign: - key_benign = "%s-benign" % (key_attack) - wav_benign = audio_writer.write(key_benign, s_benign, fs)[0] - else: - key_benign = key - wav_benign = "" - - attack_info.update( - { - "attack_tag": attack_tag, - "wav_path": wav_attack, - "class_name": class_names[i], - "class_id": int(class_id), - "key_benign": key_benign, - "wav_benign": wav_benign, - "snr": stats_ij[0], - "px": stats_ij[1], - "pn": stats_ij[2], - "x_l2": stats_ij[3], - "x_linf": stats_ij[4], - "n_l0": stats_ij[5], - "n_l2": stats_ij[6], - "n_linf": stats_ij[7], - "num_samples": s.shape[-1], - "success": success, - } - ) - attacks_info[key_attack] = attack_info - - else: - logging.info("utt %s attack failed, skipping..." 
% (key)) - - t6 = time.time() - logging.info( - ( - "utt %s total-time=%.3f read-time=%.3f " - "eval-benign-time=%.3f attack-time=%.3f eval-attack-time=%3f " - "rt-factor=%.4f" - ) - % ( - key, - t6 - t1, - t2 - t1, - t3 - t2, - t4 - t3, - t5 - t4, - s.shape[1] / fs / (t6 - t1), - ) - ) - - logging.info("saving attack info to %s" % (attack_info_file)) - Path(attack_info_file).parent.mkdir(parents=True, exist_ok=True) - - with open(attack_info_file, "w") as f: - # only save if we have successful attacks - if attacks_info: - yaml.dump(attacks_info, f, sort_keys=True) - - -if __name__ == "__main__": - - parser = ArgumentParser( - description="Generate Attacks for speaker classification with x-vectors" - ) - - parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--wav-file", required=True) - parser.add_argument("--list-file", required=True) - parser.add_argument("--class2int-file", required=True) - parser.add_argument("--attack-tag", required=True) - - AR.add_class_args(parser) - AF.add_class_args(parser, prefix="feats") - - parser.add_argument("--vad", dest="vad_spec", default=None) - parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), - ) - - parser.add_argument("--model-path", required=True) - parser.add_argument( - "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" - ) - - RandomAttackFactory.add_class_args(parser, prefix="attacks") - - parser.add_argument("--part-idx", default=1, type=int, help=("part index")) - parser.add_argument( - "--num-parts", - default=1, - type=int, - help=( - "number of parts in which we divide the list " - "to run evaluation in parallel" - ), - ) - - parser.add_argument( - "--output-wav-dir", default=None, help="output path of adv signals" - ) - parser.add_argument( - "--attack-info-file", - default=None, - help="output path of to save information about the generated attacks", - ) - parser.add_argument( - "--random-seed", default=1234, type=int, help="random seed for pytorch" - ) - - parser.add_argument( - "--random-utt-length", - default=False, - action="store_true", - help="calculates x-vector from a random chunk", - ) - parser.add_argument( - "--min-utt-length", - type=int, - default=5, - help=("minimum utterance length (in secs) when using random utt length"), - ) - parser.add_argument( - "--max-utt-length", - type=int, - default=120, - help=("maximum utterance length (in secs) when using random utt length"), - ) - - parser.add_argument( - "--p-attack", - type=float, - default=1, - help=("probability of generating an attack for a given utterance"), - ) - parser.add_argument( - "--save-failed", - default=False, - action="store_true", - help=("save failed attacks also"), - ) - parser.add_argument( - "--save-benign", - default=False, - action="store_true", - help=("save a copy of the benign sample"), - ) - - parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) - - args = parser.parse_args() - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - generate_attacks(**namespace_to_dict(args)) diff --git a/hyperion/bin/torch-generate-adv-attacks-xvector-verif.py b/hyperion/bin/torch-generate-adv-attacks-xvector-verif.py deleted file mode 100755 index 58f73b00..00000000 --- a/hyperion/bin/torch-generate-adv-attacks-xvector-verif.py +++ /dev/null @@ -1,469 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) -""" -import sys -import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time -import logging -from pathlib import Path - -import numpy as np -import pandas as pd -import yaml - -import torch -import torch.nn as nn - -from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu -from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.io import RandomAccessAudioReader as AR -from hyperion.io import AudioWriter as AW -from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores -from hyperion.utils.list_utils import ismember -from hyperion.io import VADReaderFactory as VRF -from hyperion.classifiers import BinaryLogisticRegression as LR - -from hyperion.torch.utils import open_device -from hyperion.torch.layers import LinBinCalibrator as Calibrator -from hyperion.torch.narchs import AudioFeatsMVN as AF -from hyperion.torch.utils.misc import l2_norm, compute_stats_adv_attack -from hyperion.torch import TorchModelLoader as TML - -from hyperion.torch.adv_attacks import RandomAttackFactory - - -class MyModel(nn.Module): - def __init__( - self, feat_extractor, xvector_model, embed_layer=None, calibrator=None, sigma=0 - ): - super().__init__() - self.feat_extractor = feat_extractor - self.xvector_model = xvector_model - self.x_e = None - self.vad_t = None - self.embed_layer = embed_layer - self.calibrator = calibrator - self.sigma = sigma - - def forward(self, s_t): - # print('sigma0=', self.sigma) - if self.sigma > 0: - s_t = s_t + self.sigma * torch.randn_like(s_t) - # print('sigma1=', self.sigma) - f_t = self.feat_extractor(s_t) - if self.vad_t is not None: - n_vad_frames = len(self.vad_t) - n_feat_frames = f_t.shape[1] - if n_vad_frames > n_feat_frames: - self.vad_t = self.vad_t[:n_feat_frames] - elif n_vad_frames < n_feat_frames: - f_t = f_t[:, :n_vad_frames] - - f_t = f_t[:, self.vad_t] - - f_t = f_t.transpose(1, 2).contiguous() - x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer) - x_t = l2_norm(x_t) - x_e = l2_norm(self.x_e) - score = torch.sum(x_e * x_t, dim=-1) - if self.calibrator is not None: - score = self.calibrator(score) - - return score - - -def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - - r = DRF.create(v_file) - enroll = Utt2Info.load(enroll_file) - key = TrialKey.load(key_file) - if num_seg_parts > 1: - key = key.split(1, 1, seg_part_idx, num_seg_parts) - - x_e = r.read(enroll.key, squeeze=True) - f, idx = ismember(key.model_set, enroll.info) - - assert np.all(f) - x_e = x_e[idx] - - return key, x_e - - -def init_model(model_path, embed_layer, cal_file, threshold, **kwargs): - feat_args = AF.filter_args(**kwargs["feats"]) - logging.info("feat args={}".format(feat_args)) - logging.info("initializing feature extractor") - feat_extractor = AF(trans=False, **feat_args) - logging.info("feat-extractor={}".format(feat_extractor)) - - logging.info("loading model {}".format(model_path)) - xvector_model = TML.load(model_path) - xvector_model.freeze() - logging.info("xvector-model={}".format(xvector_model)) - - # feat_args = AFF.filter_args(prefix='feats', **kwargs) - # logging.info('initializing feature extractor args={}'.format(feat_args)) - # feat_extractor = AFF.create(**feat_args) - - # mvn_args = MVN.filter_args(prefix='mvn', **kwargs) - # mvn = None - # if mvn_args['norm_mean'] or mvn_args['norm_var']: - # logging.info('initializing short-time mvn args={}'.format(mvn_args)) - # mvn 
= MVN(**mvn_args) - - # logging.info('loading model {}'.format(model_path)) - # xvector_model = TML.load(model_path) - # xvector_model.freeze() - - calibrator = None - if cal_file is not None: - logging.info("loading calibration params {}".format(cal_file)) - lr = LR.load(cal_file) - # subting the threshold here will put the decision threshold in 0 - # some attacks use thr=0 to decide if the attack is succesful - calibrator = Calibrator(lr.A[0, 0], lr.b[0] - threshold) - - model = MyModel(feat_extractor, xvector_model, embed_layer, calibrator) - model.eval() - return model - - -def init_attack_factory(wav_scale=1, **kwargs): - attacks_args = RandomAttackFactory.filter_args(**kwargs["attacks"]) - extra_args = { - "eps_scale": wav_scale, - "range_min": -wav_scale, - "range_max": wav_scale, - "loss": nn.functional.binary_cross_entropy_with_logits, - "time_dim": 1, - } - attacks_args.update(extra_args) - - logging.info("attacks args={}".format(attacks_args)) - attack_factory = RandomAttackFactory(**attacks_args) - return attack_factory - - -def init_device(use_gpu): - set_float_cpu("float32") - num_gpus = 1 if use_gpu else 0 - logging.info("initializing devices num_gpus={}".format(num_gpus)) - device = open_device(num_gpus=num_gpus) - return device - - -def skip_attack(is_target, p_tar_attack, p_non_attack): - p = torch.rand(1).item() - if is_target: - if p > p_tar_attack: - return True - else: - if p > p_non_attack: - return True - - return False - - -def generate_attacks( - v_file, - key_file, - enroll_file, - test_wav_file, - vad_spec, - vad_path_prefix, - model_path, - embed_layer, - cal_file, - threshold, - output_wav_dir, - attack_info_file, - attack_tag, - p_tar_attack, - p_non_attack, - save_failed, - use_gpu, - seg_part_idx, - num_seg_parts, - random_seed, - **kwargs -): - - device = init_device(use_gpu) - model = init_model(model_path, embed_layer, cal_file, threshold, **kwargs) - model.to(device) - - tar = torch.as_tensor([1], dtype=torch.float).to(device) - non = torch.as_tensor([0], dtype=torch.float).to(device) - - logging.info("loading key and enrollment x-vectors") - key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) - x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype()) - - logging.info("opening audio read stream: %s" % (test_wav_file)) - audio_args = AR.filter_args(**kwargs) - audio_reader = AR(test_wav_file) - wav_scale = audio_reader.wav_scale - - logging.info("opening audio write stream: %s" % (output_wav_dir)) - audio_writer = AW(output_wav_dir, audio_format="flac") - - if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") - - attack_factory = init_attack_factory(**kwargs) - attacks_info = {} - - for j in range(key.num_tests): - t1 = time.time() - logging.info("scoring test utt %s" % (key.seg_set[j])) - s, fs = audio_reader.read([key.seg_set[j]]) - s = s[0] - fs = fs[0] - torch.manual_seed( - random_seed + int(s[0]) - ) # this is to make results reproducible - s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) - - if vad_spec is not None: - vad = v_reader.read([key.seg_set[j]])[0] - tot_frames = len(vad) - speech_frames = np.sum(vad) - vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( - device - ) - model.vad_t = vad - logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key.seg_set[j], - speech_frames, - tot_frames, - speech_frames / tot_frames * 100, - ) - ) - - t2 
= time.time() - - trial_time = 0 - num_trials = 0 - for i in range(key.num_models): - trial_id = "%s-%s" % (key.model_set[i], key.seg_set[j]) - if key.tar[i, j] or key.non[i, j]: - t3 = time.time() - if skip_attack(key.tar[i, j], p_tar_attack, p_non_attack): - logging.info("skipping attack for tar trial %s" % (trial_id)) - continue - - model.x_e = x_e[i].to(device) - with torch.no_grad(): - score_benign = model(s) - - if key.tar[i, j] and score_benign < 0: - logging.info( - "target trial %s failed benign classification, skipping..." - % (trial_id) - ) - continue - elif key.non[i, j] and score_benign > 0: - logging.info( - "non-target trial %s failed benign classification, skipping..." - % (trial_id) - ) - continue - - attack = attack_factory.sample_attack(model) - if key.tar[i, j]: - t = non if attack.targeted else tar - else: - t = tar if attack.targeted else non - - attack_info = attack.attack_info - s_adv = attack.generate(s, t).detach() - with torch.no_grad(): - # we add the threshold back here to make sure the scores are well calibrated - score_adv = model(s_adv) - - t4 = time.time() - trial_time += t4 - t3 - num_trials += 1 - success = True - if key.tar[i, j] and score_adv > 0: - success = False - if not save_failed: - logging.info( - "attack on target trial %s failed, skipping..." % (trial_id) - ) - continue - elif key.non[i, j] and score_adv < 0: - success = False - if not save_failed: - logging.info( - "attack on non-target trial %s failed benign classification, skipping..." - % (trial_id) - ) - continue - if success: - logging.info("attack on trial %s successful" % (trial_id)) - - stats_ij = compute_stats_adv_attack(s, s_adv) - stats_ij = [float(stat.detach().cpu().numpy()[0]) for stat in stats_ij] - - s_adv = s_adv.cpu().numpy()[0] - key_attack = "%s-%s" % (trial_id, attack_tag) - output_wav = audio_writer.write(key_attack, s_adv, fs) - - attack_info.update( - { - "attack_tag": attack_tag, - "wav_path": output_wav[0], - "class_name": "target" if key.tar[i, j] else "non-target", - "class_id": int(key.tar[i, j]), - "key_benign": trial_id, - "enroll": str(key.model_set[i]), - "test_benign": str(key.seg_set[j]), - "snr": stats_ij[0], - "px": stats_ij[1], - "pn": stats_ij[2], - "x_l2": stats_ij[3], - "x_linf": stats_ij[4], - "n_l0": stats_ij[5], - "n_l2": stats_ij[6], - "n_linf": stats_ij[7], - "num_samples": s.shape[-1], - "success": success, - } - ) - attacks_info[key_attack] = attack_info - - if num_trials > 0: - trial_time /= num_trials - t7 = time.time() - logging.info( - ( - "utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d " - "rt-factor=%.4f" - ) - % ( - key.seg_set[j], - t7 - t1, - t2 - t1, - trial_time, - num_trials, - num_trials * len(s) / fs / (t7 - t1), - ) - ) - - logging.info("saving attack info to %s" % (attack_info_file)) - Path(attack_info_file).parent.mkdir(parents=True, exist_ok=True) - - with open(attack_info_file, "w") as f: - # only save if we have successful attacks - if attacks_info: - yaml.dump(attacks_info, f, sort_keys=True) - - -if __name__ == "__main__": - - parser = ArgumentParser( - description="Generate Attacks for speaker verification with x-vectors+cos+calibration" - ) - - parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--v-file", required=True) - parser.add_argument("--key-file", default=None) - parser.add_argument("--enroll-file", required=True) - parser.add_argument("--test-wav-file", required=True) - parser.add_argument("--attack-tag", required=True) - - AR.add_class_args(parser) - 
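    # NOTE (editor): --cal-file/--threshold (declared below) feed init_model,
    # which wraps the cosine score in Calibrator(lr.A[0, 0], lr.b[0] - threshold).
    # Folding the threshold into the calibration offset puts the decision
    # point at score 0, which is what the attack loop assumes when it tests
    # score > 0 (accept) vs. score < 0 (reject). In effect:
    #
    #   score_cal = a * score_raw + (b - threshold)  # accept iff score_cal > 0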
AF.add_class_args(parser, prefix="feats") - - parser.add_argument("--vad", dest="vad_spec", default=None) - parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), - ) - - parser.add_argument("--model-path", required=True) - parser.add_argument( - "--embed-layer", - type=int, - default=None, - help=( - "classifier layer to get the embedding from," - "if None the layer set in training phase is used" - ), - ) - - parser.add_argument( - "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" - ) - - parser.add_argument("--cal-file", default=None, help="score calibration file") - parser.add_argument("--threshold", default=0, type=float, help="decision threshold") - - RandomAttackFactory.add_class_args(parser, prefix="attacks") - - parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) - parser.add_argument( - "--num-seg-parts", - default=1, - type=int, - help=( - "number of parts in which we divide the test list " - "to run evaluation in parallel" - ), - ) - - parser.add_argument( - "--output-wav-dir", default=None, help="output path of adv signals" - ) - parser.add_argument( - "--attack-info-file", - default=None, - help="output path of to save information about the generated attacks", - ) - parser.add_argument( - "--random-seed", default=1234, type=int, help="random seed for pytorch" - ) - - parser.add_argument( - "--p-tar-attack", - type=float, - default=1, - help=("probability of generating an attack for a target trial"), - ) - parser.add_argument( - "--p-non-attack", - type=float, - default=1, - help=("probability of generating an attack for a non-target trial"), - ) - parser.add_argument( - "--save-failed", - default=False, - action="store_true", - help=("save failed attacks also"), - ) - - parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) - - args = parser.parse_args() - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - generate_attacks(**namespace_to_dict(args)) diff --git a/hyperion/bin/torch-train-xvec-from-wav.py b/hyperion/bin/torch-train-xvec-from-wav.py deleted file mode 100755 index 8dcd0482..00000000 --- a/hyperion/bin/torch-train-xvec-from-wav.py +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time -import logging -import multiprocessing - -import numpy as np - -import torch -import torch.nn as nn - -from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device -from hyperion.torch.utils import ddp -from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer -from hyperion.torch.data import AudioDataset as AD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.narchs import AudioFeatsMVN as AF -from hyperion.torch.models import ResNetXVector as RXVec -from hyperion.torch.models import EfficientNetXVector as EXVec -from hyperion.torch.models import TDNNXVector as TDXVec -from hyperion.torch.models import TransformerXVectorV1 as TFXVec - -xvec_dict = { - "resnet": RXVec, - "efficientnet": EXVec, - "tdnn": TDXVec, - "transformer": TFXVec, 
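    # NOTE (editor): registry pattern; __main__ below creates one subcommand
    # per key via subcommands.add_subcommand(k, make_parser(v)) and passes
    # the selected class to train_xvec as args_sc.xvec_class.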
-} - - -def init_data( - audio_path, - train_list, - val_list, - train_aug_cfg, - val_aug_cfg, - num_workers, - num_gpus, - rank, - **kwargs -): - - ad_args = AD.filter_args(**kwargs) - sampler_args = Sampler.filter_args(**kwargs) - if rank == 0: - logging.info("audio dataset args={}".format(ad_args)) - logging.info("sampler args={}".format(sampler_args)) - logging.info("init datasets") - - train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) - val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) - - if rank == 0: - logging.info("init samplers") - train_sampler = Sampler(train_data, **sampler_args) - val_sampler = Sampler(val_data, **sampler_args) - - num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ( - {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} - ) - - train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler=train_sampler, **largs - ) - - test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler=val_sampler, **largs - ) - - return train_loader, test_loader - - -def init_feats(rank, **kwargs): - feat_args = AF.filter_args(**kwargs["feats"]) - if rank == 0: - logging.info("feat args={}".format(feat_args)) - logging.info("initializing feature extractor") - feat_extractor = AF(trans=True, **feat_args) - if rank == 0: - logging.info("feat-extractor={}".format(feat_extractor)) - return feat_extractor - - -def init_xvector(num_classes, rank, xvec_class, **kwargs): - - xvec_args = xvec_class.filter_args(**kwargs) - if rank == 0: - logging.info("xvector network args={}".format(xvec_args)) - xvec_args["num_classes"] = num_classes - model = xvec_class(**xvec_args) - if rank == 0: - logging.info("x-vector-model={}".format(model)) - return model - - -def train_xvec(gpu_id, args): - - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - kwargs = namespace_to_dict(args) - torch.manual_seed(args.seed) - set_float_cpu("float32") - - ddp_args = ddp.filter_ddp_args(**kwargs) - device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs["rank"] = rank - - train_loader, test_loader = init_data(**kwargs) - feat_extractor = init_feats(**kwargs) - model = init_xvector(train_loader.dataset.num_classes, **kwargs) - - trn_args = Trainer.filter_args(**kwargs) - if rank == 0: - logging.info("trainer args={}".format(trn_args)) - metrics = {"acc": CategoricalAccuracy()} - trainer = Trainer( - model, - feat_extractor, - device=device, - metrics=metrics, - ddp=world_size > 1, - **trn_args - ) - if args.resume: - trainer.load_last_checkpoint() - trainer.fit(train_loader, test_loader) - - ddp.ddp_cleanup() - - -def make_parser(xvec_class): - parser = ArgumentParser() - - parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--audio-path", required=True) - parser.add_argument("--train-list", required=True) - parser.add_argument("--val-list", required=True) - - AD.add_class_args(parser) - Sampler.add_class_args(parser) - - parser.add_argument("--train-aug-cfg", default=None) - parser.add_argument("--val-aug-cfg", default=None) - - parser.add_argument( - "--num-workers", type=int, default=5, help="num_workers of data loader" - ) - - AF.add_class_args(parser, prefix="feats") - xvec_class.add_class_args(parser) - Trainer.add_class_args(parser) - ddp.add_ddp_args(parser) - parser.add_argument("--seed", type=int, default=1123581321, help="random seed") - parser.add_argument( - "--resume", - action="store_true", - default=False, - 
help="resume training from checkpoint", - ) - parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) - - return parser - - -if __name__ == "__main__": - - parser = ArgumentParser(description="Train XVector from audio files") - - parser.add_argument("--cfg", action=ActionConfigFile) - - subcommands = parser.add_subcommands() - - for k, v in xvec_dict.items(): - parser_k = make_parser(v) - subcommands.add_subcommand(k, parser_k) - - args = parser.parse_args() - try: - gpu_id = int(os.environ["LOCAL_RANK"]) - except: - gpu_id = 0 - - xvec_type = args.subcommand - args_sc = vars(args)[xvec_type] - - if gpu_id == 0: - try: - config_file = Path(args_sc.exp_path) / "config.yaml" - parser.save(args, str(config_file), format="yaml", overwrite=True) - except: - pass - - args_sc.xvec_class = xvec_dict[xvec_type] - # torch docs recommend using forkserver - multiprocessing.set_start_method("forkserver") - train_xvec(gpu_id, args_sc) diff --git a/hyperion/bin/train_dino_wav2xvector.py b/hyperion/bin/train_dino_wav2xvector.py new file mode 100755 index 00000000..88d3a556 --- /dev/null +++ b/hyperion/bin/train_dino_wav2xvector.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +from pathlib import Path + +import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.data import DINOAudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.losses import CosineDINOLoss, DINOLoss +from hyperion.torch.metrics import CategoricalAccuracy + +# from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import Wav2ConformerV1XVector as CXVec +from hyperion.torch.models import Wav2ResNet1dXVector as R1dXVec +from hyperion.torch.models import Wav2ResNetXVector as RXVec + +# from hyperion.torch.models import SpineNetXVector as SpineXVec +# from hyperion.torch.models import TDNNXVector as TDXVec +# from hyperion.torch.models import TransformerXVectorV1 as TFXVec +from hyperion.torch.trainers import DINOXVectorTrainer as Trainer +from hyperion.torch.utils import ddp + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + "conformer": CXVec, + # "efficientnet": EXVec, + # "tdnn": TDXVec, + # "transformer": TFXVec, + # "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = 
torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_student_xvector(num_classes, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_args(**kwargs["student_model"]) + if rank == 0: + logging.info(f"student xvector network args={xvec_args}") + xvec_args["xvector"]["num_classes"] = num_classes + model = xvec_class(**xvec_args) + if rank == 0: + logging.info(f"student-model={model}") + return model + + +def init_teacher_xvector(student_model, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_args(**kwargs["teacher_model"]) + if rank == 0: + logging.info(f"teacher xvector network args={xvec_args}") + # xvec_args["xvector"]["num_classes"] = num_classes + model = student_model.clone() + model.change_config(**xvec_args) + if rank == 0: + logging.info(f"teacher-model={model}") + return model + + +def init_dino_loss(rank, **kwargs): + loss_args = kwargs["dino_loss"] + if rank == 0: + logging.info(f"dino loss args={loss_args}") + loss = DINOLoss(**loss_args) + if rank == 0: + logging.info(f"dino-loss={loss}") + + return loss + + +def init_cosine_loss(rank, **kwargs): + loss_args = kwargs["cosine_loss"] + if rank == 0: + logging.info(f"cosine loss args={loss_args}") + + if loss_args["scale"] <= 0: + return None + + loss = CosineDINOLoss(**loss_args) + if rank == 0: + logging.info(f"cosine-loss={loss}") + + return loss + + +def train_xvec(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + + dino_loss = init_dino_loss(**kwargs) + cosine_loss = init_cosine_loss(**kwargs) + student_model = init_student_xvector(num_classes=dino_loss.num_classes, **kwargs) + kwargs["student_model"] = student_model + teacher_model = init_teacher_xvector(**kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + student_model, + teacher_model, + dino_loss, + cosine_loss=cosine_loss, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + AD.add_class_args(train_parser, prefix="dataset") + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset") + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + 
"data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + xvec_class.add_class_args(parser, prefix="student_model") + xvec_class.add_dino_teacher_args(parser, prefix="teacher_model") + DINOLoss.add_class_args(parser, prefix="dino_loss") + CosineDINOLoss.add_class_args(parser, prefix="cosine_loss") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +def main(): + parser = ArgumentParser(description="Train Wav2XVector from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except Exception as err: + logging.warning(f"failed saving {args} to {config_file} with {err}") + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_plda.py b/hyperion/bin/train_plda.py new file mode 100644 index 00000000..b33afa31 --- /dev/null +++ b/hyperion/bin/train_plda.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python +""" + Copyright 2024 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +import sys +import time +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.pdfs import PLDAFactory +from hyperion.np.transforms import LDA, PCA, CentWhiten, LNorm, TransformList +from hyperion.utils import SegmentSet + + +def load_data(segments_file, feats_file, class_name): + logging.info("loading data") + segments = SegmentSet.load(segments_file) + reader = DRF.create(feats_file) + x = reader.read(segments["id"], squeeze=True) + _, y = np.unique(segments[class_name], return_inverse=True) + return segments, x, y + + +def train_pca(x, pca_lnorm, pca_args): + pca_var_r = pca_args["pca_var_r"] + logging.info("computing pca pca_var_r=%f", pca_var_r) + pca = None + pca_lnorm = None + if pca_var_r < 1: + if pca_lnorm: + logging.info("LNorm before PCA") + pca_lnorm = LNorm(name="pca_lnorm") + x = pca_lnorm(x) + + pca = PCA(**pca_args) + pca.fit(x) + x = pca(x) + logging.info("pca-dim=%d", x.shape[1]) + + return x, pca_lnorm, pca + + +def train_plda( + segments_file, + feats_file, + class_name, + preproc_file, + plda_file, + pca, + lda, + plda, + pca_lnorm, + do_lda, + lda_lnorm, + plda_lnorm, + plda_center, + plda_whiten, +): + segments, x, y = load_data(segments_file, feats_file, class_name) + transform_list = [] + + x, pca_lnorm, pca_model = train_pca(x, pca_lnorm, pca) + if pca_lnorm is not None: + 
transform_list.append(pca_lnorm) + + if pca_model is not None: + transform_list.append(pca_model) + + if do_lda and x.shape[1] > lda["lda_dim"]: + if lda_lnorm: + logging.info("LNorm before LDA") + t = LNorm(name="lda_lnorm") + x = t(x) + transform_list.append(t) + + logging.info("Training LDA") + lda_model = LDA(**lda) + lda_model.fit(x, y) + x = lda_model(x) + transform_list.append(lda_model) + + if plda_center or plda_whiten: + if plda_lnorm: + t = LNorm(update_mu=plda_center, update_T=plda_whiten, name="plda_lnorm") + else: + t = CentWhiten(update_mu=plda_center, update_T=plda_whiten, name="plda_cw") + + logging.info("Training Center/Whiten/LNorm") + t.fit(x) + logging.info("Center/Whiten/LNorm before PLDA") + x = t(x) + transform_list.append(t) + elif plda_lnorm: + logging.info("LNorm before PLDA") + t = LNorm(name="plda_lnorm") + x = t(x) + transform_list.append(t) + + logging.info("Training PLDA") + plda["y_dim"] = min(x.shape[1], plda["y_dim"]) + plda = PLDAFactory.create(**plda) + elbo, elbo_norm = plda.fit(x, y) + + logging.info("Saving Models") + if len(transform_list) > 0: + transform_list = TransformList(transform_list) + transform_list.save(preproc_file) + + plda.save(plda_file) + loss_file = Path(plda_file).with_suffix(".csv") + loss_df = pd.DataFrame( + {"epoch": np.arange(1, len(elbo) + 1), "elbo": elbo, "elbo_norm": elbo_norm} + ) + loss_df.to_csv(loss_file, index=False) + + +def main(): + parser = ArgumentParser(description="Trains PLDA model and embedding preprocessor") + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--feats-file", required=True) + parser.add_argument("--segments-file", required=True) + parser.add_argument("--class-name", default="speaker") + parser.add_argument("--preproc-file", required=True) + parser.add_argument("--plda-file", required=True) + PCA.add_class_args(parser, prefix="pca") + LDA.add_class_args(parser, prefix="lda") + PLDAFactory.add_class_args(parser, prefix="plda") + parser.add_argument("--pca-lnorm", default=False, action=ActionYesNo) + parser.add_argument("--lda-lnorm", default=False, action=ActionYesNo) + parser.add_argument("--do-lda", default=False, action=ActionYesNo) + parser.add_argument("--plda-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--plda-center", default=True, action=ActionYesNo) + parser.add_argument("--plda-whiten", default=True, action=ActionYesNo) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + config_logger(args.verbose) + logging.debug(args) + del args["verbose"] + del args["cfg"] + train_plda(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_qmf.py b/hyperion/bin/train_qmf.py new file mode 100755 index 00000000..42aabe0c --- /dev/null +++ b/hyperion/bin/train_qmf.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + + Trains calibration for SRE18 tel condition +""" + +import logging +import os +import sys +import time +from pathlib import Path + +import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, float_cpu +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.utils.trial_key import TrialKey 
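train_plda.py accumulates each fitted transform (optional LNorm, PCA, LDA, centering/whitening) in application order and serializes the whole chain as a single TransformList next to the PLDA model. A minimal sketch of reusing that chain on test embeddings, assuming TransformList.load mirrors the save() call above and the loaded list applies its members in order when called; the path and dimensions are placeholders:

```python
import numpy as np

from hyperion.np.transforms import TransformList

# hypothetical output of train_plda.py --preproc-file
preproc = TransformList.load("exp/be/preproc.h5")

x_test = np.random.randn(10, 256).astype("float32")  # stand-in x-vectors
x_test = preproc(x_test)  # lnorm -> pca -> lda -> center/whiten, as trained
```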
+from hyperion.utils.trial_scores import TrialScores + + +def print_q_stats(scr, q_names): + for k in q_names: + q_vec = scr.q_measures[k][scr.score_mask] + s = f"{k} stats mean={np.mean(q_vec)} min={np.min(q_vec)} max={np.max(q_vec)} median={np.median(q_vec)}" + logging.info(s) + + +def train_qmf( + score_file, key_file, model_file, prior, lambda_reg, quality_measures, verbose +): + logging.info("load key: %s", key_file) + key = TrialKey.load(key_file) + logging.info("load scores: %s", score_file) + scr = TrialScores.load(score_file) + tar, non = scr.get_tar_non(key) + ntar = len(tar) + nnon = len(non) + + if quality_measures is None: + quality_measures = list(scr.q_measures.keys()) + quality_measures.sort() + + print_q_stats(scr, quality_measures) + q_tar, q_non = scr.get_tar_non_q_measures(key, quality_measures) + + min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f", + min_dcf, + p_miss * 100, + p_fa * 100, + n_miss, + n_fa, + ) + + logging.info("train calibration") + # tar = np.vstack((tar, maxnf_tar, minnf_tar, maxcohmu_tar, mincohmu_tar)).T + # non = np.vstack((non, maxnf_non, minnf_non, maxcohmu_non, mincohmu_non)).T + tar = np.hstack((tar[:, None], q_tar)) + non = np.hstack((non[:, None], q_non)) + + x = np.vstack((tar, non)) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) + lr = LR( + prior=prior, + lambda_reg=lambda_reg, + bias_scaling=1, + solver="liblinear", + verbose=verbose, + ) + lr.fit(x, y) + logging.info(f"A={lr.A} b={lr.b}") + logging.info("save calibration at %s", model_file) + lr.save(model_file) + + logging.info("calibrate scores") + tar_cal = lr.predict(tar) + non_cal = lr.predict(non) + act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f", + act_dcf, + p_miss * 100, + p_fa * 100, + n_miss, + n_fa, + ) + + score_file = Path(score_file) + output_file = score_file.with_suffix(f".qmf{score_file.suffix}") + scr_out = TrialScores(key.model_set, key.seg_set) + scr_out.scores[key.tar] = tar_cal + scr_out.scores[key.non] = non_cal + scr_out.score_mask = np.logical_or(key.tar, key.non) + scr_out.save(output_file) + + +def main(): + parser = ArgumentParser(description="Trains QMF calibration") + + parser.add_argument("--score-file", required=True) + parser.add_argument("--key-file", required=True) + parser.add_argument("--model-file", required=True) + parser.add_argument("--prior", type=float, default=0.01) + parser.add_argument("--lambda-reg", type=float, default=1e-5) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument( + "--quality-measures", + default=None, + nargs="+", + choices=["snorm-mu/s", "snorm-mu", "speech_duration", "num_speech_frames"], + ) + + args = parser.parse_args() + config_logger(args.verbose) + logging.debug(args) + + train_qmf(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_tokenizer.py b/hyperion/bin/train_tokenizer.py new file mode 100644 index 00000000..cd8ab9cf --- /dev/null +++ b/hyperion/bin/train_tokenizer.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python +""" + Copyright 2024 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import 
os +from pathlib import Path +from typing import Dict, List + +import sentencepiece as spm +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.utils import PathLike, SegmentSet + +tokenizer_list = ["sentencepiece"] + + +def add_common_args(parser): + parser.add_argument( + "--segments-file", + required=True, + help="input segments file with sentence transcriptions", + ) + parser.add_argument( + "--text-column", default="text", help="text column in segments file" + ) + parser.add_argument("--tokenizer-path", required=True, help="tokenizer model dir") + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + ) + + +def train_sentencepiece( + segments_file: PathLike, + text_column: str, + vocab_size: int, + model_type: str, + char_coverage: str, + sentence_size: int, + user_defined_symbols: List[str], + unk_id: int, + sos_id: int, + eos_id: int, + pad_id: int, + unk_piece: str, + sos_piece: str, + eos_piece: str, + pad_piece: str, + uppercase_text: bool, + tokenizer_path: PathLike, +): + from hyperion.torch.tokenizers import SPTokenizer + + tokenizer_path = Path(tokenizer_path) + tokenizer_path.mkdir(exist_ok=True, parents=True) + + text_file = tokenizer_path / "text" + if not text_file.is_file(): + segments = SegmentSet.load(segments_file) + with open(text_file, "w", encoding="utf-8") as f_text: + for text in segments[text_column]: + if uppercase_text: + text = text.upper() + f_text.write(f"{text}\n") + + model_prefix = tokenizer_path / "tokenizer" + model_file = model_prefix.with_suffix(".model") + if not model_file.is_file(): + spm.SentencePieceTrainer.train( + input=text_file, + vocab_size=vocab_size, + model_type=model_type, + model_prefix=str(model_prefix), + input_sentence_size=sentence_size, + character_coverage=char_coverage, + user_defined_symbols=user_defined_symbols, + unk_id=unk_id, + bos_id=sos_id, + eos_id=eos_id, + pad_id=pad_id, + unk_piece=unk_piece, + bos_piece=sos_piece, + eos_piece=eos_piece, + pad_piece=pad_piece, + ) + + tokenizer = SPTokenizer.load(model_file) + tokenizer.save(model_file.with_suffix(".yaml")) + + # generate_sentencepiece_tokens(model_file, tokenizer_path) + + +def generate_sentencepiece_tokens(model_file: PathLike, tokenizer_path: PathLike): + sp = spm.SentencePieceProcessor() + sp.load(str(model_file)) + token2id: Dict[str, int] = {sp.id_to_piece(i): i for i in range(sp.vocab_size())} + with open(tokenizer_path / "tokens.txt", "w", encoding="utf-8") as f: + for sym, i in token2id.items(): + f.write(f"{sym} {i}\n") + + +def make_sentencepiece_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--vocab-size", default=1000, type=int, help="output vocabulary size" + ) + parser.add_argument( + "--model-type", default="unigram", choices=["unigram", "bpe", "char", "word"] + ) + parser.add_argument("--char-coverage", default=1.0, type=float) + parser.add_argument("--sentence-size", default=100000000, type=int) + parser.add_argument( + "--user-defined-symbols", + default=["", ""], + nargs="+", + help="user defined symbols", + ) + parser.add_argument("--unk-id", default=2, type=int) + parser.add_argument("--sos-id", default=-1, type=int) + parser.add_argument("--eos-id", default=-1, type=int) + parser.add_argument("--pad-id", default=-1, type=int) + parser.add_argument("--unk-piece", default="") + 
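train_sentencepiece() dumps the transcripts to a plain-text file and delegates the actual training to SentencePiece with the options parsed here. A self-contained round trip with the same library on a toy in-memory corpus; hard_vocab_limit is relaxed only because a two-line corpus cannot fill a realistic vocabulary:

```python
import io

import sentencepiece as spm

corpus = io.StringIO("HELLO WORLD\nHELLO SPEECH\n")
model = io.BytesIO()
spm.SentencePieceTrainer.train(
    sentence_iterator=corpus,
    model_writer=model,
    vocab_size=30,
    model_type="unigram",
    hard_vocab_limit=False,
)
sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
print(sp.encode("HELLO SPEECH", out_type=str))  # e.g. ['▁HELLO', '▁SPEECH']
```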
parser.add_argument("--sos-piece", default="") + parser.add_argument("--eos-piece", default="") + parser.add_argument("--pad-piece", default="") + parser.add_argument("--uppercase-text", default=True, action=ActionYesNo) + + add_common_args(parser) + return parser + + +def main(): + parser = ArgumentParser(description="Train sentence piece tokenizer") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for subcommand in tokenizer_list: + parser_func = f"make_{subcommand}_parser" + subparser = globals()[parser_func]() + subcommands.add_subcommand(subcommand, subparser) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + subcommand = f"train_{args.subcommand}" + kwargs = namespace_to_dict(args)[args.subcommand] + if gpu_id == 0: + try: + config_file = Path(kwargs["tokenizer_path"]) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except Exception as err: + logging.warning(f"failed saving {args} err={err}") + + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + del kwargs["cfg"] + globals()[subcommand](**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2rnn_transducer.py b/hyperion/bin/train_wav2rnn_transducer.py new file mode 100755 index 00000000..6dc314ad --- /dev/null +++ b/hyperion/bin/train_wav2rnn_transducer.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.models import Wav2ConformerV1RNNTransducer, Wav2RNNRNNTransducer +from hyperion.torch.trainers import TransducerTrainer as Trainer +from hyperion.torch.utils import ddp + +model_dict = { + "rnn_rnn_transducer": Wav2RNNRNNTransducer, + "conformer_v1_rnn_transducer": Wav2ConformerV1RNNTransducer, +} + + +def transducer_collate(batch): + audio = [] + audio_length = [] + target = [] + for record in batch: + audio_length.append(record["x"].shape[0]) + audio_length = torch.as_tensor(audio_length) + if not torch.all(audio_length[:-1] >= audio_length[1:]): + sort_idx = torch.argsort(audio_length, descending=True) + batch = [batch[i] for i in sort_idx] + + audio_length = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + audio = pad_sequence(audio) + audio_length = torch.as_tensor(audio_length) + target = k2.RaggedTensor(target) + batch = { + "x": torch.transpose(audio, 0, 1), + "x_lengths": audio_length, + "text": target, + } + return batch + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", 
partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader( + dataset, + batch_sampler=sampler, + **largs, + collate_fn=dataset.get_collator(), # collate_fn=transducer_collate + ) + return data_loader + + +# def init_model(blank_id, vocab_size, rank, model_class, **kwargs): +# model_args = model_class.filter_args(**kwargs["model"]) +# if rank == 0: +# logging.info("model network args={}".format(model_args)) +# # TODO: check model_args +# model_args["transducer"]["decoder"]["blank_id"] = blank_id +# model_args["transducer"]["decoder"]["vocab_size"] = vocab_size +# model = model_class(**model_args) +# if rank == 0: +# logging.info("model={}".format(model)) +# return model + + +def init_model(rank, model_class, tokenizers, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + + tokenizer = list(tokenizers.items())[0][1] + model_args["transducer"]["rnnt_decoder"]["blank_id"] = tokenizer.blank_id + model_args["transducer"]["rnnt_decoder"]["vocab_size"] = tokenizer.vocab_size + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.enabled = False + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + # model = init_model( + # train_loader.dataset.sp.piece_to_id(""), + # train_loader.dataset.sp.get_piece_size(), + # **kwargs, + # ) + + model = init_model( + tokenizers=train_loader.dataset.tokenizers, + **kwargs, + ) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset") + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset") + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + 
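The transducer recipes all collate the same way: pad the waveforms to the longest in the batch, keep the true lengths, and pack the label sequences as a k2 ragged tensor for the RNN-T loss. A toy illustration with invented records, assuming k2 is installed:

```python
import k2
import torch
from torch.nn.utils.rnn import pad_sequence

batch = [
    {"x": torch.randn(16000), "text": [5, 9, 2]},  # 1 s of fake audio
    {"x": torch.randn(12000), "text": [7, 3]},
]
audio = pad_sequence([r["x"] for r in batch])  # (T_max, batch)
collated = {
    "x": audio.transpose(0, 1),  # (batch, T_max), as in transducer_collate
    "x_lengths": torch.as_tensor([r["x"].shape[0] for r in batch]),
    "text": k2.RaggedTensor([r["text"] for r in batch]),
}
print(collated["x"].shape, collated["x_lengths"])
```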
data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + # parser.add_argument( + # "--data.train.dataset.text_file", + # type=str, + # ) + + # parser.add_argument("--data.val.dataset.text_file", type=str) + + # parser.add_argument( + # "--data.train.dataset.bpe_model", + # type=str, + # ) + + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + parser.link_arguments( + "data.train.dataset.tokenizer_mappings", "data.val.dataset.tokenizer_mappings" + ) + parser.link_arguments( + "data.train.dataset.tokenizer_files", "data.val.dataset.tokenizer_files" + ) + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") + + model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +def main(): + parser = ArgumentParser(description="Train Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except Exception as err: + logging.warning(f"{err}") + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + # multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2vec2rnn_transducer.py b/hyperion/bin/train_wav2vec2rnn_transducer.py new file mode 100755 index 00000000..fd94f19d --- /dev/null +++ b/hyperion/bin/train_wav2vec2rnn_transducer.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.models import ( + HFWav2Vec2ConformerV1RNNTransducer, + HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, +) +from hyperion.torch.trainers import TransducerTrainer as Trainer +from hyperion.torch.utils import ddp + +model_dict = { + "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, + "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, + "hf_wav2vec2conformer_v1_rnn_transducer": HFWav2Vec2ConformerV1RNNTransducer, + # 
"hf_hubert2rnn_transducer": HFWav2Vec2RNNTransducer, + # "hf_hubert2rnn_rnn_transducer": Hubert2RNNRNNTransducer, + # "hf_wavlm2rnn_transducer": HFHubert2RNNTransducer, + # "hf_wavlm2rnn_rnn_transducer": HFWavLM2RNNRNNTransducer, +} + + +def transducer_collate(batch): + audio = [] + audio_length = [] + target = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + } + return batch + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) + return data_loader + + +def init_model(blank_id, vocab_size, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + # TODO: check model_args + model_args["transducer"]["decoder"]["blank_id"] = blank_id + model_args["transducer"]["decoder"]["vocab_size"] = vocab_size + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = False + torch.backends.cudnn.enabled = False + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model( + train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + **kwargs, + ) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} # {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, 
val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset") + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset") + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") + + model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +def main(): + parser = ArgumentParser( + description="Train Wav2Vec2Transducer model from audio files" + ) + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + # multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py new file mode 100755 index 00000000..77a22bb8 --- /dev/null +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory 
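These parsers lean on jsonargparse's link_arguments so that options set once for the train partition (num_workers, bpe_model) propagate to the val partition instead of being repeated on the command line. The pattern in isolation, assuming jsonargparse 4.x semantics where the link target is derived and no longer directly settable:

```python
from jsonargparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--train.num_workers", type=int, default=5)
parser.add_argument("--val.num_workers", type=int, default=5)
parser.link_arguments("train.num_workers", "val.num_workers")

cfg = parser.parse_args(["--train.num_workers", "8"])
assert cfg.val.num_workers == 8  # follows the linked source
```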
+from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import HFWav2Vec2Transducer +from hyperion.torch.trainers import TransducerTrainer as Trainer +from hyperion.torch.utils import ddp + +model_dict = { + "hf_wav2vec2transducer": HFWav2Vec2Transducer, +} + + +def transducer_collate(batch): + audio = [] + audio_length = [] + target = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + audio = pad_sequence(audio) + audio_length = torch.as_tensor(audio_length) + target = k2.RaggedTensor(target) + batch = { + "x": torch.transpose(audio, 0, 1), + "x_lengths": audio_length, + "text": target, + } + return batch + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) + return data_loader + + +def init_model(blank_id, vocab_size, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + # TODO: check model_args + model_args["transducer"]["blank_id"] = blank_id + model_args["transducer"]["vocab_size"] = vocab_size + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = False + torch.backends.cudnn.enabled = False + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model( + train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + **kwargs, + ) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} # {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + 
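The per-GPU worker count computed in every init_data above is an integer ceiling division, so data-loader workers are never under-provisioned when num_workers does not divide evenly across GPUs; equivalently:

```python
import math

num_workers, num_gpus = 5, 4
num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus)
assert num_workers_per_gpu == math.ceil(num_workers / num_gpus) == 2
```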
train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") + + model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +def main(): + parser = ArgumentParser( + description="Train Wav2Vec2Transducer model from audio files" + ) + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + # multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py new file mode 100755 index 00000000..c772fe3c --- /dev/null +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing + +# import sys +import os +import time +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import ( + HFHubert2ConformerV1XVector, + HFHubert2ResNet1dXVector, + HFWav2Vec2ConformerV1XVector, + HFWav2Vec2ResNet1dXVector, + 
HFWavLM2ConformerV1XVector, + HFWavLM2ResNet1dXVector, +) +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp + +model_dict = { + "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, + "hf_hubert2resnet1d": HFHubert2ResNet1dXVector, + "hf_wavlm2resnet1d": HFWavLM2ResNet1dXVector, + "hf_wav2vec2conformer": HFWav2Vec2ConformerV1XVector, + "hf_hubert2conformer": HFHubert2ConformerV1XVector, + "hf_wavlm2conformer": HFWavLM2ConformerV1XVector, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_model(num_classes, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + model_args["xvector"]["num_classes"] = num_classes + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info(f"trainer args={trn_args}") + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset") + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset") + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", 
action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +def main(): + parser = ArgumentParser(description="Train Wav2Vec2XVector model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2xvector.py b/hyperion/bin/train_wav2xvector.py new file mode 100755 index 00000000..bb4a3913 --- /dev/null +++ b/hyperion/bin/train_wav2xvector.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +from pathlib import Path + +import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy + +# from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import Wav2ConformerV1XVector as CXVec +from hyperion.torch.models import Wav2ResNet1dXVector as R1dXVec +from hyperion.torch.models import Wav2ResNetXVector as RXVec + +# from hyperion.torch.models import SpineNetXVector as SpineXVec +# from hyperion.torch.models import TDNNXVector as TDXVec +# from hyperion.torch.models import TransformerXVectorV1 as TFXVec +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + "conformer": CXVec, + # "efficientnet": EXVec, + # "tdnn": TDXVec, + # "transformer": TFXVec, + # "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + 
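Every main() in these scripts resolves the worker's device index from the LOCAL_RANK variable that torchrun exports, falling back to 0 for single-process runs. The try/except pattern is equivalent to this more direct lookup (a sketch, not how the scripts currently read it):

```python
import os

# torchrun sets LOCAL_RANK per worker; a plain `python script.py` run does not
gpu_id = int(os.environ.get("LOCAL_RANK", 0))
```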
is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_xvector(num_classes, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["xvector"]["num_classes"] = num_classes + model = xvec_class(**xvec_args) + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model + + +def train_xvec(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + + model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + AD.add_class_args(train_parser, prefix="dataset") + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset") + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + xvec_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +def main(): + parser = ArgumentParser(description="Train Wav2XVector from audio files") + 
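Each architecture in xvec_dict becomes its own jsonargparse subcommand, and main() dispatches on args.subcommand, as in the loop just below. A minimal sketch of the same pattern with invented names and a toy option:

```python
from jsonargparse import ArgumentParser

parser = ArgumentParser()
subcommands = parser.add_subcommands()
for name in ("resnet", "resnet1d"):
    sub = ArgumentParser()
    sub.add_argument("--embed-dim", type=int, default=256)
    subcommands.add_subcommand(name, sub)

args = parser.parse_args(["resnet1d", "--embed-dim", "512"])
assert args.subcommand == "resnet1d"
assert args.resnet1d.embed_dim == 512
```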
parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + logging.warning(f"failed saving {args} to {config_file}") + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_xvector_from_feats.py b/hyperion/bin/train_xvector_from_feats.py new file mode 100755 index 00000000..c79e444f --- /dev/null +++ b/hyperion/bin/train_xvector_from_feats.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import FeatSeqDataset as SD +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import ConformerV1XVector as CXVec +from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import ResNet1dXVector as R1dXVec +from hyperion.torch.models import ResNetXVector as RXVec +from hyperion.torch.models import SpineNetXVector as SpineXVec +from hyperion.torch.models import TDNNXVector as TDXVec +from hyperion.torch.models import TransformerXVectorV1 as TFXVec +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + "efficientnet": EXVec, + "tdnn": TDXVec, + "transformer": TFXVec, + "spinenet": SpineXVec, + "conformer": CXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + kwargs = kwargs["data"][partition] + sd_args = SD.filter_args(**kwargs["dataset"]) + sampler_args = Sampler.filter_args(**kwargs["sampler"]) + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, sd_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + sd_args["is_val"] = partition == "val" + dataset = SD(**sd_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = Sampler(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_xvector(num_classes, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network 
args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes + model = xvec_class(**xvec_args) + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model + + +def train_xvec(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_xvector(train_loader.dataset.num_classes, **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + SD.add_class_args(train_parser, prefix="dataset") + Sampler.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + SD.add_class_args(val_parser, prefix="dataset") + Sampler.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_file", "data.val.dataset.class_file" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + parser.link_arguments( + "data.train.sampler.batch_size", "data.val.sampler.batch_size" + ) + + xvec_class.add_class_args(parser, prefix="model") + Trainer.add_class_args(parser, prefix="trainer") + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +def main(): + parser = ArgumentParser(description="Train XVector from audio files") + + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_xvector_from_wav.py 
b/hyperion/bin/train_xvector_from_wav.py new file mode 100755 index 00000000..eb251ad9 --- /dev/null +++ b/hyperion/bin/train_xvector_from_wav.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +from pathlib import Path + +import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import ConformerV1XVector as CXVec +from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import ResNet1dXVector as R1dXVec +from hyperion.torch.models import ResNetXVector as RXVec +from hyperion.torch.models import SpineNetXVector as SpineXVec +from hyperion.torch.models import TDNNXVector as TDXVec +from hyperion.torch.models import TransformerXVectorV1 as TFXVec +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer +from hyperion.torch.utils import ddp + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + "efficientnet": EXVec, + "tdnn": TDXVec, + "transformer": TFXVec, + "spinenet": SpineXVec, + "conformer": CXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_feats(rank, **kwargs): + feat_args = AF.filter_args(**kwargs["feats"]) + if rank == 0: + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=True, **feat_args) + if rank == 0: + logging.info("feat-extractor={}".format(feat_extractor)) + return feat_extractor + + +def init_xvector(num_classes, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes + model = xvec_class(**xvec_args) + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model + + +def train_xvec(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + 
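`ddp.ddp_init` above hides the distributed setup. For readers unfamiliar with the pattern, a rough stand-alone equivalent using the standard `torch.distributed` API and torchrun-style environment variables might look like the sketch below; this illustrates the usual PyTorch recipe, not Hyperion's exact implementation:

```python
import os

import torch
import torch.distributed as dist


def simple_ddp_init(gpu_id: int):
    """Illustrative stand-in for a ddp_init helper using torchrun env vars."""
    rank = int(os.environ.get("RANK", 0))
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    if world_size > 1:
        # torchrun exports MASTER_ADDR/MASTER_PORT for the rendezvous.
        dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
        # Bind this process to its local GPU before building the model.
        torch.cuda.set_device(gpu_id)
    device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
    return device, rank, world_size
```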
kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + feat_extractor = init_feats(**kwargs) + model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + feat_extractor, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + AD.add_class_args(train_parser, prefix="dataset") + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset") + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + AF.add_class_args(parser, prefix="feats") + xvec_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +def main(): + parser = ArgumentParser(description="Train XVector from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin_deprec/ark2hyp.py b/hyperion/bin_deprec/ark2hyp.py index 45a20712..abcb4457 100755 --- a/hyperion/bin_deprec/ark2hyp.py +++ b/hyperion/bin_deprec/ark2hyp.py @@ -7,9 +7,9 @@ Converts from Ark format to h5 format (deprecated, use copy-feats.py) """ -import sys -import os import argparse +import os +import sys import time import numpy as np diff --git a/hyperion/bin_deprec/arkvad2nist.py b/hyperion/bin_deprec/arkvad2nist.py index bd15592a..559371be 
100755 --- a/hyperion/bin_deprec/arkvad2nist.py +++ b/hyperion/bin_deprec/arkvad2nist.py @@ -7,15 +7,14 @@ Converts from Ark format to NIST OpenSAT """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np - from hyperion.io import KaldiDataReader diff --git a/hyperion/bin_deprec/compute-gmm-post.py b/hyperion/bin_deprec/compute-gmm-post.py index 1b0a8d04..58675336 100755 --- a/hyperion/bin_deprec/compute-gmm-post.py +++ b/hyperion/bin_deprec/compute-gmm-post.py @@ -7,21 +7,20 @@ Computes GMM posteriors """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np - from keras import backend as K -from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger -from hyperion.io import HypDataWriter from hyperion.helpers import SequenceReader as SR -from hyperion.transforms import TransformList +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import HypDataWriter from hyperion.pdfs import DiagGMM +from hyperion.transforms import TransformList def to_sparse(r, num_comp): diff --git a/hyperion/bin_deprec/eval-2class-performance.py b/hyperion/bin_deprec/eval-2class-performance.py index a10ec5c0..eff16830 100755 --- a/hyperion/bin_deprec/eval-2class-performance.py +++ b/hyperion/bin_deprec/eval-2class-performance.py @@ -7,18 +7,18 @@ Evals EER, DCF, DET """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np from hyperion.hyp_defs import config_logger -from hyperion.utils.trial_scores import TrialScores -from hyperion.utils.trial_key import TrialKey from hyperion.metrics import compute_eer +from hyperion.utils.trial_key import TrialKey +from hyperion.utils.trial_scores import TrialScores def eval_2class_performance(score_file, key_file, output_path): diff --git a/hyperion/bin_deprec/eval-elbo-ubm.py b/hyperion/bin_deprec/eval-elbo-ubm.py index 5cf1aa0d..bf4839db 100755 --- a/hyperion/bin_deprec/eval-elbo-ubm.py +++ b/hyperion/bin_deprec/eval-elbo-ubm.py @@ -7,18 +7,18 @@ Evaluate the likelihood of the ubm on some data """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import SequenceReader as SR -from hyperion.transforms import TransformList +from hyperion.hyp_defs import config_logger, float_cpu from hyperion.pdfs import DiagGMM +from hyperion.transforms import TransformList def eval_elbo( diff --git a/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py b/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py index 9e2880f8..4548e49b 100755 --- a/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py +++ b/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py @@ -7,21 +7,21 @@ Evals Q-scoring back-end """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np +from hyperion.classifiers import QScoringHomoGBE as GBE +from hyperion.helpers import ClassifTrialDataReader as TDR from hyperion.hyp_defs import config_logger -from hyperion.utils.trial_ndx import TrialNdx -from hyperion.utils.trial_scores import TrialScores from hyperion.io import HypDataWriter as HDW -from hyperion.helpers import ClassifTrialDataReader as TDR from hyperion.transforms import TransformList -from hyperion.classifiers import QScoringHomoGBE as GBE +from hyperion.utils.trial_ndx import 
TrialNdx +from hyperion.utils.trial_scores import TrialScores def eval_qscoring_gbe( diff --git a/hyperion/bin_deprec/eval-score-norm.py b/hyperion/bin_deprec/eval-score-norm.py index fd6e2e00..4b620518 100755 --- a/hyperion/bin_deprec/eval-score-norm.py +++ b/hyperion/bin_deprec/eval-score-norm.py @@ -7,18 +7,18 @@ Score Normalization """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np from hyperion.hyp_defs import config_logger from hyperion.score_norm import * -from hyperion.utils.trial_scroes import TrialScores from hyperion.utils.trial_ndx import TrialNdx +from hyperion.utils.trial_scores import TrialScores def load_scores(score_file, enr_coh_file, coh_test_file, coh_coh_file): diff --git a/hyperion/bin_deprec/h5vad2nist.py b/hyperion/bin_deprec/h5vad2nist.py index 804c8637..fb45c22b 100755 --- a/hyperion/bin_deprec/h5vad2nist.py +++ b/hyperion/bin_deprec/h5vad2nist.py @@ -7,11 +7,11 @@ Converts from Ark format to NIST OpenSAT """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np diff --git a/hyperion/bin_deprec/init-ubm.py b/hyperion/bin_deprec/init-ubm.py index 8a162314..204ca855 100755 --- a/hyperion/bin_deprec/init-ubm.py +++ b/hyperion/bin_deprec/init-ubm.py @@ -8,20 +8,19 @@ Initialize UBM """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np - from keras import backend as K -from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger -from hyperion.utils.multithreading import threadsafe_generator from hyperion.helpers import SequenceReader as SR +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.pdfs import DiagGMM +from hyperion.utils.multithreading import threadsafe_generator @threadsafe_generator diff --git a/hyperion/bin_deprec/scores2lre_format.py b/hyperion/bin_deprec/scores2lre_format.py index 50e9147f..717c1535 100755 --- a/hyperion/bin_deprec/scores2lre_format.py +++ b/hyperion/bin_deprec/scores2lre_format.py @@ -4,12 +4,12 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time -import re import logging +import os +import re +import sys +import time import numpy as np diff --git a/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-dvae.py b/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-dvae.py index 9adb2cfd..608a5271 100755 --- a/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-dvae.py +++ b/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-dvae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import PairedSeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs import ConformerEncoderV1 as Encoder -from hyperion.torch.narchs import ConformerEncoderV1 as Decoder from hyperion.torch.models import VQVAE as VAE +from hyperion.torch.narchs import ConformerEncoderV1 as Decoder +from 
hyperion.torch.narchs import ConformerEncoderV1 as Encoder from hyperion.torch.trainers import VQDVAETrainer as Trainer -from hyperion.torch.data import PairedSeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-vae.py b/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-vae.py index d227a8b2..a4cc54e6 100755 --- a/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-vae.py +++ b/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-vae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import SeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs import ConformerEncoderV1 as Encoder -from hyperion.torch.narchs import ConformerEncoderV1 as Decoder from hyperion.torch.models import VQVAE as VAE +from hyperion.torch.narchs import ConformerEncoderV1 as Decoder +from hyperion.torch.narchs import ConformerEncoderV1 as Encoder from hyperion.torch.trainers import VQVAETrainer as Trainer -from hyperion.torch.data import SeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-dc1d-dvae.py b/hyperion/bin_deprec/torch-train-dc1d-dvae.py index 343807c2..1b88beba 100755 --- a/hyperion/bin_deprec/torch-train-dc1d-dvae.py +++ b/hyperion/bin_deprec/torch-train-dc1d-dvae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import PairedSeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs import DC1dEncoder as Encoder -from hyperion.torch.narchs import DC1dDecoder as Decoder from hyperion.torch.models import VAE +from hyperion.torch.narchs import DC1dDecoder as Decoder +from hyperion.torch.narchs import DC1dEncoder as Encoder from hyperion.torch.trainers import DVAETrainer as Trainer -from hyperion.torch.data import PairedSeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-dc1d-vae.py b/hyperion/bin_deprec/torch-train-dc1d-vae.py index daa67b3e..dd5d2e72 100755 --- a/hyperion/bin_deprec/torch-train-dc1d-vae.py +++ b/hyperion/bin_deprec/torch-train-dc1d-vae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import SeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs.dc1d_encoder import DC1dEncoder as Encoder -from hyperion.torch.narchs.dc1d_decoder import DC1dDecoder as Decoder from hyperion.torch.models import VAE +from hyperion.torch.narchs.dc1d_decoder import DC1dDecoder as Decoder +from hyperion.torch.narchs.dc1d_encoder import DC1dEncoder as Encoder from hyperion.torch.trainers.vae_trainer import VAETrainer as Trainer -from hyperion.torch.data import SeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-dc2d-dvae.py b/hyperion/bin_deprec/torch-train-dc2d-dvae.py index 2e32b9f9..3f7cb17d 100755 --- a/hyperion/bin_deprec/torch-train-dc2d-dvae.py +++ b/hyperion/bin_deprec/torch-train-dc2d-dvae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import PairedSeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs import DC2dEncoder as Encoder -from hyperion.torch.narchs import DC2dDecoder as Decoder from hyperion.torch.models import VAE +from hyperion.torch.narchs import DC2dDecoder as Decoder +from hyperion.torch.narchs import DC2dEncoder as Encoder from hyperion.torch.trainers import DVAETrainer as Trainer -from hyperion.torch.data import PairedSeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-dc2d-vae.py b/hyperion/bin_deprec/torch-train-dc2d-vae.py index d8675ae9..5b97f55c 100755 --- a/hyperion/bin_deprec/torch-train-dc2d-vae.py +++ b/hyperion/bin_deprec/torch-train-dc2d-vae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import SeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs import DC2dEncoder as Encoder -from hyperion.torch.narchs import DC2dDecoder as Decoder from 
hyperion.torch.models import VAE +from hyperion.torch.narchs import DC2dDecoder as Decoder +from hyperion.torch.narchs import DC2dEncoder as Encoder from hyperion.torch.trainers import VAETrainer as Trainer -from hyperion.torch.data import SeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-resnet1d-dvae.py b/hyperion/bin_deprec/torch-train-resnet1d-dvae.py index 420cf7b2..ca6f6996 100755 --- a/hyperion/bin_deprec/torch-train-resnet1d-dvae.py +++ b/hyperion/bin_deprec/torch-train-resnet1d-dvae.py @@ -3,27 +3,26 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import PairedSeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs import ResNet1dEncoder as Encoder -from hyperion.torch.narchs import ResNet1dDecoder as Decoder from hyperion.torch.models import VAE +from hyperion.torch.narchs import ResNet1dDecoder as Decoder +from hyperion.torch.narchs import ResNet1dEncoder as Encoder from hyperion.torch.trainers import DVAETrainer as Trainer -from hyperion.torch.data import PairedSeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-resnet1d-vae.py b/hyperion/bin_deprec/torch-train-resnet1d-vae.py index a8edb3c3..a6218567 100755 --- a/hyperion/bin_deprec/torch-train-resnet1d-vae.py +++ b/hyperion/bin_deprec/torch-train-resnet1d-vae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import SeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs import ResNet1dEncoder as Encoder -from hyperion.torch.narchs import ResNet1dDecoder as Decoder from hyperion.torch.models.vae.vae import VAE +from hyperion.torch.narchs import ResNet1dDecoder as Decoder +from hyperion.torch.narchs import ResNet1dEncoder as Encoder from hyperion.torch.trainers import VAETrainer as Trainer -from hyperion.torch.data import SeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-resnet1d-vq-dvae.py b/hyperion/bin_deprec/torch-train-resnet1d-vq-dvae.py index 9571eff8..89448754 100755 --- a/hyperion/bin_deprec/torch-train-resnet1d-vq-dvae.py +++ b/hyperion/bin_deprec/torch-train-resnet1d-vq-dvae.py @@ -3,27 +3,26 @@ 
Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import PairedSeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs import ResNet1dEncoder as Encoder -from hyperion.torch.narchs import ResNet1dDecoder as Decoder from hyperion.torch.models import VQVAE as VAE +from hyperion.torch.narchs import ResNet1dDecoder as Decoder +from hyperion.torch.narchs import ResNet1dEncoder as Encoder from hyperion.torch.trainers import VQDVAETrainer as Trainer -from hyperion.torch.data import PairedSeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-resnet1d-vq-vae.py b/hyperion/bin_deprec/torch-train-resnet1d-vq-vae.py index 373be8f3..4a84bbff 100755 --- a/hyperion/bin_deprec/torch-train-resnet1d-vq-vae.py +++ b/hyperion/bin_deprec/torch-train-resnet1d-vq-vae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import SeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs import ResNet1dEncoder as Encoder -from hyperion.torch.narchs import ResNet1dDecoder as Decoder from hyperion.torch.models import VQVAE as VAE +from hyperion.torch.narchs import ResNet1dDecoder as Decoder +from hyperion.torch.narchs import ResNet1dEncoder as Encoder from hyperion.torch.trainers import VQVAETrainer as Trainer -from hyperion.torch.data import SeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-resnet2d-dvae.py b/hyperion/bin_deprec/torch-train-resnet2d-dvae.py index 6845750f..3f6cd6ba 100755 --- a/hyperion/bin_deprec/torch-train-resnet2d-dvae.py +++ b/hyperion/bin_deprec/torch-train-resnet2d-dvae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import PairedSeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from 
hyperion.torch.narchs import ResNet2dEncoder as Encoder -from hyperion.torch.narchs import ResNet2dDecoder as Decoder from hyperion.torch.models import VAE +from hyperion.torch.narchs import ResNet2dDecoder as Decoder +from hyperion.torch.narchs import ResNet2dEncoder as Encoder from hyperion.torch.trainers import DVAETrainer as Trainer -from hyperion.torch.data import PairedSeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-resnet2d-vae.py b/hyperion/bin_deprec/torch-train-resnet2d-vae.py index 575c5575..4e853230 100755 --- a/hyperion/bin_deprec/torch-train-resnet2d-vae.py +++ b/hyperion/bin_deprec/torch-train-resnet2d-vae.py @@ -3,32 +3,27 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device, ddp -from hyperion.torch.narchs import ResNet2dEncoder as Encoder -from hyperion.torch.narchs import ResNet2dDecoder as Decoder +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import FeatSeqDataset as SD from hyperion.torch.models import VAE +from hyperion.torch.narchs import ResNet2dDecoder as Decoder +from hyperion.torch.narchs import ResNet2dEncoder as Encoder from hyperion.torch.trainers import VAETrainer as Trainer -from hyperion.torch.data import FeatSeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import ddp, open_device def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): diff --git a/hyperion/bin_deprec/torch-train-resnet2d-vq-dvae.py b/hyperion/bin_deprec/torch-train-resnet2d-vq-dvae.py index 95eb3923..5e0add50 100755 --- a/hyperion/bin_deprec/torch-train-resnet2d-vq-dvae.py +++ b/hyperion/bin_deprec/torch-train-resnet2d-vq-dvae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import PairedSeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs import ResNet2dEncoder as Encoder -from hyperion.torch.narchs import ResNet2dDecoder as Decoder from hyperion.torch.models import VQVAE as VAE +from hyperion.torch.narchs import ResNet2dDecoder as Decoder +from hyperion.torch.narchs import ResNet2dEncoder as Encoder from hyperion.torch.trainers import VQDVAETrainer as Trainer -from hyperion.torch.data import PairedSeqDataset as SD -from hyperion.torch.data import 
ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-resnet2d-vq-vae.py b/hyperion/bin_deprec/torch-train-resnet2d-vq-vae.py index 07f25d5f..6398d959 100755 --- a/hyperion/bin_deprec/torch-train-resnet2d-vq-vae.py +++ b/hyperion/bin_deprec/torch-train-resnet2d-vq-vae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import SeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs import ResNet2dEncoder as Encoder -from hyperion.torch.narchs import ResNet2dDecoder as Decoder from hyperion.torch.models import VQVAE as VAE +from hyperion.torch.narchs import ResNet2dDecoder as Decoder +from hyperion.torch.narchs import ResNet2dEncoder as Encoder from hyperion.torch.trainers import VQVAETrainer as Trainer -from hyperion.torch.data import SeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-transformer-enc-v1-dvae.py b/hyperion/bin_deprec/torch-train-transformer-enc-v1-dvae.py index 39ee2974..0137e101 100755 --- a/hyperion/bin_deprec/torch-train-transformer-enc-v1-dvae.py +++ b/hyperion/bin_deprec/torch-train-transformer-enc-v1-dvae.py @@ -4,27 +4,26 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import PairedSeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs import TransformerEncoderV1 as Encoder -from hyperion.torch.narchs import TransformerEncoderV1 as Decoder from hyperion.torch.models import VAE +from hyperion.torch.narchs import TransformerEncoderV1 as Decoder +from hyperion.torch.narchs import TransformerEncoderV1 as Encoder from hyperion.torch.trainers import DVAETrainer as Trainer -from hyperion.torch.data import PairedSeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vae.py b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vae.py index 9f5cbdf8..71021825 100755 --- a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vae.py +++ b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch 
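The long run of hunks in `bin_deprec` is mechanical: imports are regrouped stdlib-first and alphabetized, with third-party and `hyperion` imports following. This is the kind of output an import sorter such as isort produces; a small sketch of applying one programmatically (the project's actual sorter settings are an assumption):

```python
# Requires: pip install isort
import isort

messy = (
    "import sys\n"
    "import os\n"
    "import argparse\n"
    "import time\n"
    "import logging\n"
    "\n"
    "import numpy as np\n"
    "from hyperion.torch.utils import open_device\n"
    "from hyperion.torch.data import ClassWeightedSeqSampler as Sampler\n"
)

# isort.code() returns the source with imports grouped and alphabetized,
# matching the shape of the hunks above (line_length here is illustrative).
print(isort.code(messy, line_length=88))
```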
import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import SeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs import TransformerEncoderV1 as Encoder -from hyperion.torch.narchs import TransformerEncoderV1 as Decoder from hyperion.torch.models.vae.vae import VAE +from hyperion.torch.narchs import TransformerEncoderV1 as Decoder +from hyperion.torch.narchs import TransformerEncoderV1 as Encoder from hyperion.torch.trainers import VAETrainer as Trainer -from hyperion.torch.data import SeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-dvae.py b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-dvae.py index c6246fe3..a6908c4f 100755 --- a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-dvae.py +++ b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-dvae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import PairedSeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs import TransformerEncoderV1 as Encoder -from hyperion.torch.narchs import TransformerEncoderV1 as Decoder from hyperion.torch.models import VQVAE as VAE +from hyperion.torch.narchs import TransformerEncoderV1 as Decoder +from hyperion.torch.narchs import TransformerEncoderV1 as Encoder from hyperion.torch.trainers import VQDVAETrainer as Trainer -from hyperion.torch.data import PairedSeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-vae.py b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-vae.py index 4659e0d8..b3b07682 100755 --- a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-vae.py +++ b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-vae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import SeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs import TransformerEncoderV1 as Encoder -from hyperion.torch.narchs import TransformerEncoderV1 as Decoder from 
hyperion.torch.models import VQVAE as VAE +from hyperion.torch.narchs import TransformerEncoderV1 as Decoder +from hyperion.torch.narchs import TransformerEncoderV1 as Encoder from hyperion.torch.trainers import VQVAETrainer as Trainer -from hyperion.torch.data import SeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-xvector.py b/hyperion/bin_deprec/torch-train-xvector.py index 4cc443ae..4c69eb25 100755 --- a/hyperion/bin_deprec/torch-train-xvector.py +++ b/hyperion/bin_deprec/torch-train-xvector.py @@ -3,26 +3,27 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -import torch -from torch.utils.data import DataLoader -from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger -from hyperion.torch.torch_defs import float_torch -from hyperion.torch.utils import open_device -from hyperion.torch.data import SeqDataset, ClassWeightedSeqSampler as Sampler -from hyperion.torch.helpers import TorchNALoader +import torch +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import SeqDataset from hyperion.torch.helpers import OptimizerFactory as OF -from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF +from hyperion.torch.helpers import TorchNALoader from hyperion.torch.layers import GlobalPool1dFactory as PF -from hyperion.torch.seq_embed import XVector, XVectorTrainer +from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.seq_embed import XVector, XVectorTrainer +from hyperion.torch.torch_defs import float_torch +from hyperion.torch.utils import open_device +from torch.utils.data import DataLoader def train_xvector( diff --git a/hyperion/bin_deprec/train-q-scoring-homo-gbe.py b/hyperion/bin_deprec/train-q-scoring-homo-gbe.py index 69780865..8a348728 100755 --- a/hyperion/bin_deprec/train-q-scoring-homo-gbe.py +++ b/hyperion/bin_deprec/train-q-scoring-homo-gbe.py @@ -7,18 +7,18 @@ Trains Q-scoring back-end """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger +from hyperion.classifiers import QScoringHomoGBE as GBE from hyperion.helpers import VectorClassReader as VCR +from hyperion.hyp_defs import config_logger from hyperion.transforms import TransformList -from hyperion.classifiers import QScoringHomoGBE as GBE def train_qscoring_backend(iv_file, train_list, preproc_file, output_path, **kwargs): diff --git a/hyperion/bin_deprec/vectors2scores.py b/hyperion/bin_deprec/vectors2scores.py index cc936115..ab4be8ac 100755 --- a/hyperion/bin_deprec/vectors2scores.py +++ b/hyperion/bin_deprec/vectors2scores.py @@ -4,11 +4,11 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time +import os import re +import sys +import time import numpy as np diff --git a/hyperion/bin/apply-mvn-select-frames.py b/hyperion/bin_deprec2/apply-mvn-select-frames.py similarity index 95% rename from hyperion/bin/apply-mvn-select-frames.py rename to hyperion/bin_deprec2/apply-mvn-select-frames.py 
index 71c52cda..a2456dc9 100755 --- a/hyperion/bin/apply-mvn-select-frames.py +++ b/hyperion/bin_deprec2/apply-mvn-select-frames.py @@ -4,27 +4,23 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from hyperion.hyp_defs import config_logger -from hyperion.utils.kaldi_matrix import compression_methods -from hyperion.utils import Utt2Info from hyperion.io import DataWriterFactory as DWF -from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import RandomAccessDataReaderFactory as RDRF -from hyperion.feats import MeanVarianceNorm as MVN -from hyperion.feats import FrameSelector as FSel +from hyperion.io import SequentialDataReaderFactory as DRF +from hyperion.np.feats import FrameSelector as FSel +from hyperion.np.feats import MeanVarianceNorm as MVN +from hyperion.utils import Utt2Info +from hyperion.utils.kaldi_matrix import compression_methods def process_feats( diff --git a/hyperion/bin/compute-mfcc-feats.py b/hyperion/bin_deprec2/compute-mfcc-feats.py similarity index 95% rename from hyperion/bin/compute-mfcc-feats.py rename to hyperion/bin_deprec2/compute-mfcc-feats.py index 589d3188..a83f95d1 100755 --- a/hyperion/bin/compute-mfcc-feats.py +++ b/hyperion/bin_deprec2/compute-mfcc-feats.py @@ -3,25 +3,21 @@ Copyright 2018 Jesus Villalba (Johns Hopkins University) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from hyperion.hyp_defs import config_logger +from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import SequentialDataReaderFactory as DRF -from hyperion.io import DataWriterFactory as DWF from hyperion.io import compression_methods -from hyperion.feats import MFCC +from hyperion.np.feats import MFCC def compute_mfcc_feats( diff --git a/hyperion/bin/copy-feats.py b/hyperion/bin_deprec2/copy-feats.py similarity index 99% rename from hyperion/bin/copy-feats.py rename to hyperion/bin_deprec2/copy-feats.py index 1ef044f5..0385cc55 100755 --- a/hyperion/bin/copy-feats.py +++ b/hyperion/bin_deprec2/copy-feats.py @@ -5,18 +5,17 @@ Copy features/vectors and change format """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np from hyperion.hyp_defs import config_logger from hyperion.io import CopyFeats as CF - if __name__ == "__main__": parser = argparse.ArgumentParser( diff --git a/hyperion/bin/eval-cos-1vs1.py b/hyperion/bin_deprec2/eval-cos-1vs1.py similarity index 94% rename from hyperion/bin/eval-cos-1vs1.py rename to hyperion/bin_deprec2/eval-cos-1vs1.py index 123221f2..de508333 100755 --- a/hyperion/bin/eval-cos-1vs1.py +++ b/hyperion/bin_deprec2/eval-cos-1vs1.py @@ -7,19 +7,19 @@ Evals cosine scoring """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger +from hyperion.helpers 
import TrialDataReader as TDR +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.np.transforms import LNorm, TransformList from hyperion.utils.trial_ndx import TrialNdx from hyperion.utils.trial_scores import TrialScores -from hyperion.helpers import TrialDataReader as TDR -from hyperion.transforms import TransformList, LNorm def eval_cos( diff --git a/hyperion/bin/eval-linear-gbe-up.py b/hyperion/bin_deprec2/eval-linear-gbe-up.py similarity index 93% rename from hyperion/bin/eval-linear-gbe-up.py rename to hyperion/bin_deprec2/eval-linear-gbe-up.py index 287117fd..d82bf967 100755 --- a/hyperion/bin/eval-linear-gbe-up.py +++ b/hyperion/bin_deprec2/eval-linear-gbe-up.py @@ -7,21 +7,21 @@ Evals linear GBE with uncertainty propagation. """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger +from hyperion.helpers import ClassifTrialDataReader as TDR +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import HypDataWriter as HDW +from hyperion.np.classifiers import LinearGBEUP as GBE +from hyperion.np.transforms import TransformList from hyperion.utils.trial_ndx import TrialNdx from hyperion.utils.trial_scores import TrialScores -from hyperion.io import HypDataWriter as HDW -from hyperion.helpers import ClassifTrialDataReader as TDR -from hyperion.transforms import TransformList -from hyperion.classifiers import LinearGBEUP as GBE def eval_linear_gbe( diff --git a/hyperion/bin/eval-linear-gbe.py b/hyperion/bin_deprec2/eval-linear-gbe.py similarity index 93% rename from hyperion/bin/eval-linear-gbe.py rename to hyperion/bin_deprec2/eval-linear-gbe.py index a93b6c39..cf788392 100755 --- a/hyperion/bin/eval-linear-gbe.py +++ b/hyperion/bin_deprec2/eval-linear-gbe.py @@ -7,21 +7,21 @@ Evals linear GBE """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger +from hyperion.helpers import ClassifTrialDataReader as TDR +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import HypDataWriter as HDW +from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.transforms import TransformList from hyperion.utils.trial_ndx import TrialNdx from hyperion.utils.trial_scores import TrialScores -from hyperion.io import HypDataWriter as HDW -from hyperion.helpers import ClassifTrialDataReader as TDR -from hyperion.transforms import TransformList -from hyperion.classifiers import LinearGBE as GBE def eval_linear_gbe( diff --git a/hyperion/bin/eval-linear-svmc.py b/hyperion/bin_deprec2/eval-linear-svmc.py similarity index 93% rename from hyperion/bin/eval-linear-svmc.py rename to hyperion/bin_deprec2/eval-linear-svmc.py index ff7b1faa..ba4c5e81 100755 --- a/hyperion/bin/eval-linear-svmc.py +++ b/hyperion/bin_deprec2/eval-linear-svmc.py @@ -7,21 +7,21 @@ Evals SVM """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger +from hyperion.helpers import ClassifTrialDataReader as TDR +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import HypDataWriter as HDW +from hyperion.np.classifiers import LinearSVMC as SVM +from hyperion.np.transforms import 
TransformList from hyperion.utils.trial_ndx import TrialNdx from hyperion.utils.trial_scores import TrialScores -from hyperion.io import HypDataWriter as HDW -from hyperion.helpers import ClassifTrialDataReader as TDR -from hyperion.transforms import TransformList -from hyperion.classifiers import LinearSVMC as SVM def eval_svm( diff --git a/hyperion/bin/eval-logistic-regression.py b/hyperion/bin_deprec2/eval-logistic-regression.py similarity index 93% rename from hyperion/bin/eval-logistic-regression.py rename to hyperion/bin_deprec2/eval-logistic-regression.py index d96e2473..992ca7b8 100755 --- a/hyperion/bin/eval-logistic-regression.py +++ b/hyperion/bin_deprec2/eval-logistic-regression.py @@ -7,21 +7,21 @@ Evals logistic regression """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger +from hyperion.helpers import ClassifTrialDataReader as TDR +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import HypDataWriter as HDW +from hyperion.np.classifiers import LogisticRegression as LR +from hyperion.np.transforms import TransformList from hyperion.utils.trial_ndx import TrialNdx from hyperion.utils.trial_scores import TrialScores -from hyperion.io import HypDataWriter as HDW -from hyperion.helpers import ClassifTrialDataReader as TDR -from hyperion.transforms import TransformList -from hyperion.classifiers import LogisticRegression as LR def eval_lr( diff --git a/hyperion/bin/eval-plda-1vs1.py b/hyperion/bin_deprec2/eval-plda-1vs1.py similarity index 95% rename from hyperion/bin/eval-plda-1vs1.py rename to hyperion/bin_deprec2/eval-plda-1vs1.py index 715d043a..5a810cf7 100755 --- a/hyperion/bin/eval-plda-1vs1.py +++ b/hyperion/bin_deprec2/eval-plda-1vs1.py @@ -7,20 +7,20 @@ Evals PDDA LLR """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger +from hyperion.helpers import PLDAFactory as F +from hyperion.helpers import TrialDataReader as TDR +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.np.transforms import TransformList from hyperion.utils.trial_ndx import TrialNdx from hyperion.utils.trial_scores import TrialScores -from hyperion.helpers import TrialDataReader as TDR -from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList def eval_plda( diff --git a/hyperion/bin/eval-plda-nvs1.py b/hyperion/bin_deprec2/eval-plda-nvs1.py similarity index 95% rename from hyperion/bin/eval-plda-nvs1.py rename to hyperion/bin_deprec2/eval-plda-nvs1.py index 30ea2606..5c5d200c 100755 --- a/hyperion/bin/eval-plda-nvs1.py +++ b/hyperion/bin_deprec2/eval-plda-nvs1.py @@ -7,20 +7,20 @@ Evals PLDA LLR """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger +from hyperion.helpers import PLDAFactory as F +from hyperion.helpers import TrialDataReader as TDR +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.np.transforms import TransformList from hyperion.utils.trial_ndx import TrialNdx from hyperion.utils.trial_scores import TrialScores -from hyperion.helpers import TrialDataReader as TDR -from hyperion.helpers import PLDAFactory as F -from 
diff --git a/hyperion/bin/merge-h5-files.py b/hyperion/bin_deprec2/merge-h5-files.py
similarity index 99%
rename from hyperion/bin/merge-h5-files.py
rename to hyperion/bin_deprec2/merge-h5-files.py
index a807c69c..aeda3bab 100755
--- a/hyperion/bin/merge-h5-files.py
+++ b/hyperion/bin_deprec2/merge-h5-files.py
@@ -6,10 +6,11 @@
 """
 Merges multiple hdf5 files into one file
 """
-import sys
-import os
 import argparse
+import os
+import sys
 import time
+
 import numpy as np
 from hyperion.io import H5Merger
diff --git a/hyperion/bin/pack-audio-files.py b/hyperion/bin_deprec2/pack-audio-files.py
similarity index 99%
rename from hyperion/bin/pack-audio-files.py
rename to hyperion/bin_deprec2/pack-audio-files.py
index 4953d345..5d544df4 100755
--- a/hyperion/bin/pack-audio-files.py
+++ b/hyperion/bin_deprec2/pack-audio-files.py
@@ -3,19 +3,19 @@
 Copyright 2020 Jesus Villalba (Johns Hopkins University)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
-
 import math
+import os
+import sys
+import time
+
 import numpy as np
-from scipy import signal, ndimage
+from scipy import ndimage, signal
 from hyperion.hyp_defs import config_logger
-from hyperion.io import SequentialAudioReader as AR
 from hyperion.io import PackedAudioWriter as Writer
+from hyperion.io import SequentialAudioReader as AR
 from hyperion.io import VADReaderFactory as VRF
 from hyperion.io import WSpecifier as WS
diff --git a/hyperion/bin/plot-vector-hist.py b/hyperion/bin_deprec2/plot-vector-hist.py
similarity index 97%
rename from hyperion/bin/plot-vector-hist.py
rename to hyperion/bin_deprec2/plot-vector-hist.py
index cd86b1c1..75236726 100755
--- a/hyperion/bin/plot-vector-hist.py
+++ b/hyperion/bin_deprec2/plot-vector-hist.py
@@ -4,21 +4,21 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time
-import numpy as np
 import matplotlib
+import numpy as np
 matplotlib.use("Agg")
 import matplotlib.pyplot as plt
-from hyperion.hyp_defs import config_logger
 from hyperion.helpers import VectorReader as VR
-from hyperion.transforms import TransformList
+from hyperion.hyp_defs import config_logger
+from hyperion.np.transforms import TransformList
 def plot_vector_hist(
diff --git a/hyperion/bin/rttm-to-bin-vad.py b/hyperion/bin_deprec2/rttm-to-bin-vad.py
similarity index 98%
rename from hyperion/bin/rttm-to-bin-vad.py
rename to hyperion/bin_deprec2/rttm-to-bin-vad.py
index 9c51ba2c..19e98d8f 100755
--- a/hyperion/bin/rttm-to-bin-vad.py
+++ b/hyperion/bin_deprec2/rttm-to-bin-vad.py
@@ -3,18 +3,18 @@
 # Apache 2.0.
 #
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time
 import numpy as np
 import pandas as pd
 from hyperion.hyp_defs import config_logger
-from hyperion.utils import SegmentList, RTTM
 from hyperion.io import DataWriterFactory as DWF
+from hyperion.utils import RTTM, SegmentList
 def rttm_to_bin_vad(
diff --git a/hyperion/bin/segments-to-bin-vad.py b/hyperion/bin_deprec2/segments-to-bin-vad.py
similarity index 96%
rename from hyperion/bin/segments-to-bin-vad.py
rename to hyperion/bin_deprec2/segments-to-bin-vad.py
index 2b3a7d91..24021a4b 100755
--- a/hyperion/bin/segments-to-bin-vad.py
+++ b/hyperion/bin_deprec2/segments-to-bin-vad.py
@@ -3,23 +3,19 @@
 # Apache 2.0.
 #
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging
 import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 from hyperion.hyp_defs import config_logger
-from hyperion.utils import SegmentList
 from hyperion.io import DataWriterFactory as DWF
+from hyperion.utils import SegmentList
 def segments_to_bin_vad(
diff --git a/hyperion/bin/torch-adv-finetune-xvec-from-wav.py b/hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py
similarity index 97%
rename from hyperion/bin/torch-adv-finetune-xvec-from-wav.py
rename to hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py
index eb118102..ad33515c 100755
--- a/hyperion/bin/torch-adv-finetune-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py
@@ -3,35 +3,29 @@
 Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
-from hyperion.torch.utils import ddp
-from hyperion.torch.models import XVector as XVec
-from hyperion.torch.trainers import XVectorAdvTrainerFromWav as Trainer
+from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.adv_attacks import AttackFactory
 from hyperion.torch.data import AudioDataset as AD
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
 from hyperion.torch.metrics import CategoricalAccuracy
+from hyperion.torch.models import XVector as XVec
 from hyperion.torch.narchs import AudioFeatsMVN as AF
-from hyperion.torch.adv_attacks import AttackFactory
-from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.trainers import XVectorAdvTrainerFromWav as Trainer
+from hyperion.torch.utils import ddp, open_device
 def init_data(
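Apart from the moves into `bin_deprec2`, most hunks in this batch are mechanical import reordering: imports are regrouped into standard-library, third-party, and first-party blocks, alphabetized within each block, with parenthesized imports rewrapped (the layout `isort` produces; the exact interleaving of `import` and `from` lines follows the tool's configuration). A sketch of the resulting shape, using names from the hunk above:

```python
# Standard library, alphabetized
import logging
import multiprocessing
import os
import sys
import time
from pathlib import Path

# Third-party packages
import numpy as np
from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
                          namespace_to_dict)
import torch
import torch.nn as nn

# First-party (hyperion itself), alphabetized by module path
from hyperion.hyp_defs import config_logger, set_float_cpu
from hyperion.torch import TorchModelLoader as TML
from hyperion.torch.data import AudioDataset as AD
from hyperion.torch.utils import ddp, open_device
```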
diff --git a/hyperion/bin/torch-adv-finetune-xvec.py b/hyperion/bin_deprec2/torch-adv-finetune-xvec.py
similarity index 98%
rename from hyperion/bin/torch-adv-finetune-xvec.py
rename to hyperion/bin_deprec2/torch-adv-finetune-xvec.py
index ae2cb37b..850233e2 100755
--- a/hyperion/bin/torch-adv-finetune-xvec.py
+++ b/hyperion/bin_deprec2/torch-adv-finetune-xvec.py
@@ -4,32 +4,27 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device, ddp
-from hyperion.torch.models import XVector as XVec
-from hyperion.torch.trainers import XVectorAdvTrainer as Trainer
-from hyperion.torch.data import FeatSeqDataset as SD
+from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.adv_attacks import AttackFactory
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import FeatSeqDataset as SD
 from hyperion.torch.metrics import CategoricalAccuracy
-from hyperion.torch.adv_attacks import AttackFactory
-from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.models import XVector as XVec
+from hyperion.torch.trainers import XVectorAdvTrainer as Trainer
+from hyperion.torch.utils import ddp, open_device
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin/torch-compute-mfcc-feats.py b/hyperion/bin_deprec2/torch-compute-mfcc-feats.py
similarity index 95%
rename from hyperion/bin/torch-compute-mfcc-feats.py
rename to hyperion/bin_deprec2/torch-compute-mfcc-feats.py
index 5f7d9f7d..07f71bfb 100755
--- a/hyperion/bin/torch-compute-mfcc-feats.py
+++ b/hyperion/bin_deprec2/torch-compute-mfcc-feats.py
@@ -3,26 +3,21 @@
 Copyright 2018 Jesus Villalba (Johns Hopkins University)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging
-import torch
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+import torch
 from hyperion.hyp_defs import config_logger
+from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
 from hyperion.io import SequentialDataReaderFactory as DRF
-from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import compression_methods
 from hyperion.torch.layers import AudioFeatsFactory as AFF
-from hyperion.feats import MFCC
 def compute_mfcc_feats(
diff --git a/hyperion/bin/torch-eval-vae.py b/hyperion/bin_deprec2/torch-eval-vae.py
similarity index 98%
rename from hyperion/bin/torch-eval-vae.py
rename to hyperion/bin_deprec2/torch-eval-vae.py
index dfcdaa38..d676b0f1 100755
--- a/hyperion/bin/torch-eval-vae.py
+++ b/hyperion/bin_deprec2/torch-eval-vae.py
@@ -3,19 +3,15 @@
 Copyright 2020 Jesus Villalba (Johns Hopkins University)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import time
 import logging
+import time
 from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import matplotlib
 import numpy as np
 import pandas as pd
-import matplotlib
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 matplotlib.use("Agg")
 # matplotlib.rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
@@ -23,16 +19,14 @@
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.utils import Utt2Info
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
-from hyperion.feats import MeanVarianceNorm as MVN
-
-from hyperion.torch.utils import open_device
+from hyperion.np.feats import MeanVarianceNorm as MVN
 from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.utils import open_device
+from hyperion.utils import Utt2Info
 def init_device(use_gpu):
diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py
similarity index 97%
rename from hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py
rename to hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py
index 8d55b719..aaa91214 100755
--- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py
@@ -3,43 +3,35 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging
+# [Added Sonal May21]
+from pathlib import Path
 import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.io import RandomAccessDataReaderFactory as DRF
-from hyperion.io import RandomAccessAudioReader as AR
 from hyperion.io import AudioWriter as AW
-from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores
-from hyperion.utils.list_utils import ismember
+from hyperion.io import RandomAccessAudioReader as AR
+from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
-from hyperion.classifiers import BinaryLogisticRegression as LR
-
-from hyperion.torch.utils import open_device
-from hyperion.torch.layers import LinBinCalibrator as Calibrator
-from hyperion.torch.narchs import AudioFeatsMVN as AF
-from hyperion.torch.utils.misc import l2_norm, compute_stats_adv_attack
+from hyperion.np.classifiers import BinaryLogisticRegression as LR
 from hyperion.torch import TorchModelLoader as TML
-
 from hyperion.torch.adv_attacks import AttackFactory
-
-# [Added Sonal May21]
-from pathlib import Path
 from hyperion.torch.adv_defenses.wave_gan_white import WaveGANDefender
+from hyperion.torch.layers import LinBinCalibrator as Calibrator
+from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.utils import open_device
+from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
+from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.list_utils import ismember
 torch.backends.cudnn.enabled = False
diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py
similarity index 97%
rename from hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py
rename to hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py
index a5783654..437127b2 100755
--- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py
@@ -3,39 +3,32 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging
 import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.io import RandomAccessDataReaderFactory as DRF
-from hyperion.io import RandomAccessAudioReader as AR
 from hyperion.io import AudioWriter as AW
-from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores
-from hyperion.utils.list_utils import ismember
+from hyperion.io import RandomAccessAudioReader as AR
+from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
-from hyperion.classifiers import BinaryLogisticRegression as LR
-
-from hyperion.torch.utils import open_device
-from hyperion.torch.layers import LinBinCalibrator as Calibrator
-from hyperion.torch.narchs import AudioFeatsMVN as AF
-from hyperion.torch.utils.misc import l2_norm, compute_stats_adv_attack
+from hyperion.np.classifiers import BinaryLogisticRegression as LR
 from hyperion.torch import TorchModelLoader as TML
-
 from hyperion.torch.adv_attacks import AttackFactory
+from hyperion.torch.layers import LinBinCalibrator as Calibrator
+from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.utils import open_device
+from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
+from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.list_utils import ismember
 class MyModel(nn.Module):
diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-art-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py
similarity index 96%
rename from hyperion/bin/torch-eval-xvec-cosine-scoring-from-art-test-wav.py
rename to hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py
index 44a3b98f..8d4add76 100755
--- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-art-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py
@@ -4,42 +4,34 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging
 import numpy as np
 import pandas as pd
+from art.classifiers import PyTorchClassifier
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.io import RandomAccessDataReaderFactory as DRF
-from hyperion.io import RandomAccessAudioReader as AR
 from hyperion.io import AudioWriter as AW
-from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores
-from hyperion.utils.list_utils import ismember
+from hyperion.io import RandomAccessAudioReader as AR
+from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
-from hyperion.classifiers import BinaryLogisticRegression as LR
-
-from hyperion.torch.utils import open_device
+from hyperion.np.classifiers import BinaryLogisticRegression as LR
+from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.adv_attacks.art_attack_factory import \
+    ARTAttackFactory as AttackFactory
 from hyperion.torch.layers import LinBinCalibrator as Calibrator
 from hyperion.torch.narchs import AudioFeatsMVN as AF
-from hyperion.torch.utils.misc import l2_norm, compute_stats_adv_attack
-from hyperion.torch import TorchModelLoader as TML
-
-from art.classifiers import PyTorchClassifier
-from hyperion.torch.adv_attacks.art_attack_factory import (
-    ARTAttackFactory as AttackFactory,
-)
+from hyperion.torch.utils import open_device
+from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
+from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.list_utils import ismember
 def init_device(use_gpu):
diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py
similarity index 97%
rename from hyperion/bin/torch-eval-xvec-cosine-scoring-from-test-wav.py
rename to hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py
index c7bcc50a..0e9493c0 100755
--- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py
@@ -4,35 +4,29 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.io import RandomAccessAudioReader as AR
-from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores
-from hyperion.utils.list_utils import ismember
+from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
-from hyperion.classifiers import BinaryLogisticRegression as LR
-
-from hyperion.torch.utils import open_device
+from hyperion.np.classifiers import BinaryLogisticRegression as LR
+from hyperion.torch import TorchModelLoader as TML
 from hyperion.torch.layers import LinBinCalibrator as Calibrator
 from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.utils import open_device
 from hyperion.torch.utils.misc import l2_norm
-from hyperion.torch import TorchModelLoader as TML
+from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.list_utils import ismember
 def init_device(use_gpu):
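These `*-cosine-scoring-*` scripts share a single scoring step: length-normalize the enrollment and test embeddings (the `l2_norm` helper imported above) and take their inner product, optionally followed by the linear calibrator. A minimal sketch of that scoring rule in PyTorch, not the scripts' exact code:

```python
import torch

def cosine_scores(x_enroll: torch.Tensor, x_test: torch.Tensor) -> torch.Tensor:
    """Cosine scoring: inner product of L2-normalized embeddings.

    x_enroll: (n_enroll, dim), x_test: (n_test, dim);
    returns an (n_enroll, n_test) score matrix.
    """
    x_enroll = torch.nn.functional.normalize(x_enroll, p=2, dim=-1)
    x_test = torch.nn.functional.normalize(x_test, p=2, dim=-1)
    return x_enroll @ x_test.t()
```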
diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py
similarity index 97%
rename from hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py
rename to hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py
index 4b08c7ab..e0754498 100755
--- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py
@@ -3,39 +3,32 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging
 import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.io import RandomAccessDataReaderFactory as DRF
-from hyperion.io import RandomAccessAudioReader as AR
 from hyperion.io import AudioWriter as AW
-from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores
-from hyperion.utils.list_utils import ismember
+from hyperion.io import RandomAccessAudioReader as AR
+from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
-from hyperion.classifiers import BinaryLogisticRegression as LR
-
-from hyperion.torch.utils import open_device
-from hyperion.torch.layers import LinBinCalibrator as Calibrator
-from hyperion.torch.narchs import AudioFeatsMVN as AF
-from hyperion.torch.utils.misc import l2_norm, compute_stats_adv_attack
+from hyperion.np.classifiers import BinaryLogisticRegression as LR
 from hyperion.torch import TorchModelLoader as TML
-
 from hyperion.torch.adv_attacks import AttackFactory
+from hyperion.torch.layers import LinBinCalibrator as Calibrator
+from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.utils import open_device
+from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
+from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.list_utils import ismember
 class MyModel(nn.Module):
diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py
similarity index 97%
rename from hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py
rename to hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py
index 9d9d4666..0f9f375d 100755
--- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py
@@ -4,42 +4,34 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging
 import numpy as np
 import pandas as pd
+from art.classifiers import PyTorchClassifier
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.io import RandomAccessDataReaderFactory as DRF
-from hyperion.io import RandomAccessAudioReader as AR
 from hyperion.io import AudioWriter as AW
-from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores
-from hyperion.utils.list_utils import ismember
+from hyperion.io import RandomAccessAudioReader as AR
+from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
-from hyperion.classifiers import BinaryLogisticRegression as LR
-
-from hyperion.torch.utils import open_device
+from hyperion.np.classifiers import BinaryLogisticRegression as LR
+from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.adv_attacks.art_attack_factory import \
+    ARTAttackFactory as AttackFactory
 from hyperion.torch.layers import LinBinCalibrator as Calibrator
 from hyperion.torch.narchs import AudioFeatsMVN as AF
-from hyperion.torch.utils.misc import l2_norm, compute_stats_adv_attack
-from hyperion.torch import TorchModelLoader as TML
-
-from art.classifiers import PyTorchClassifier
-from hyperion.torch.adv_attacks.art_attack_factory import (
-    ARTAttackFactory as AttackFactory,
-)
+from hyperion.torch.utils import open_device
+from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
+from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.list_utils import ismember
 class MyModel(nn.Module):
diff --git a/hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py b/hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py
similarity index 86%
rename from hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py
rename to hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py
index bf227045..fc494448 100755
--- a/hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py
+++ b/hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py
@@ -4,32 +4,26 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging
 import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.utils import Utt2Info, RTTM
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
 from hyperion.io import VADReaderFactory as VRF
-from hyperion.augment import SpeechAugment
-
-from hyperion.torch.utils import open_device
-from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.np.augment import SpeechAugment
 from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.utils import open_device
+from hyperion.utils import RTTM, Utt2Info
 def init_device(use_gpu):
@@ -154,21 +148,6 @@ def extract_xvectors(
         t3 = time.time()
         key, x = augment(key0, x0, augmenter, aug_df, aug_id)
-        # if augmenter is None:
-        #     x = x0
-        #     key = key0
-        # else:
-        #     x, aug_info = augmenter(x0)
-        #     key = '%s-aug-%02d' % (key0, aug_id)
-        #     aug_df_row = {'key_aug': key, 'key_orig': key0,
-        #                   'noise_type': aug_info['noise']['noise_type'],
-        #                   'snr': aug_info['noise']['snr'],
-        #                   'rir_type': aug_info['reverb']['rir_type'],
-        #                   'srr': aug_info['reverb']['srr'],
-        #                   'sdr': aug_info['sdr']}
-
-        #     aug_df.append(pd.DataFrame(aug_df_row, index=[0]))
-
         x_total = x
         max_samples = x.shape[0]
         y = np.zeros(
@@ -219,17 +198,6 @@
                 key, x, min_utt_length, max_utt_length, rng
             )
-            # if random_utt_length:
-            #     utt_length = rng.randint(
-            #         low=min_utt_length, high=max_utt_length+1)
-            #     if utt_length < x.shape[1]:
-            #         first_frame = rng.randint(
-            #             low=0, high=x.shape[1]-utt_length)
-            #         x = x[:,first_frame:first_frame+utt_length]
-            #         logging.info(
-            #             'extract-random-utt %s of length=%d first-frame=%d' % (
-            #                 key, x.shape[1], first_frame))
-
             t6 = time.time()
             if x.shape[1] > 0:
                 x = x.transpose(1, 2).contiguous()
diff --git a/hyperion/bin/torch-extract-xvectors-slidwin-from-wav.py b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py
similarity index 98%
rename from hyperion/bin/torch-extract-xvectors-slidwin-from-wav.py
rename to hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py
index e3ab70e9..c85fe4c9 100755
--- a/hyperion/bin/torch-extract-xvectors-slidwin-from-wav.py
+++ b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py
@@ -4,33 +4,27 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging
 import numpy as np
 import pandas as pd
 import yaml
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.utils import Utt2Info
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
 from hyperion.io import VADReaderFactory as VRF
-from hyperion.augment import SpeechAugment
-
-from hyperion.torch.utils import open_device
-from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.np.augment import SpeechAugment
 from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.utils import open_device
+from hyperion.utils import Utt2Info
 def init_device(use_gpu):
diff --git a/hyperion/bin/torch-extract-xvectors-slidwin.py b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py
similarity index 98%
rename from hyperion/bin/torch-extract-xvectors-slidwin.py
rename to hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py
index 0e2f0173..6da57e16 100755
--- a/hyperion/bin/torch-extract-xvectors-slidwin.py
+++ b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py
@@ -4,30 +4,24 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
-
 from hyperion.hyp_defs import config_logger, float_cpu
-from hyperion.utils import Utt2Info
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
-from hyperion.feats import MeanVarianceNorm as MVN
-
-from hyperion.torch.utils import open_device
+from hyperion.np.feats import MeanVarianceNorm as MVN
 from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.utils import open_device
+from hyperion.utils import Utt2Info
 def init_device(use_gpu):
diff --git a/hyperion/bin/torch-extract-xvectors-vae-preproc.py b/hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py
similarity index 91%
rename from hyperion/bin/torch-extract-xvectors-vae-preproc.py
rename to hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py
index 376de911..6edf60ed 100755
--- a/hyperion/bin/torch-extract-xvectors-vae-preproc.py
+++ b/hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py
@@ -4,30 +4,24 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.utils import Utt2Info
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
-from hyperion.feats import MeanVarianceNorm as MVN
-
-from hyperion.torch.utils import open_device
+from hyperion.np.feats import MeanVarianceNorm as MVN
 from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.utils import open_device
+from hyperion.utils import Utt2Info
 def init_device(use_gpu):
@@ -96,21 +90,6 @@ def extract_xvectors(
     keys = []
     info = []
-    # num_gpus = 1 if use_gpu else 0
-    # logging.info('initializing devices num_gpus={}'.format(num_gpus))
-    # device = open_device(num_gpus=num_gpus)
-    # logging.info('loading x-vector model {}'.format(xvec_model_path))
-    # xvec_model = TML.load(xvec_model_path)
-    # xvec_model.to(device)
-    # xvec_model.eval()
-    # logging.info('x-vector={}'.format(xvec_model))
-
-    # logging.info('loading vae model {}'.format(vae_model_path))
-    # vae_model = TML.load(vae_model_path)
-    # vae_model.to(device)
-    # vae_model.eval()
-    # logging.info('vae={}'.format(vae_model))
-
     mse_loss = torch.nn.MSELoss()
     dr_args = DRF.filter_args(**kwargs)
@@ -151,7 +130,7 @@
         t4 = time.time()
         if x.shape[0] == 0:
-            y = np.zeros((model.embed_dim,), dtype=float_cpu())
+            y = np.zeros((xvec_model.embed_dim,), dtype=float_cpu())
         else:
             xx = torch.tensor(x.T[None, :], dtype=torch.get_default_dtype())
             with torch.no_grad():
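Besides the rename, the last hunk above fixes a real bug in `torch-extract-xvectors-vae-preproc.py`: the zero-embedding fallback for utterances with no speech frames referenced an undefined name `model`, while the loaded x-vector network in scope is `xvec_model`, so the old line would raise a `NameError` whenever VAD removed all frames. A self-contained sketch of the corrected fallback logic (the helper name `embed_or_zero` is hypothetical; the script inlines this):

```python
import numpy as np

def embed_or_zero(x, xvec_model, dtype=np.float32):
    """Return a zero embedding when VAD left no speech frames, else None.

    The bug fixed above: this branch used to read `model.embed_dim`,
    but the variable actually in scope is `xvec_model`.
    """
    if x.shape[0] == 0:
        return np.zeros((xvec_model.embed_dim,), dtype=dtype)
    return None  # caller falls through to the real forward pass
```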
diff --git a/hyperion/bin/torch-extract-xvectors.py b/hyperion/bin_deprec2/torch-extract-xvectors.py
similarity index 97%
rename from hyperion/bin/torch-extract-xvectors.py
rename to hyperion/bin_deprec2/torch-extract-xvectors.py
index 18bab96f..76d941e0 100755
--- a/hyperion/bin/torch-extract-xvectors.py
+++ b/hyperion/bin_deprec2/torch-extract-xvectors.py
@@ -4,30 +4,24 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.utils import Utt2Info
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
-from hyperion.feats import MeanVarianceNorm as MVN
-
-from hyperion.torch.utils import open_device
+from hyperion.np.feats import MeanVarianceNorm as MVN
 from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.utils import open_device
+from hyperion.utils import Utt2Info
 def init_device(use_gpu):
diff --git a/hyperion/bin/torch-train-dc1d-ae.py b/hyperion/bin_deprec2/torch-train-dc1d-ae.py
similarity index 99%
rename from hyperion/bin/torch-train-dc1d-ae.py
rename to hyperion/bin_deprec2/torch-train-dc1d-ae.py
index e7547927..50ac7d42 100755
--- a/hyperion/bin/torch-train-dc1d-ae.py
+++ b/hyperion/bin_deprec2/torch-train-dc1d-ae.py
@@ -3,27 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time
 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import SeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs.dc1d_encoder import DC1dEncoder as Encoder
-from hyperion.torch.narchs.dc1d_decoder import DC1dDecoder as Decoder
 from hyperion.torch.models import AE
+from hyperion.torch.narchs.dc1d_decoder import DC1dDecoder as Decoder
+from hyperion.torch.narchs.dc1d_encoder import DC1dEncoder as Encoder
 from hyperion.torch.trainers import AETrainer as Trainer
-from hyperion.torch.data import SeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device
 def train_ae(
diff --git a/hyperion/bin/torch-train-dvae.py b/hyperion/bin_deprec2/torch-train-dvae.py
similarity index 94%
rename from hyperion/bin/torch-train-dvae.py
rename to hyperion/bin_deprec2/torch-train-dvae.py
index e13c26ac..808bfbba 100755
--- a/hyperion/bin/torch-train-dvae.py
+++ b/hyperion/bin_deprec2/torch-train-dvae.py
@@ -3,36 +3,30 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device, ddp
-from hyperion.torch.narchs import DC1dEncoder, DC1dDecoder
-from hyperion.torch.narchs import DC2dEncoder, DC2dDecoder
-from hyperion.torch.narchs import ResNet1dEncoder, ResNet1dDecoder
-from hyperion.torch.narchs import ResNet2dEncoder, ResNet2dDecoder
-from hyperion.torch.narchs import TransformerEncoderV1
-from hyperion.torch.narchs import ConformerEncoderV1
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import PairedFeatSeqDataset as SD
 from hyperion.torch.models import VAE
+from hyperion.torch.narchs import (ConformerEncoderV1, DC1dDecoder,
+                                   DC1dEncoder, DC2dDecoder, DC2dEncoder,
+                                   ResNet1dDecoder, ResNet1dEncoder,
+                                   ResNet2dDecoder, ResNet2dEncoder,
+                                   TransformerEncoderV1)
 from hyperion.torch.trainers import DVAETrainer as Trainer
-from hyperion.torch.data import PairedFeatSeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import ddp, open_device
 enc_dict = {
     "dc1d": DC1dEncoder,
diff --git a/hyperion/bin/torch-train-efficientnet-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py
similarity index 96%
rename from hyperion/bin/torch-train-efficientnet-xvec-from-wav.py
rename to hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py
index 6d7c41ee..f256f735 100755
--- a/hyperion/bin/torch-train-efficientnet-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py
@@ -3,32 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
-from hyperion.torch.utils import ddp
-from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
-from hyperion.torch.models import EfficientNetXVector as XVec
 from hyperion.torch.data import AudioDataset as AD
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
 from hyperion.torch.metrics import CategoricalAccuracy
+from hyperion.torch.models import EfficientNetXVector as XVec
 from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
+from hyperion.torch.utils import ddp, open_device
 def init_data(
diff --git a/hyperion/bin/torch-train-efficientnet-xvec.py b/hyperion/bin_deprec2/torch-train-efficientnet-xvec.py
similarity index 95%
rename from hyperion/bin/torch-train-efficientnet-xvec.py
rename to hyperion/bin_deprec2/torch-train-efficientnet-xvec.py
index c259a590..622ac62e 100755
--- a/hyperion/bin/torch-train-efficientnet-xvec.py
+++ b/hyperion/bin_deprec2/torch-train-efficientnet-xvec.py
@@ -4,32 +4,26 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
-from hyperion.torch.utils import ddp
-from hyperion.torch.trainers import XVectorTrainer as Trainer
-from hyperion.torch.models import EfficientNetXVector as XVec
-from hyperion.torch.data import FeatSeqDataset as SD
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import FeatSeqDataset as SD
 from hyperion.torch.metrics import CategoricalAccuracy
+from hyperion.torch.models import EfficientNetXVector as XVec
+from hyperion.torch.trainers import XVectorTrainer as Trainer
+from hyperion.torch.utils import ddp, open_device
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin/torch-train-resnet-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py
similarity index 97%
rename from hyperion/bin/torch-train-resnet-xvec-from-wav.py
rename to hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py
index 436e4001..3d135b18 100755
--- a/hyperion/bin/torch-train-resnet-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py
@@ -3,41 +3,36 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
-# import torch.multiprocessing as mp
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
-from hyperion.torch.utils import ddp
-
-# from hyperion.torch.helpers import OptimizerFactory as OF
-# from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
-from hyperion.torch.models import ResNetXVector as XVec
 from hyperion.torch.data import AudioDataset as AD
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
 from hyperion.torch.metrics import CategoricalAccuracy
-
+from hyperion.torch.models import ResNetXVector as XVec
 # from hyperion.torch.layers import AudioFeatsFactory as AFF
 # from hyperion.torch.layers import MeanVarianceNorm as MVN
 from hyperion.torch.narchs import AudioFeatsMVN as AF
+# from hyperion.torch.helpers import OptimizerFactory as OF
+# from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
+from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
+from hyperion.torch.utils import ddp, open_device
+
+# import torch.multiprocessing as mp
+
+
+
 # from torch.utils.data import dataloader
 # from torch.multiprocessing import reductions
diff --git a/hyperion/bin/torch-train-resnet-xvec.py b/hyperion/bin_deprec2/torch-train-resnet-xvec.py
similarity index 96%
rename from hyperion/bin/torch-train-resnet-xvec.py
rename to hyperion/bin_deprec2/torch-train-resnet-xvec.py
index 6e7f4242..f976cc6e 100755
--- a/hyperion/bin/torch-train-resnet-xvec.py
+++ b/hyperion/bin_deprec2/torch-train-resnet-xvec.py
@@ -4,32 +4,26 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
-from hyperion.torch.utils import ddp
-from hyperion.torch.trainers import XVectorTrainer as Trainer
-from hyperion.torch.models import ResNetXVector as XVec
-from hyperion.torch.data import FeatSeqDataset as SD
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import FeatSeqDataset as SD
 from hyperion.torch.metrics import CategoricalAccuracy
+from hyperion.torch.models import ResNetXVector as XVec
+from hyperion.torch.trainers import XVectorTrainer as Trainer
+from hyperion.torch.utils import ddp, open_device
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin/torch-train-resnet1d-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py
similarity index 96%
rename from hyperion/bin/torch-train-resnet1d-xvec-from-wav.py
rename to hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py
index bf531745..3ee6bf18 100755
--- a/hyperion/bin/torch-train-resnet1d-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py
@@ -3,34 +3,27 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
-from hyperion.torch.utils import ddp
-from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
-from hyperion.torch.models import ResNet1dXVector as XVec
 from hyperion.torch.data import AudioDataset as AD
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
 from hyperion.torch.metrics import CategoricalAccuracy
-
+from hyperion.torch.models import ResNet1dXVector as XVec
 from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
+from hyperion.torch.utils import ddp, open_device
 def init_data(
diff --git a/hyperion/bin/torch-train-spinenet-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py
similarity index 97%
rename from hyperion/bin/torch-train-spinenet-xvec-from-wav.py
rename to hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py
index 91aa17b1..0857ce5c 100755
--- a/hyperion/bin/torch-train-spinenet-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py
@@ -4,32 +4,27 @@
 Copyright 2020 Magdalena Rybicka
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
-from hyperion.torch.utils import ddp
-from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
-from hyperion.torch.models import SpineNetXVector as XVec
 from hyperion.torch.data import AudioDataset as AD
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
 from hyperion.torch.metrics import CategoricalAccuracy
+from hyperion.torch.models import SpineNetXVector as XVec
 from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
+from hyperion.torch.utils import ddp
 def init_data(
diff --git a/hyperion/bin/torch-train-tdnn-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py
similarity index 96%
rename from hyperion/bin/torch-train-tdnn-xvec-from-wav.py
rename to hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py
index 0ab0cb67..7bbbff03 100755
--- a/hyperion/bin/torch-train-tdnn-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py
@@ -3,32 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
-from hyperion.torch.utils import ddp
-from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
-from hyperion.torch.models import TDNNXVector as XVec
 from hyperion.torch.data import AudioDataset as AD
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
 from hyperion.torch.metrics import CategoricalAccuracy
+from hyperion.torch.models import TDNNXVector as XVec
 from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
+from hyperion.torch.utils import ddp, open_device
 def init_data(
diff --git a/hyperion/bin/torch-train-tdnn-xvec.py b/hyperion/bin_deprec2/torch-train-tdnn-xvec.py
similarity index 95%
rename from hyperion/bin/torch-train-tdnn-xvec.py
rename to hyperion/bin_deprec2/torch-train-tdnn-xvec.py
index 2075ca34..5614f1b9 100755
--- a/hyperion/bin/torch-train-tdnn-xvec.py
+++ b/hyperion/bin_deprec2/torch-train-tdnn-xvec.py
@@ -4,32 +4,26 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
-from hyperion.torch.utils import ddp
-from hyperion.torch.trainers import XVectorTrainer as Trainer
-from hyperion.torch.models import TDNNXVector as XVec
-from hyperion.torch.data import FeatSeqDataset as SD
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import FeatSeqDataset as SD
 from hyperion.torch.metrics import CategoricalAccuracy
+from hyperion.torch.models import TDNNXVector as XVec
+from hyperion.torch.trainers import XVectorTrainer as Trainer
+from hyperion.torch.utils import ddp, open_device
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin/torch-train-transformer-xvec-v1-from-wav.py b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py
similarity index 96%
rename from hyperion/bin/torch-train-transformer-xvec-v1-from-wav.py
rename to hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py
index 636fb390..6b361583 100755
--- a/hyperion/bin/torch-train-transformer-xvec-v1-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py
@@ -3,32 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
-from hyperion.torch.utils import ddp
-from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
-from hyperion.torch.models import TransformerXVectorV1 as XVec
 from hyperion.torch.data import AudioDataset as AD
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
 from hyperion.torch.metrics import CategoricalAccuracy
+from hyperion.torch.models import TransformerXVectorV1 as XVec
 from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
+from hyperion.torch.utils import ddp, open_device
 def init_data(
diff --git a/hyperion/bin/torch-train-transformer-xvec-v1.py b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py
similarity index 95%
rename from hyperion/bin/torch-train-transformer-xvec-v1.py
rename to hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py
index 033408b6..62164f15 100755
--- a/hyperion/bin/torch-train-transformer-xvec-v1.py
+++ b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py
@@ -4,32 +4,26 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
-from hyperion.torch.utils import ddp
-from hyperion.torch.trainers import XVectorTrainer as Trainer
-from hyperion.torch.models import TransformerXVectorV1 as XVec
-from hyperion.torch.data import FeatSeqDataset as SD
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import FeatSeqDataset as SD
 from hyperion.torch.metrics import CategoricalAccuracy
+from hyperion.torch.models import TransformerXVectorV1 as XVec
+from hyperion.torch.trainers import XVectorTrainer as Trainer
+from hyperion.torch.utils import ddp, open_device
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin/torch-train-vae.py b/hyperion/bin_deprec2/torch-train-vae.py
similarity index 94%
rename from hyperion/bin/torch-train-vae.py
rename to hyperion/bin_deprec2/torch-train-vae.py
index 7ceb3014..4c41d49c 100755
--- a/hyperion/bin/torch-train-vae.py
+++ b/hyperion/bin_deprec2/torch-train-vae.py
@@ -3,36 +3,30 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device, ddp
-from hyperion.torch.narchs import DC1dEncoder, DC1dDecoder
-from hyperion.torch.narchs import DC2dEncoder, DC2dDecoder
-from hyperion.torch.narchs import ResNet1dEncoder, ResNet1dDecoder
-from hyperion.torch.narchs import ResNet2dEncoder, ResNet2dDecoder
-from hyperion.torch.narchs import TransformerEncoderV1
-from hyperion.torch.narchs import ConformerEncoderV1
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import FeatSeqDataset as SD
 from hyperion.torch.models import VAE
+from hyperion.torch.narchs import (ConformerEncoderV1, DC1dDecoder,
+                                   DC1dEncoder, DC2dDecoder, DC2dEncoder,
+                                   ResNet1dDecoder, ResNet1dEncoder,
+                                   ResNet2dDecoder, ResNet2dEncoder,
+                                   TransformerEncoderV1)
 from hyperion.torch.trainers import VAETrainer as Trainer
-from hyperion.torch.data import FeatSeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import ddp, open_device
 enc_dict = {
     "dc1d": DC1dEncoder,
diff --git a/hyperion/bin/torch-train-vq-dvae.py b/hyperion/bin_deprec2/torch-train-vq-dvae.py
similarity index 94%
rename from hyperion/bin/torch-train-vq-dvae.py
rename to hyperion/bin_deprec2/torch-train-vq-dvae.py
index 6e49df08..5de1bbd4 100755
--- a/hyperion/bin/torch-train-vq-dvae.py
+++ b/hyperion/bin_deprec2/torch-train-vq-dvae.py
@@ -3,36 +3,30 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device, ddp
-from hyperion.torch.narchs import DC1dEncoder, DC1dDecoder
-from hyperion.torch.narchs import DC2dEncoder, DC2dDecoder
-from hyperion.torch.narchs import ResNet1dEncoder, ResNet1dDecoder
-from hyperion.torch.narchs import ResNet2dEncoder, ResNet2dDecoder
-from hyperion.torch.narchs import TransformerEncoderV1
-from hyperion.torch.narchs import ConformerEncoderV1
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import PairedFeatSeqDataset as SD
 from hyperion.torch.models import VQVAE as VAE
+from hyperion.torch.narchs import (ConformerEncoderV1, DC1dDecoder,
+                                   DC1dEncoder, DC2dDecoder, DC2dEncoder,
+                                   ResNet1dDecoder, ResNet1dEncoder,
+                                   ResNet2dDecoder, ResNet2dEncoder,
+                                   TransformerEncoderV1)
 from hyperion.torch.trainers import VQDVAETrainer as Trainer
-from hyperion.torch.data import PairedFeatSeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import ddp, open_device
 enc_dict = {
     "dc1d": DC1dEncoder,
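The VAE training scripts keep their small factory pattern: `enc_dict` maps an architecture name to an encoder class, so the encoder (and, by the same pattern, the decoder) is selected by a string option. A minimal sketch of that lookup using classes imported above; `dec_dict` and `make_enc_dec` are illustrative assumptions here, since only `enc_dict` is visible in these hunks:

```python
from hyperion.torch.narchs import (DC1dDecoder, DC1dEncoder,
                                   DC2dDecoder, DC2dEncoder)

# Name -> class lookup, as in the scripts' enc_dict pattern.
enc_dict = {"dc1d": DC1dEncoder, "dc2d": DC2dEncoder}
dec_dict = {"dc1d": DC1dDecoder, "dc2d": DC2dDecoder}

def make_enc_dec(enc_type, enc_args, dec_args):
    """Hypothetical helper: instantiate an encoder/decoder pair by name."""
    return enc_dict[enc_type](**enc_args), dec_dict[enc_type](**dec_args)
```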
diff --git a/hyperion/bin/torch-train-vq-vae.py b/hyperion/bin_deprec2/torch-train-vq-vae.py
similarity index 94%
rename from hyperion/bin/torch-train-vq-vae.py
rename to hyperion/bin_deprec2/torch-train-vq-vae.py
index fa8b336c..2a95f853 100755
--- a/hyperion/bin/torch-train-vq-vae.py
+++ b/hyperion/bin_deprec2/torch-train-vq-vae.py
@@ -3,36 +3,30 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device, ddp
-from hyperion.torch.narchs import DC1dEncoder, DC1dDecoder
-from hyperion.torch.narchs import DC2dEncoder, DC2dDecoder
-from hyperion.torch.narchs import ResNet1dEncoder, ResNet1dDecoder
-from hyperion.torch.narchs import ResNet2dEncoder, ResNet2dDecoder
-from hyperion.torch.narchs import TransformerEncoderV1
-from hyperion.torch.narchs import ConformerEncoderV1
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import FeatSeqDataset as SD
 from hyperion.torch.models import VQVAE as VAE
+from hyperion.torch.narchs import (ConformerEncoderV1, DC1dDecoder,
+                                   DC1dEncoder, DC2dDecoder, DC2dEncoder,
+                                   ResNet1dDecoder, ResNet1dEncoder,
+                                   ResNet2dDecoder, ResNet2dEncoder,
+                                   TransformerEncoderV1)
 from hyperion.torch.trainers import VQVAETrainer as Trainer
-from hyperion.torch.data import FeatSeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import ddp, open_device
 enc_dict = {
     "dc1d": DC1dEncoder,
diff --git a/hyperion/bin/train-cw-up.py b/hyperion/bin_deprec2/train-cw-up.py
similarity index 96%
rename from hyperion/bin/train-cw-up.py
rename to hyperion/bin_deprec2/train-cw-up.py
index 48b8dfc4..c1c372ad 100755
--- a/hyperion/bin/train-cw-up.py
+++ b/hyperion/bin_deprec2/train-cw-up.py
@@ -7,18 +7,18 @@
 Trains Centering and whitening with uncertainty prop.
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time
 import numpy as np
-from hyperion.hyp_defs import config_logger
 from hyperion.helpers import VectorReader as VR
-from hyperion.pdfs.core import Normal
-from hyperion.transforms import TransformList, CentWhitenUP, LNormUP
+from hyperion.hyp_defs import config_logger
+from hyperion.np.pdfs.core import Normal
+from hyperion.np.transforms import CentWhitenUP, LNormUP, TransformList
 def load_model(input_path, with_lnorm, name, **kwargs):
diff --git a/hyperion/bin/train-cw.py b/hyperion/bin_deprec2/train-cw.py
similarity index 96%
rename from hyperion/bin/train-cw.py
rename to hyperion/bin_deprec2/train-cw.py
index c64d4892..cabca7c2 100755
--- a/hyperion/bin/train-cw.py
+++ b/hyperion/bin_deprec2/train-cw.py
@@ -7,18 +7,18 @@
 Trains Centering and whitening
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time
 import numpy as np
-from hyperion.hyp_defs import config_logger
 from hyperion.helpers import VectorReader as VR
-from hyperion.pdfs.core import Normal
-from hyperion.transforms import TransformList, CentWhiten, LNorm
+from hyperion.hyp_defs import config_logger
+from hyperion.np.pdfs.core import Normal
+from hyperion.np.transforms import CentWhiten, LNorm, TransformList
 def load_model(input_path, with_lnorm, name, **kwargs):
diff --git a/hyperion/bin/train-gaussianizer.py b/hyperion/bin_deprec2/train-gaussianizer.py
similarity index 96%
rename from hyperion/bin/train-gaussianizer.py
rename to hyperion/bin_deprec2/train-gaussianizer.py
index eefd2456..aeb51e46 100755
--- a/hyperion/bin/train-gaussianizer.py
+++ b/hyperion/bin_deprec2/train-gaussianizer.py
@@ -7,18 +7,18 @@
 Trains Gaussianization for i-vectors.
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time
 import numpy as np
-from hyperion.hyp_defs import config_logger
 from hyperion.helpers import VectorReader as VR
-from hyperion.pdfs.core import Normal
-from hyperion.transforms import TransformList, Gaussianizer
+from hyperion.hyp_defs import config_logger
+from hyperion.np.pdfs.core import Normal
+from hyperion.np.transforms import Gaussianizer, TransformList
 def load_model(input_path, **kwargs):
diff --git a/hyperion/bin/train-lda.py b/hyperion/bin_deprec2/train-lda.py
similarity index 97%
rename from hyperion/bin/train-lda.py
rename to hyperion/bin_deprec2/train-lda.py
index 17cd5ab6..1887a72f 100755
--- a/hyperion/bin/train-lda.py
+++ b/hyperion/bin_deprec2/train-lda.py
@@ -6,17 +6,17 @@
 """
 Trains LDA
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time
 import numpy as np
-from hyperion.hyp_defs import config_logger
 from hyperion.helpers import VectorClassReader as VCR
-from hyperion.transforms import TransformList, LDA, SbSw
+from hyperion.hyp_defs import config_logger
+from hyperion.np.transforms import LDA, SbSw, TransformList
 def train_lda(
diff --git a/hyperion/bin/train-linear-gbe-up.py b/hyperion/bin_deprec2/train-linear-gbe-up.py
similarity index 94%
rename from hyperion/bin/train-linear-gbe-up.py
rename to hyperion/bin_deprec2/train-linear-gbe-up.py
index 3e102b1f..9986b6bc 100755
--- a/hyperion/bin/train-linear-gbe-up.py
+++ b/hyperion/bin_deprec2/train-linear-gbe-up.py
@@ -7,18 +7,18 @@
 Trains linear GBE with uncertainty propagation
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time
 import numpy as np
-from hyperion.hyp_defs import config_logger
 from hyperion.helpers import VectorClassReader as VCR
-from hyperion.transforms import TransformList
-from hyperion.classifiers import LinearGBEUP as GBE
+from hyperion.hyp_defs import config_logger
+from hyperion.np.classifiers import LinearGBEUP as GBE
+from hyperion.np.transforms import TransformList
 def train_linear_gbe(iv_file, train_list, preproc_file, output_path, **kwargs):
diff --git a/hyperion/bin/train-linear-gbe.py b/hyperion/bin_deprec2/train-linear-gbe.py
similarity index 94%
rename from hyperion/bin/train-linear-gbe.py
rename to hyperion/bin_deprec2/train-linear-gbe.py
index 1428358e..e9455cb8 100755
--- a/hyperion/bin/train-linear-gbe.py
+++ b/hyperion/bin_deprec2/train-linear-gbe.py
@@ -7,18 +7,18 @@
 Trains linear Gaussian back-end
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time
 import numpy as np
-from hyperion.hyp_defs import config_logger
 from hyperion.helpers import VectorClassReader as VCR
-from hyperion.transforms import TransformList
-from hyperion.classifiers import LinearGBE as GBE
+from hyperion.hyp_defs import config_logger
+from hyperion.np.classifiers import LinearGBE as GBE
+from hyperion.np.transforms import TransformList
 def train_linear_gbe(iv_file, train_list, preproc_file, output_path, **kwargs):
diff --git a/hyperion/bin/train-linear-svmc.py b/hyperion/bin_deprec2/train-linear-svmc.py
similarity index 94%
rename from hyperion/bin/train-linear-svmc.py
rename to hyperion/bin_deprec2/train-linear-svmc.py
index 6c0e2de2..90ff8768 100755
--- a/hyperion/bin/train-linear-svmc.py
+++ b/hyperion/bin_deprec2/train-linear-svmc.py
@@ -7,18 +7,18 @@
 Trains linear SVM classifier
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time
 import numpy as np
-from hyperion.hyp_defs import config_logger
 from hyperion.helpers import VectorClassReader as VCR
-from hyperion.transforms import TransformList
-from hyperion.classifiers import LinearSVMC as SVM
+from hyperion.hyp_defs import config_logger
+from hyperion.np.classifiers import LinearSVMC as SVM
+from hyperion.np.transforms import TransformList
 def train_svm(iv_file, train_list, preproc_file, output_path, **kwargs):
diff --git a/hyperion/bin/train-logistic-regression.py b/hyperion/bin_deprec2/train-logistic-regression.py
similarity index 94%
rename from hyperion/bin/train-logistic-regression.py
rename to hyperion/bin_deprec2/train-logistic-regression.py
index 6a409119..1aa128a3 100755
--- a/hyperion/bin/train-logistic-regression.py
+++ b/hyperion/bin_deprec2/train-logistic-regression.py
@@ -7,18 +7,18 @@
 Trains linear logistic regression classifier
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time
 import numpy as np
-from hyperion.hyp_defs import config_logger
 from hyperion.helpers import VectorClassReader as VCR
-from hyperion.transforms import TransformList
-from hyperion.classifiers import LogisticRegression as LR
+from hyperion.hyp_defs import config_logger
+from hyperion.np.classifiers import LogisticRegression as LR
+from hyperion.np.transforms import TransformList
 def train_lr(iv_file, train_list, preproc_file, output_path, **kwargs):
diff --git a/hyperion/bin/train-mvn.py b/hyperion/bin_deprec2/train-mvn.py
similarity index 95%
rename from hyperion/bin/train-mvn.py
rename to hyperion/bin_deprec2/train-mvn.py
index 8ddc5e92..2d10b116 100755
--- a/hyperion/bin/train-mvn.py
+++ b/hyperion/bin_deprec2/train-mvn.py
@@ -7,18 +7,18 @@
 Trains global mean and variance normalization of i-vectors.
""" -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorReader as VR -from hyperion.pdfs.core import Normal -from hyperion.transforms import TransformList, MVN, SbSw +from hyperion.hyp_defs import config_logger +from hyperion.np.pdfs.core import Normal +from hyperion.np.transforms import MVN, SbSw, TransformList def train_mvn( diff --git a/hyperion/bin/train-nda.py b/hyperion/bin_deprec2/train-nda.py similarity index 97% rename from hyperion/bin/train-nda.py rename to hyperion/bin_deprec2/train-nda.py index dcc856ed..946a8baa 100755 --- a/hyperion/bin/train-nda.py +++ b/hyperion/bin_deprec2/train-nda.py @@ -7,17 +7,17 @@ Trains NDA """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList, NDA, NSbSw +from hyperion.hyp_defs import config_logger +from hyperion.np.transforms import NDA, NSbSw, TransformList def train_nda( diff --git a/hyperion/bin/train-pca.py b/hyperion/bin_deprec2/train-pca.py similarity index 97% rename from hyperion/bin/train-pca.py rename to hyperion/bin_deprec2/train-pca.py index b82a7772..25dcb366 100755 --- a/hyperion/bin/train-pca.py +++ b/hyperion/bin_deprec2/train-pca.py @@ -6,17 +6,17 @@ """ Trains PCA """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, PCA +from hyperion.hyp_defs import config_logger +from hyperion.np.transforms import PCA, TransformList def load_model(input_path, name, **kwargs): diff --git a/hyperion/bin/train-plda.py b/hyperion/bin_deprec2/train-plda.py similarity index 98% rename from hyperion/bin/train-plda.py rename to hyperion/bin_deprec2/train-plda.py index ba9a40c2..520f4cd7 100755 --- a/hyperion/bin/train-plda.py +++ b/hyperion/bin_deprec2/train-plda.py @@ -7,18 +7,18 @@ Trains PLDA """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger -from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.helpers import VectorClassReader as VCR +from hyperion.hyp_defs import config_logger +from hyperion.np.transforms import TransformList def train_plda( diff --git a/hyperion/classifiers/__init__.py b/hyperion/classifiers/__init__.py deleted file mode 100644 index 07da0af8..00000000 --- a/hyperion/classifiers/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -from .linear_gbe import LinearGBE -from .linear_gbe_up import LinearGBEUP -from .logistic_regression import LogisticRegression -from .binary_logistic_regression import BinaryLogisticRegression -from .greedy_fusion import GreedyFusionBinaryLR -from .linear_svmc import LinearSVMC -from .q_scoring_homo_gbe import QScoringHomoGBE diff --git a/hyperion/classifiers/linear_gbe1.py b/hyperion/classifiers/linear_gbe1.py deleted file mode 100644 index 71edd606..00000000 --- 
a/hyperion/classifiers/linear_gbe1.py +++ /dev/null @@ -1,264 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np - -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel -from ..utils.math import int2onehot, logdet_pdmat, invert_pdmat, softmax - - -class LinearGBE(HypModel): - def __init__( - self, - mu=None, - W=None, - update_mu=True, - update_W=True, - x_dim=1, - num_classes=None, - balance_class_weight=True, - do_map=False, - r_mu=16, - r_W=16, - **kwargs - ): - super(LinearGBE, self).__init__(**kwargs) - if mu is not None: - num_classes = mu.shape[0] - x_dim = mu.shape[1] - - self.mu = mu - self.W = W - self.update_mu = update_mu - self.update_W = update_W - self.x_dim = x_dim - self.num_classes = num_classes - self.balance_class_weight = balance_class_weight - self.A = None - self.b = None - self.do_map = do_map - self.r_mu = r_mu - self.r_W = r_W - - self._compute_Ab() - - def get_config(self): - config = { - "update_mu": self.update_mu, - "update_W": self.update_W, - "x_dim": self.x_dim, - "num_classes": self.num_classes, - "balance_class_weight": self.balance_class_weight, - "do_map": self.do_map, - "r_mu": self.r_mu, - "r_W": self.r_W, - } - base_config = super(LinearGBE, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def predict(self, x, normalize=False, return_full_llk=False): - logp = np.dot(x, self.A) + self.b - - if return_full_llk: - K = 0.5 * logdet_pdmat(self.W) - 0.5 * self.x_dim * np.log(2 * np.pi) - K += -0.5 * np.sum(np.dot(x, self.W) * x, axis=1, keepdims=True) - logp += K - - if normalize: - logp = np.log(softmax(logp, axis=1)) - - return logp - - def fit(self, x, class_ids=None, p_theta=None, sample_weight=None): - - assert class_ids is not None or p_theta is not None - - self.x_dim = x.shape[-1] - if self.num_classes is None: - if class_ids is not None: - self.num_classes = np.max(class_ids) + 1 - else: - self.num_classes = p_theta.shape[-1] - - if class_ids is not None: - p_theta = int2onehot(class_ids, self.num_classes) - - if sample_weight is not None: - p_theta = sample_weight[:, None] * p_theta - - N = np.sum(p_theta, axis=0) - - F = np.dot(p_theta.T, x) - - mu0 = self.mu - xbar = mu0 - if self.update_mu: - xbar = F / N[:, None] - if self.do_map: - alpha = (N / (N + self.r_mu))[:, None] - self.mu = (1 - alpha) * mu0 + alpha * xbar - else: - self.mu = xbar - - if self.update_W: - if self.do_map: - r_W = self.r_W - alpha = (N / (N + r_W))[:, None] - S0 = invert_pdmat(self.W, return_inv=True)[-1] - if self.balance_class_weight: - S = (self.num_classes - np.sum(alpha)) * S0 - else: - S = self.num_classes * self.r_W * S0 - else: - r_W = 0 - S = np.zeros((x.shape[1], x.shape[1]), dtype=float_cpu()) - - for k in range(self.num_classes): - delta = x - xbar[k] - S_k = np.dot(p_theta[:, k] * delta.T, delta) - if self.do_map: - mu_delta = xbar[k] - mu0[k] - S_k += self.r_W * alpha[k] * np.outer(mu_delta, mu_delta) - - if self.balance_class_weight: - S_k /= N[k] + r_W - - S += S_k - - if self.balance_class_weight: - S /= self.num_classes - else: - S /= self.num_classes * r_w + np.sum(N) - - self.W = invert_pdmat(S, return_inv=True)[-1] - - self._compute_Ab() - - def save_params(self, f): - params = {"mu": self.mu, "W": self.W} - self._save_params_from_dict(f, params) - - @classmethod - def load_params(cls, f, config): - param_list = ["mu", "W"] - params = cls._load_params_to_dict(f, config["name"], 
param_list) - kwargs = dict(list(config.items()) + list(params.items())) - return cls(**kwargs) - - def _compute_Ab(self): - if self.mu is not None and self.W is not None: - self.A = np.dot(self.W, self.mu.T) - self.b = -0.5 * np.sum(self.mu.T * self.A, axis=0) - - @staticmethod - def filter_args(**kwargs): - - valid_args = ( - "update_mu", - "update_W", - "balance_class_weight", - "do_map", - "r_mu", - "r_W", - "name", - ) - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - filter_train_args = filter_args - - @staticmethod - def add_class_args(parser, prefix=None): - if prefix is None: - p1 = "--" - p2 = "" - else: - p1 = "--" + prefix + "." - p2 = prefix + "." - - parser.add_argument( - p1 + "no-update-mu", - default=True, - action="store_false", - help="not update mu", - ) - parser.add_argument( - p1 + "no-update-W", - dest=(p2 + "update_W"), - default=True, - action="store_false", - help="not update W", - ) - parser.add_argument( - p1 + "balance-class-weight", - dest=(p2 + "balance_class_weight"), - default=False, - action="store_true", - help="Balances the weight of each class when computing W", - ) - parser.add_argument( - p1 + "do-map", - dest=(p2 + "do_map"), - default=False, - action="store_true", - help="does MAP adaptation", - ) - parser.add_argument( - p1 + "r-mu", - dest=(p2 + "r_mu"), - default=16, - type=float, - help="relevance factor for the means", - ) - parser.add_argument( - p1 + "r-w", - dest=(p2 + "r_W"), - default=16, - type=float, - help="relevance factor for the variances", - ) - - parser.add_argument( - p1 + "name", dest=(p2 + "name"), default="lgbe", help="model name" - ) - - @staticmethod - def filter_eval_args(**kwargs): - valid_args = ("model_file", "normalize", "return_full_llk") - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - @staticmethod - def add_argparse_eval_args(parser, prefix=None): - if prefix is None: - p1 = "--" - p2 = "" - else: - p1 = "--" + prefix + "." - p2 = prefix + "." - - parser.add_argument( - p1 + "model-file", - dest=(p2 + "model_file"), - required=True, - help=("model file"), - ) - parser.add_argument( - p1 + "normalize", - dest=(p2 + "normalize"), - default=False, - action="store_true", - help=("normalizes the ouput probabilities to sum to one"), - ) - parser.add_argument( - p1 + "return-full-llk", - dest=(p2 + "return_full_llk"), - default=False, - action="store_true", - help=("evaluates full gaussian likelihood instead of linear function"), - ) - - add_argparse_args = add_class_args - add_argparse_train_args = add_class_args - add_argparse_eval_args = add_eval_args diff --git a/hyperion/clustering/__init__.py b/hyperion/clustering/__init__.py deleted file mode 100644 index f22aa6f3..00000000 --- a/hyperion/clustering/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -from .kmeans import KMeans -from .ahc import AHC diff --git a/hyperion/clustering/kmeans.py b/hyperion/clustering/kmeans.py deleted file mode 100644 index 540e70b6..00000000 --- a/hyperion/clustering/kmeans.py +++ /dev/null @@ -1,107 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import sys -import logging -import numpy as np -import h5py - -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel - - -class KMeans(HypModel): - """K-Means clustering class. 
- - Attributes: - num_clusters: number of clusters. - mu: cluster centers. - rtol: minimum delta in loss function used as stopping criterion. - """ - - def __init__(self, num_clusters, mu=None, rtol=0.001, **kwargs): - super(KMeans, self).__init__(**kwargs) - self.num_clusters = num_clusters - self.mu = mu - self.rtol = rtol - - def fit(self, x, epochs=100): - """Performs the clustering. - - Args: - x: input data (num_samples, feat_dim). - epochs: max. number of epochs. - - Returns: - loss: value of loss function (num_epochs,). - cluster_index: clustering labels as int numpy array with shape=(num_samples,) - """ - loss = np.zeros((epochs,), dtype=float_cpu()) - self.mu = self._choose_seeds(x) - cluster_index, err2 = self.predict(x) - for epoch in range(epochs): - self.mu = self._compute_centroids(x, cluster_index) - cluster_index, err2 = self.predict(x) - loss[epoch] = np.mean(err2) - if epoch > 0: - delta = np.abs(loss[epoch - 1] - loss[epoch]) / loss[epoch - 1] - if delta < self.rtol: - loss = loss[: epoch + 1] - break - - return loss, cluster_index - - def _choose_seeds(self, x): - """Chooses the initial seeds for the clustering. - - Args: - x: input data (num_samples, feat_dim). - - Returns: - Initial centers (num_clusters, feat_dim) - """ - mu = np.zeros((self.num_clusters, x.shape[-1]), dtype=float_cpu()) - mu[0] = x[0] - for i in range(1, self.num_clusters): - d = np.zeros((x.shape[0],), dtype=float_cpu()) - for j in range(i): - d += np.sum(np.square(x - mu[j]), axis=-1) - index = np.argmax(d) - mu[i] = x[index] - return mu - - def _compute_centroids(self, x, index): - """Compute the centroids given cluster assigments. - - Args: - x: input data (num_samples, feat_dim) - index: cluster assignments as integers with shape=(num_samples,) - - Returns: - Cluster centroids (num_clusters, feat_dim) - """ - mu = np.zeros((self.num_clusters, x.shape[-1]), dtype=float_cpu()) - for k in range(self.num_clusters): - r = index == k - if np.sum(r) > 0: - mu[k] = np.mean(x[index == k], axis=0) - return mu - - def predict(self, x): - """Compute the cluster labels for new data. - - Args: - x: input data (num_samples, feat_dim) - - Returns: - Cluster assignments as integer array (num_samples,) - Square distance of each element to the center of its cluster. 
- """ - err2 = np.zeros((x.shape[0], self.num_clusters), dtype=float_cpu()) - for k in range(self.num_clusters): - err2[:, k] = np.sum(np.square(x - self.mu[k]), axis=-1) - - index = np.argmin(err2, axis=-1) - return index, err2[np.arange(x.shape[0]), index] diff --git a/hyperion/data_prep/__init__.py b/hyperion/data_prep/__init__.py new file mode 100644 index 00000000..9d885718 --- /dev/null +++ b/hyperion/data_prep/__init__.py @@ -0,0 +1,11 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from .data_prep import DataPrep +from .musan import MusanDataPrep +from .rirs import RIRSDataPrep +from .voxceleb2 import VoxCeleb2DataPrep +from .voxceleb1 import VoxCeleb1DataPrep +from .voxsrc22 import VoxSRC22DataPrep diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py new file mode 100644 index 00000000..0f654676 --- /dev/null +++ b/hyperion/data_prep/data_prep.py @@ -0,0 +1,116 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import PathLike + + +class DataPrep: + """Base class for data preparation classes. + + Attributes: + corpus_dir: input data directory + output_dir: output data directory + use_kaldi_ids: puts speaker-id in front of segment id like kaldi + target_sample_freq: target sampling frequency to convert the audios to. + num_threads: number of parallel threads + """ + + registry = {} + + def __init__( + self, + corpus_dir: PathLike, + output_dir: PathLike, + use_kaldi_ids: bool, + target_sample_freq: int, + num_threads: int = 10, + ): + self.corpus_dir = Path(corpus_dir) + self.output_dir = Path(output_dir) + self.use_kaldi_ids = use_kaldi_ids + self.target_sample_freq = target_sample_freq + self.num_threads = num_threads + + self.output_dir.mkdir(exist_ok=True, parents=True) + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + cls.registry[cls.dataset_name()] = cls + + @staticmethod + def dataset_name(): + raise NotImplementedError() + + @staticmethod + def _get_recording_duration(recordings, i, n): + from ..io import SequentialAudioReader as AR + + durations = [] + fss = [] + with AR(recordings, part_idx=i + 1, num_parts=n) as reader: + for data in reader: + key, x, fs = data + duration = x.shape[0] / fs + fss.append(fs) + durations.append(duration) + + return fss, durations + + def get_recording_duration(self, recording_set): + + import itertools + + # from ..utils import SCPList #don't remember why I put this here + + futures = [] + logging.info("submitting threats...") + with ThreadPoolExecutor(max_workers=self.num_threads) as pool: + for i in tqdm(range(self.num_threads)): + future = pool.submit( + DataPrep._get_recording_duration, recording_set, i, self.num_threads + ) + futures.append(future) + + logging.info("waiting threats...") + res = [f.result() for f in tqdm(futures)] + fss = list(itertools.chain(*[r[0] for r in res])) + durations = list(itertools.chain(*[r[1] for r in res])) + + recording_set["duration"] = durations + recording_set["sample_freq"] = fss + + @staticmethod + def add_class_args(parser): + parser.add_argument( + "--corpus-dir", required=True, help="""input data directory""", + ) + parser.add_argument( + "--output-dir", required=True, help="""output data 
directory""", + ) + parser.add_argument( + "--use-kaldi-ids", + default=False, + action=ActionYesNo, + help="""put speaker-id in front of segment id like kaldi""", + ) + parser.add_argument( + "--target-sample-freq", + default=None, + type=int, + help="""target sampling frequency to convert the audios to""", + ) + + parser.add_argument( + "--num-threads", + default=10, + type=int, + help="""number of parallel threads""", + ) diff --git a/hyperion/data_prep/musan.py b/hyperion/data_prep/musan.py new file mode 100644 index 00000000..b14785b8 --- /dev/null +++ b/hyperion/data_prep/musan.py @@ -0,0 +1,108 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import glob +import logging +import re +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import HypDataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress +from .data_prep import DataPrep + + +class MusanDataPrep(DataPrep): + """Class for preparing Musan database into tables + + Attributes: + corpus_dir: input data directory + subset: subset of the data noise, music, speech + output_dir: output data directory + target_sample_freq: target sampling frequency to convert the audios to. + """ + + def __init__( + self, + corpus_dir: PathLike, + subset: str, + output_dir: PathLike, + target_sample_freq: int, + num_threads: int = 10, + **kwargs, + ): + super().__init__(corpus_dir, output_dir, False, target_sample_freq, num_threads) + self.subset = subset + + @staticmethod + def dataset_name(): + return "musan" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + parser.add_argument( + "--subset", + choices=["noise", "music", "speech"], + help="""musan subset in [noise, music, speech]""", + required=True, + ) + + def prepare(self): + logging.info( + "Peparing Musan %s corpus_dir:%s -> data_dir:%s", + self.subset, + self.corpus_dir, + self.output_dir, + ) + rec_dir = self.corpus_dir / self.subset + logging.info("searching audio files in %s", str(rec_dir)) + rec_files = list(rec_dir.glob("**/*.wav")) + if not rec_files: + # symlinks? 
diff --git a/hyperion/data_prep/rirs.py b/hyperion/data_prep/rirs.py
new file mode 100644
index 00000000..accf7bad
--- /dev/null
+++ b/hyperion/data_prep/rirs.py
@@ -0,0 +1,111 @@
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import glob
+import logging
+import re
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from jsonargparse import ActionYesNo
+from tqdm import tqdm
+
+from ..utils import HypDataset, RecordingSet, SegmentSet
+from ..utils.misc import PathLike, urlretrieve_progress
+from .data_prep import DataPrep
+
+
+class RIRSDataPrep(DataPrep):
+    """Class for preparing the RIRS database into tables
+
+    Attributes:
+      corpus_dir: input data directory
+      output_dir: output data directory
+      target_sample_freq: target sampling frequency to convert the audios to.
+    """
+
+    def __init__(
+        self,
+        corpus_dir: PathLike,
+        output_dir: PathLike,
+        target_sample_freq: int,
+        num_threads: int = 10,
+        **kwargs,
+    ):
+        super().__init__(corpus_dir, output_dir, False, target_sample_freq, num_threads)
+
+    @staticmethod
+    def dataset_name():
+        return "rirs"
+
+    @staticmethod
+    def add_class_args(parser):
+        DataPrep.add_class_args(parser)
+
+    def prepare(self):
+        logging.info(
+            "Preparing RIRS corpus_dir:%s -> data_dir:%s",
+            self.corpus_dir,
+            self.output_dir,
+        )
+        rec_dir = self.corpus_dir
+        rirs_file = self.corpus_dir / "rir_list"
+        if rirs_file.exists():
+            rirs_table = pd.read_csv(
+                rirs_file,
+                sep=" ",
+                header=None,
+                names=["dummy1", "rir_id", "dummy2", "room_id", "rec_files"],
+            )
+            rec_files = [Path(f) for f in rirs_table["rec_files"].values]
+            room_ids = rirs_table["room_id"].values
+        else:
+            logging.info("searching audio files in %s", str(rec_dir))
+            rec_files = list(rec_dir.glob("**/*.wav"))
+            room_ids = None
+            if not rec_files:
+                # symlinks? try glob
+                rec_files = [
+                    Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True)
+                ]
+
+        assert len(rec_files) > 0, "recording files not found"
+
+        rec_ids = [f.with_suffix("").name for f in rec_files]
+        storage_paths = [str(f) for f in rec_files]
+        logging.info("making RecordingSet")
+        recs = pd.DataFrame({"id": rec_ids, "storage_path": storage_paths})
+        recs = RecordingSet(recs)
+        recs.sort()
+
+        logging.info("getting recording durations")
+        self.get_recording_duration(recs)
+        if self.target_sample_freq:
+            recs["target_sample_freq"] = self.target_sample_freq
+
+        logging.info("making SegmentsSet")
+        segments = pd.DataFrame(
+            {
+                "id": rec_ids,
+                "duration": recs.loc[rec_ids, "duration"].values,
+            }
+        )
+        if room_ids is not None:
+            segments["room_id"] = room_ids
+        segments = SegmentSet(segments)
+        segments.sort()
+        logging.info("making dataset")
+        dataset = HypDataset(
+            segments,
+            recordings=recs,
+        )
+        logging.info("saving dataset at %s", self.output_dir)
+        dataset.save(self.output_dir)
+        logging.info(
+            "dataset contains %d segments",
+            len(segments),
+        )
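The `rir_list` parsing above assumes a Kaldi-style list where each row interleaves flag tokens with values, so the `dummy1`/`dummy2` columns absorb the flag strings. A sketch of what a matching row could look like; the literal `--rir-id`/`--room-id` tokens are an assumption about that list format, only the column mapping comes from the code above:

```python
# Illustrative rir_list row and how the read_csv column names map onto it.
import io

import pandas as pd

line = "--rir-id 00001 --room-id 001 simulated_rirs/smallroom/Room001/Room001-00001.wav\n"
df = pd.read_csv(
    io.StringIO(line),
    sep=" ",
    header=None,
    names=["dummy1", "rir_id", "dummy2", "room_id", "rec_files"],
)
# dummy1/dummy2 hold the flag tokens; the useful columns are the others.
print(df.loc[0, "room_id"], df.loc[0, "rec_files"])
```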
diff --git a/hyperion/data_prep/voxceleb1.py b/hyperion/data_prep/voxceleb1.py
new file mode 100644
index 00000000..56cf0c59
--- /dev/null
+++ b/hyperion/data_prep/voxceleb1.py
@@ -0,0 +1,344 @@
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import glob
+import logging
+import re
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from jsonargparse import ActionYesNo
+from tqdm import tqdm
+
+from ..utils import ClassInfo, HypDataset, RecordingSet, SegmentSet
+from ..utils.misc import PathLike, urlretrieve_progress
+from .data_prep import DataPrep
+
+
+class VoxCeleb1DataPrep(DataPrep):
+    """Class for preparing the VoxCeleb1 database into tables.
+    It prepares the full VoxCeleb1 either for training, or for testing
+    with the Original/Entire/Hard trial lists.
+    We don't consider preparing only dev for training and only test for
+    the Original trials.
+
+    Attributes:
+      corpus_dir: input data directory
+      task: train/test
+      cat_videos: concatenate utterances from the same video.
+      output_dir: output data directory
+      use_kaldi_ids: puts speaker-id in front of segment id like kaldi
+      target_sample_freq: target sampling frequency to convert the audios to.
+ """ + + def __init__( + self, + corpus_dir: PathLike, + task: str, + cat_videos: bool, + output_dir: PathLike, + use_kaldi_ids: bool, + target_sample_freq: int, + num_threads: int = 10, + ): + use_kaldi_ids = True + super().__init__( + corpus_dir, output_dir, use_kaldi_ids, target_sample_freq, num_threads + ) + + self.task = task + assert ( + cat_videos == False or task == "train" + ), "cat-videos is only available for train task" + + self.cat_videos = cat_videos + + @staticmethod + def dataset_name(): + return "voxceleb1" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + parser.add_argument( + "--task", + default="test", + choices=["test", "train"], + help="""if we prepare the data for [test, train]""", + ) + parser.add_argument( + "--cat-videos", + default=False, + action=ActionYesNo, + help="""concatenate utterances from the same video.""", + ) + + def _get_metadata(self): + file_name = "vox1_meta.csv" + file_path = self.corpus_dir / file_name + if not file_path.exists(): + file_path = self.output_dir / file_name + if not file_path.exists(): + url = "https://www.openslr.org/resources/49/vox1_meta.csv" + file_path, _ = urlretrieve_progress(url, file_path, desc=file_name) + + df_meta = pd.read_csv(file_path, sep="\t") + df_meta.rename(columns=str.strip, inplace=True) + df_meta = df_meta.applymap(lambda x: str.strip(x) if isinstance(x, str) else x) + df_meta.set_index("VoxCeleb1 ID", inplace=True) + return df_meta + + def _get_langs_est(self): + file_name = "lang_vox2_final.csv" + file_path = self.corpus_dir / file_name + if not file_path.exists(): + file_path = self.output_dir / file_name + if not file_path.exists(): + url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv" + file_path, _ = urlretrieve_progress(url, file_path, desc=file_name) + + df_lang = pd.read_csv(file_path, sep=",") + + if self.cat_videos: + + def get_video(x): + x = re.sub("/[^/]*.wav$", "", x) + return re.sub("/", "-", x) + + elif self.use_kaldi_ids: + + def get_video(x): + x = re.sub(".wav$", "", x) + return re.sub("/", "-", x) + + else: + + def get_video(x): + x = re.sub(".wav$", "", x) + x = re.sub("^[^/]*/", "", x) + return re.sub("/", "-", x) + + df_lang["id"] = df_lang["filename"].apply(get_video) + df_lang.drop(["filename"], axis=1, inplace=True) + df_lang.drop_duplicates(inplace=True) + df_lang.set_index("id", inplace=True) + df_lang["lang"] = df_lang["lang"].apply(str.lower) + return df_lang + + @staticmethod + def make_cat_list(lists_cat_dir, rec_id, rec_files, video_idx, i): + list_file = lists_cat_dir / f"{rec_id}.txt" + with open(list_file, "w") as fw: + rec_idx = (video_idx == i).nonzero()[0] + recs_i = [f"file {rec_files[j]}" for j in rec_idx] + recs_i.sort() + recs_i = "\n".join(recs_i) + fw.write(f"{recs_i}\n") + + file_path = ( + f"ffmpeg -v 8 -f concat -safe 0 -i {list_file} -f wav -acodec pcm_s16le -|" + ) + return file_path + + def make_trials(self): + url_base = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta" + trials_file_names = [ + "veri_test2.txt", + "list_test_hard2.txt", + "list_test_all2.txt", + ] + trials_names = ["trials_o", "trials_h", "trials_e"] + + trials = {} + dfs = [] + logging.info("making trials") + for trial_name, file_name in zip(trials_names, trials_file_names): + file_path = self.corpus_dir / file_name + if not file_path.exists(): + file_path = self.output_dir / file_name + if not file_path.exists(): + url = f"{url_base}/{file_name}" + file_path, _ = urlretrieve_progress(url, file_path, 
+    @staticmethod
+    def make_cat_list(lists_cat_dir, rec_id, rec_files, video_idx, i):
+        list_file = lists_cat_dir / f"{rec_id}.txt"
+        with open(list_file, "w") as fw:
+            rec_idx = (video_idx == i).nonzero()[0]
+            recs_i = [f"file {rec_files[j]}" for j in rec_idx]
+            recs_i.sort()
+            recs_i = "\n".join(recs_i)
+            fw.write(f"{recs_i}\n")
+
+        file_path = (
+            f"ffmpeg -v 8 -f concat -safe 0 -i {list_file} -f wav -acodec pcm_s16le -|"
+        )
+        return file_path
+
+    def make_trials(self):
+        url_base = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta"
+        trials_file_names = [
+            "veri_test2.txt",
+            "list_test_hard2.txt",
+            "list_test_all2.txt",
+        ]
+        trials_names = ["trials_o", "trials_h", "trials_e"]
+
+        trials = {}
+        dfs = []
+        logging.info("making trials")
+        for trial_name, file_name in zip(trials_names, trials_file_names):
+            file_path = self.corpus_dir / file_name
+            if not file_path.exists():
+                file_path = self.output_dir / file_name
+                if not file_path.exists():
+                    url = f"{url_base}/{file_name}"
+                    file_path, _ = urlretrieve_progress(url, file_path, desc=file_name)
+
+            df_in = pd.read_csv(
+                file_path,
+                header=None,
+                sep=" ",
+                names=["key", "enroll_file", "test_file"],
+            )
+            key = ["target" if k == 1 else "nontarget" for k in df_in["key"]]
+
+            def get_modelid(s):
+                s = re.sub(r"\.wav", "", s)
+                return re.sub(r"/", "-", s)
+
+            if self.use_kaldi_ids:
+                get_segmentid = get_modelid
+            else:
+
+                def get_segmentid(s):
+                    s = get_modelid(s)
+                    return re.sub(r"[^-]*-", "", s)
+
+            modelid = [get_modelid(f) for f in df_in["enroll_file"]]
+            segmentid = [get_segmentid(f) for f in df_in["test_file"]]
+            df_out = pd.DataFrame(
+                {"modelid": modelid, "segmentid": segmentid, "targettype": key}
+            )
+            df_out.sort_values(by=["modelid", "segmentid"], inplace=True)
+            file_path = self.output_dir / f"{trial_name}.csv"
+            df_out.to_csv(file_path, index=False)
+            dfs.append(df_out)
+            trials[trial_name] = file_path
+
+        df_out = pd.concat(dfs, ignore_index=True)
+        df_out.sort_values(by=["modelid", "segmentid"], inplace=True)
+        file_path = self.output_dir / "trials.csv"
+        df_out.to_csv(file_path, index=False)
+        trials["trials"] = file_path
+
+        logging.info("making enrollment map")
+        modelid = df_out["modelid"].sort_values().unique()
+        if self.use_kaldi_ids:
+            segmentid = modelid
+        else:
+            segmentid = [re.sub(r"[^-]*-", "", s) for s in modelid]
+
+        df_out = pd.DataFrame({"modelid": modelid, "segmentid": segmentid})
+        file_path = self.output_dir / "enrollment.csv"
+        df_out.to_csv(file_path, index=False)
+        enrollments = {"enrollment": file_path}
+
+        return enrollments, trials
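`make_trials` normalizes the three official VoxCeleb1 lists (`trials_o`/`trials_h`/`trials_e`) into one CSV schema, `modelid,segmentid,targettype`. The rows below are illustrative made-up ids; only the schema comes from the code above:

```python
# Illustrative contents of trials_o.csv; ids are invented for the example.
import io

import pandas as pd

csv_text = """modelid,segmentid,targettype
id10270-x6uYqmx31kE-00001,id10270-8jEAjG6SegY-00008,target
id10270-x6uYqmx31kE-00001,id10300-ize_eiCFEg0-00003,nontarget
"""

df = pd.read_csv(io.StringIO(csv_text))
num_target = (df["targettype"] == "target").sum()
print(f"{len(df)} trials, {num_target} targets")
```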
+    def prepare(self):
+        logging.info(
+            "Preparing VoxCeleb1 for %s corpus_dir:%s -> data_dir:%s",
+            self.task,
+            self.corpus_dir,
+            self.output_dir,
+        )
+        logging.info("getting audio meta-data")
+        df_meta = self._get_metadata()
+        logging.info("getting language estimations")
+        df_lang = self._get_langs_est()
+        rec_dir = self.corpus_dir
+        logging.info("searching audio files in %s", str(rec_dir))
+        rec_files = list(rec_dir.glob("**/*.wav"))
+        if not rec_files:
+            # symlinks? try glob
+            rec_files = [
+                Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True)
+            ]
+
+        assert len(rec_files) > 0, "recording files not found"
+
+        speakers = [f.parents[1].name for f in rec_files]
+        video_ids = [f.parent.name for f in rec_files]
+        if self.cat_videos:
+            rec_ids = [f"{s}-{v}" for s, v in zip(speakers, video_ids)]
+            lists_cat_dir = self.output_dir / "lists_cat"
+            lists_cat_dir.mkdir(exist_ok=True, parents=True)
+            rec_ids, uniq_rec_idx, rec_idx = np.unique(
+                rec_ids, return_index=True, return_inverse=True
+            )
+            speakers = [speakers[i] for i in uniq_rec_idx]
+            video_ids = [video_ids[i] for i in uniq_rec_idx]
+
+            file_paths = []
+            futures = []
+            logging.info("making video cat lists")
+            logging.info("submitting threads...")
+            with ThreadPoolExecutor(max_workers=self.num_threads) as pool:
+                for i, rec_id in tqdm(enumerate(rec_ids)):
+                    future = pool.submit(
+                        VoxCeleb1DataPrep.make_cat_list,
+                        lists_cat_dir,
+                        rec_id,
+                        rec_files,
+                        rec_idx,
+                        i,
+                    )
+                    futures.append(future)
+
+            logging.info("waiting for threads...")
+            file_paths = [f.result() for f in tqdm(futures)]
+        else:
+            file_names = [f.with_suffix("").name for f in rec_files]
+            if self.use_kaldi_ids:
+                rec_ids = [
+                    f"{s}-{v}-{f}" for s, v, f in zip(speakers, video_ids, file_names)
+                ]
+            else:
+                rec_ids = [f"{v}-{f}" for v, f in zip(video_ids, file_names)]
+
+            file_paths = [str(r) for r in rec_files]
+
+        logging.info("making RecordingSet")
+        recs = pd.DataFrame({"id": rec_ids, "storage_path": file_paths})
+        recs = RecordingSet(recs)
+        recs.sort()
+
+        logging.info("getting recording durations")
+        self.get_recording_duration(recs)
+        if self.target_sample_freq:
+            recs["target_sample_freq"] = self.target_sample_freq
+
+        logging.info("making SegmentsSet")
+        segments = pd.DataFrame(
+            {
+                "id": rec_ids,
+                "video_ids": video_ids,
+                "speaker": speakers,
+                "gender": df_meta.loc[speakers, "Gender"],
+                "nationality": df_meta.loc[speakers, "Nationality"],
+                "language_est": [
+                    df_lang.loc[r, "lang"] if r in df_lang.index else "N/A"
+                    for r in rec_ids
+                ],
+                "language_est_conf": [
+                    df_lang.loc[r, "confidence"] if r in df_lang.index else "N/A"
+                    for r in rec_ids
+                ],
+                "duration": recs.loc[rec_ids, "duration"].values,
+            }
+        )
+        segments = SegmentSet(segments)
+        segments.sort()
+
+        logging.info("making speaker info file")
+        uniq_speakers = np.unique(speakers)
+        speakers = pd.DataFrame(
+            {
+                "id": uniq_speakers,
+                "vgg_id": df_meta.loc[uniq_speakers, "VGGFace1 ID"],
+                "gender": df_meta.loc[uniq_speakers, "Gender"],
+                "nationality": df_meta.loc[uniq_speakers, "Nationality"],
+            }
+        )
+        speakers = ClassInfo(speakers)
+
+        logging.info("making language info file")
+        languages = np.unique(df_lang["lang"])
+        languages = ClassInfo(pd.DataFrame({"id": languages}))
+
+        if self.task == "test":
+            enrollments, trials = self.make_trials()
+        else:
+            enrollments, trials = None, None
+
+        logging.info("making dataset")
+        dataset = HypDataset(
+            segments,
+            classes={"speaker": speakers, "language_est": languages},
+            recordings=recs,
+            enrollments=enrollments,
+            trials=trials,
+            sparse_trials=False,
+        )
+        logging.info("saving dataset at %s", self.output_dir)
+        dataset.save(self.output_dir)
+        logging.info(
+            "dataset contains %d segments, %d speakers", len(segments), len(speakers)
+        )
diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py
new file mode 100644
index 00000000..550af3a8
--- /dev/null
+++ b/hyperion/data_prep/voxceleb2.py
@@ -0,0 +1,263 @@
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0
(http://www.apache.org/licenses/LICENSE-2.0) +""" + +import glob +import logging +import re +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import ClassInfo, HypDataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress +from .data_prep import DataPrep + + +class VoxCeleb2DataPrep(DataPrep): + """Class for preparing VoxCeleb2 database into tables + + Attributes: + corpus_dir: input data directory + subset: subset of the data dev or test + cat_videos: concatenate utterances from the same video. + output_dir: output data directory + use_kaldi_ids: puts speaker-id in front of segment id like kaldi + target_sample_freq: target sampling frequency to convert the audios to. + """ + + def __init__( + self, + corpus_dir: PathLike, + subset: str, + cat_videos: bool, + output_dir: PathLike, + use_kaldi_ids: bool, + target_sample_freq: int, + num_threads: int = 10, + ): + use_kaldi_ids = True + super().__init__( + corpus_dir, output_dir, use_kaldi_ids, target_sample_freq, num_threads + ) + + self.subset = subset + self.cat_videos = cat_videos + + @staticmethod + def dataset_name(): + return "voxceleb2" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + parser.add_argument( + "--subset", + default="dev", + choices=["dev", "test"], + help="""vox2 subset in [dev, test]""", + ) + parser.add_argument( + "--cat-videos", + default=False, + action=ActionYesNo, + help="""concatenate utterances from the same video.""", + ) + + def _get_metadata(self): + file_name = "vox2_meta.csv" + file_path = self.corpus_dir / file_name + if not file_path.exists(): + file_path = self.output_dir / file_name + if not file_path.exists(): + url = "https://www.openslr.org/resources/49/vox2_meta.csv" + file_path, _ = urlretrieve_progress(url, file_path, desc=file_name) + + df_meta = pd.read_csv(file_path, sep="\t") + df_meta.rename(columns=str.strip, inplace=True) + df_meta = df_meta.applymap(lambda x: str.strip(x) if isinstance(x, str) else x) + df_meta.set_index("VoxCeleb2 ID", inplace=True) + return df_meta + + def _get_langs_est(self): + file_name = "lang_vox2_final.csv" + file_path = self.corpus_dir / file_name + if not file_path.exists(): + file_path = self.output_dir / file_name + if not file_path.exists(): + url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox2_final.csv" + file_path, _ = urlretrieve_progress(url, file_path, desc=file_name) + + df_lang = pd.read_csv(file_path, sep=",") + + if self.cat_videos: + + def get_video(x): + x = re.sub("/[^/]*.wav$", "", x) + return re.sub("/", "-", x) + + elif self.use_kaldi_ids: + + def get_video(x): + x = re.sub(".wav$", "", x) + return re.sub("/", "-", x) + + else: + + def get_video(x): + x = re.sub(".wav$", "", x) + x = re.sub("^[^/]*/", "", x) + return re.sub("/", "-", x) + + df_lang["id"] = df_lang["filename"].apply(get_video) + df_lang.drop(["filename"], axis=1, inplace=True) + df_lang.drop_duplicates(inplace=True) + df_lang.set_index("id", inplace=True) + df_lang["lang"] = df_lang["lang"].apply(str.lower) + return df_lang + + @staticmethod + def make_cat_list(lists_cat_dir, rec_id, rec_files, video_idx, i): + list_file = lists_cat_dir / f"{rec_id}.txt" + with open(list_file, "w") as fw: + rec_idx = (video_idx == i).nonzero()[0] + recs_i = [f"file {rec_files[j]}" for j in rec_idx] + recs_i.sort() + recs_i = "\n".join(recs_i) + 
fw.write(f"{recs_i}\n")
+
+        file_path = (
+            f"ffmpeg -v 8 -f concat -safe 0 -i {list_file} -f wav -acodec pcm_s16le -|"
+        )
+        return file_path
+
+    def prepare(self):
+        logging.info(
+            "Preparing VoxCeleb2 %s corpus_dir:%s -> data_dir:%s",
+            self.subset,
+            self.corpus_dir,
+            self.output_dir,
+        )
+        logging.info("getting audio meta-data")
+        df_meta = self._get_metadata()
+        logging.info("getting language estimations")
+        df_lang = self._get_langs_est()
+        rec_dir = self.corpus_dir / self.subset
+        logging.info("searching audio files in %s", str(rec_dir))
+        rec_files = list(rec_dir.glob("**/*.m4a"))
+        if not rec_files:
+            # symlinks? try glob
+            rec_files = [
+                Path(f) for f in glob.iglob(f"{rec_dir}/**/*.m4a", recursive=True)
+            ]
+
+        assert len(rec_files) > 0, "recording files not found"
+
+        speakers = [f.parents[1].name for f in rec_files]
+        video_ids = [f.parent.name for f in rec_files]
+        if self.cat_videos:
+            rec_ids = [f"{s}-{v}" for s, v in zip(speakers, video_ids)]
+            lists_cat_dir = self.output_dir / "lists_cat"
+            lists_cat_dir.mkdir(exist_ok=True, parents=True)
+            rec_ids, uniq_rec_idx, rec_idx = np.unique(
+                rec_ids, return_index=True, return_inverse=True
+            )
+            speakers = [speakers[i] for i in uniq_rec_idx]
+            video_ids = [video_ids[i] for i in uniq_rec_idx]
+
+            file_paths = []
+            futures = []
+            logging.info("making video cat lists")
+            logging.info("submitting threads...")
+            with ThreadPoolExecutor(max_workers=self.num_threads) as pool:
+                for i, rec_id in tqdm(enumerate(rec_ids)):
+                    future = pool.submit(
+                        VoxCeleb2DataPrep.make_cat_list,
+                        lists_cat_dir,
+                        rec_id,
+                        rec_files,
+                        rec_idx,
+                        i,
+                    )
+                    futures.append(future)
+
+            logging.info("waiting for threads...")
+            file_paths = [f.result() for f in tqdm(futures)]
+        else:
+            file_names = [f.with_suffix("").name for f in rec_files]
+            if self.use_kaldi_ids:
+                rec_ids = [
+                    f"{s}-{v}-{f}" for s, v, f in zip(speakers, video_ids, file_names)
+                ]
+            else:
+                rec_ids = [f"{v}-{f}" for v, f in zip(video_ids, file_names)]
+
+            file_paths = []
+            logging.info("making pipe commands")
+            for rec_file in tqdm(rec_files):
+                file_path = f"ffmpeg -v 8 -i {rec_file} -f wav -acodec pcm_s16le - |"
+                file_paths.append(file_path)
+
+        logging.info("making RecordingSet")
+        recs = pd.DataFrame({"id": rec_ids, "storage_path": file_paths})
+        recs = RecordingSet(recs)
+        recs.sort()
+
+        logging.info("getting recording durations")
+        self.get_recording_duration(recs)
+        if self.target_sample_freq:
+            recs["target_sample_freq"] = self.target_sample_freq
+
+        logging.info("making SegmentsSet")
+        segments = pd.DataFrame(
+            {
+                "id": rec_ids,
+                "video_ids": video_ids,
+                "speaker": speakers,
+                "gender": df_meta.loc[speakers, "Gender"],
+                "language_est": [
+                    df_lang.loc[r, "lang"] if r in df_lang.index else "N/A"
+                    for r in rec_ids
+                ],
+                "language_est_conf": [
+                    df_lang.loc[r, "confidence"] if r in df_lang.index else "N/A"
+                    for r in rec_ids
+                ],
+                "duration": recs.loc[rec_ids, "duration"].values,
+            }
+        )
+        segments = SegmentSet(segments)
+        segments.sort()
+
+        logging.info("making speaker info file")
+        uniq_speakers = np.unique(speakers)
+        speakers = pd.DataFrame(
+            {
+                "id": uniq_speakers,
+                "vgg_id": df_meta.loc[uniq_speakers, "VGGFace2 ID"],
+                "gender": df_meta.loc[uniq_speakers, "Gender"],
+            }
+        )
+        speakers = ClassInfo(speakers)
+
+        logging.info("making language info file")
+        languages = np.unique(df_lang["lang"])
+        languages = ClassInfo(pd.DataFrame({"id": languages}))
+
+        logging.info("making dataset")
+        dataset = HypDataset(
+            segments,
+            {"speaker": speakers, "language_est": languages},
+            recs,
+        )
+        logging.info("saving dataset at %s", self.output_dir)
+        dataset.save(self.output_dir)
+        logging.info(
+            "dataset contains %d segments, %d speakers", len(segments), len(speakers)
+        )
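Since VoxCeleb2 ships as m4a, the `storage_path` entries above are Kaldi-style pipe commands rather than plain file paths; the trailing `|` marks them as such. A sketch of how such a pipe can be consumed, assuming the reader runs the command and parses wav data from its stdout (the use of `soundfile` here is illustrative, not necessarily what hyperion's audio readers do internally):

```python
# Sketch: decoding a Kaldi-style "command |" storage_path.
import io
import subprocess

import soundfile as sf

storage_path = "ffmpeg -v 8 -i utt.m4a -f wav -acodec pcm_s16le - |"  # placeholder
cmd = storage_path.rstrip(" |")  # drop the trailing pipe marker, keep "-" (stdout)
wav_bytes = subprocess.run(cmd, shell=True, capture_output=True, check=True).stdout
x, fs = sf.read(io.BytesIO(wav_bytes))
print(x.shape, fs)
```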
diff --git a/hyperion/data_prep/voxsrc22.py b/hyperion/data_prep/voxsrc22.py
new file mode 100644
index 00000000..60192029
--- /dev/null
+++ b/hyperion/data_prep/voxsrc22.py
@@ -0,0 +1,179 @@
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import glob
+import logging
+import re
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from jsonargparse import ActionYesNo
+from tqdm import tqdm
+
+from ..utils import ClassInfo, HypDataset, RecordingSet, SegmentSet
+from ..utils.misc import PathLike, urlretrieve_progress
+from .data_prep import DataPrep
+
+
+class VoxSRC22DataPrep(DataPrep):
+    """Class to prepare VoxSRC22 dev/test data
+    Attributes:
+      corpus_dir: input data directory
+      vox1_corpus_dir: input data directory for VoxCeleb1
+      subset: subset of the data dev or test
+      output_dir: output data directory
+      target_sample_freq: target sampling frequency to convert the audios to.
+    """
+
+    def __init__(
+        self,
+        corpus_dir: PathLike,
+        vox1_corpus_dir: PathLike,
+        subset: str,
+        output_dir: PathLike,
+        use_kaldi_ids: bool,
+        target_sample_freq: int,
+        num_threads: int = 10,
+    ):
+        use_kaldi_ids = False
+        super().__init__(
+            corpus_dir, output_dir, use_kaldi_ids, target_sample_freq, num_threads
+        )
+
+        assert (
+            vox1_corpus_dir is not None or subset == "test"
+        ), "dev set needs the VoxCeleb1 corpus dir"
+        self.subset = subset
+        self.vox1_corpus_dir = (
+            None if vox1_corpus_dir is None else Path(vox1_corpus_dir)
+        )
+
+    @staticmethod
+    def dataset_name():
+        return "voxsrc22"
+
+    @staticmethod
+    def add_class_args(parser):
+        DataPrep.add_class_args(parser)
+        parser.add_argument(
+            "--subset",
+            default="dev",
+            choices=["dev", "test"],
+            help="""voxsrc22 subset in [dev, test]""",
+        )
+        parser.add_argument(
+            "--vox1-corpus-dir",
+            default=None,
+            help="""corpus directory of voxceleb 1.""",
+        )
+
+    def prepare_track12_dev(self):
+        logging.info(
+            "Preparing VoxSRC22 %s corpus:%s + %s -> %s",
+            self.subset,
+            self.corpus_dir,
+            self.vox1_corpus_dir,
+            self.output_dir,
+        )
+        logging.info("making trials")
+        trials_file = self.corpus_dir / "voxsrc2022_dev.txt"
+        df_in = pd.read_csv(
+            trials_file,
+            header=None,
+            sep=" ",
+            names=["key", "enroll_file", "test_file"],
+        )
+        key = ["target" if k == 1 else "nontarget" for k in df_in["key"]]
+
+        modelid = df_in["enroll_file"]
+        segmentid = df_in["test_file"]
+        df_trials = pd.DataFrame(
+            {"modelid": modelid, "segmentid": segmentid, "targettype": key}
+        )
+        df_trials.sort_values(by=["modelid", "segmentid"], inplace=True)
+        file_path = self.output_dir / "trials.csv"
+        df_trials.to_csv(file_path, index=False)
+        trials = {"trials": file_path}
+        modelid = df_trials["modelid"].sort_values().unique()
+        uniq_segmentid = df_trials["segmentid"].sort_values().unique()
+        uniq_segmentid = np.unique(np.concatenate((uniq_segmentid, modelid), axis=0))
+
+        logging.info("making enrollment map")
+        df_enroll = pd.DataFrame({"modelid": modelid, "segmentid": modelid})
+        file_path = self.output_dir / "enrollment.csv"
+        df_enroll.to_csv(file_path, index=False)
+        enrollments = {"enrollment": file_path}
+
+        logging.info("making RecordingSet")
+        vox1_segmentid = []
+        vox22_segmentid = []
+        for s in uniq_segmentid:
+            if "VoxSRC2022_dev" in s:
+                vox22_segmentid.append(s)
+            else:
+                vox1_segmentid.append(s)
+
+        vox1_rec_files = [
+            glob.glob(f"{self.vox1_corpus_dir}/**/{s}")[0] for s in vox1_segmentid
+        ]
+        # vox22_rec_files = [
+        #     glob.glob(f"{self.corpus_dir}/**/{s}")[0] for s in vox22_segmentid
+        # ]
+        vox22_rec_files = [f"{self.corpus_dir}/{s}" for s in vox22_segmentid]
+
+        rec_ids = vox22_segmentid + vox1_segmentid
+        rec_files = vox22_rec_files + vox1_rec_files
+
+        assert len(vox22_rec_files) > 0, "vox22 recording files not found"
+        assert len(vox1_rec_files) > 0, "vox1 recording files not found"
+
+        recs = pd.DataFrame({"id": rec_ids, "storage_path": rec_files})
+        recs = RecordingSet(recs)
+        recs.sort()
+
+        logging.info("getting recording durations")
+        self.get_recording_duration(recs)
+        if self.target_sample_freq:
+            recs["target_sample_freq"] = self.target_sample_freq
+
+        logging.info("making SegmentsSet")
+        segments = pd.DataFrame(
+            {
+                "id": rec_ids,
+            }
+        )
+        segments = SegmentSet(segments)
+        segments.sort()
+
+        logging.info("making dataset")
+        dataset = HypDataset(
+            segments,
+            recordings=recs,
+            enrollments=enrollments,
+            trials=trials,
+            sparse_trials=False,
+        )
+        logging.info("saving dataset at %s", self.output_dir)
+        dataset.save(self.output_dir)
+        logging.info(
+            "dataset contains %d segments",
+            len(segments),
+        )
+
+    def prepare_track12_test(self):
+        logging.info(
+            "Preparing VoxSRC22 %s corpus:%s -> %s",
+            self.subset,
+            self.corpus_dir,
+            self.output_dir,
+        )
+
+    def prepare(self):
+        if self.subset == "dev":
+            self.prepare_track12_dev()
+        else:
+            self.prepare_track12_test()
diff --git a/hyperion/feats/__init__.py b/hyperion/feats/__init__.py
deleted file mode 100644
index 9d77e032..00000000
--- a/hyperion/feats/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-"""
- Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
- Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-"""
-#
-
-from .filter_banks import FilterBankFactory
-from .feature_windows import FeatureWindowFactory
-from .stft import *
-from .mfcc import MFCC
-from .energy_vad import EnergyVAD
-from .frame_selector import FrameSelector
-from .feature_normalization import MeanVarianceNorm
diff --git a/hyperion/helpers/__init__.py b/hyperion/helpers/__init__.py
index eeaf2cce..8b48b161 100644
--- a/hyperion/helpers/__init__.py
+++ b/hyperion/helpers/__init__.py
@@ -3,16 +3,10 @@
     Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 
-from .vector_reader import VectorReader
-from .vector_class_reader import VectorClassReader
-
-from .trial_data_reader import TrialDataReader
+from .classif_trial_data_reader import ClassifTrialDataReader
 from .multi_test_trial_data_reader import MultiTestTrialDataReader
 from .multi_test_trial_data_reader_v2 import MultiTestTrialDataReaderV2
-from .classif_trial_data_reader import ClassifTrialDataReader
-
-# from .sequence_reader import SequenceReader
-# from .sequence_class_reader import SequenceClassReader
-# from .sequence_post_reader import SequencePostReader
-# from .sequence_post_class_reader import SequencePostClassReader
 from .plda_factory import PLDAFactory
+from .trial_data_reader import TrialDataReader
+from .vector_class_reader import VectorClassReader
+from .vector_reader import VectorReader
diff --git a/hyperion/helpers/classif_trial_data_reader.py b/hyperion/helpers/classif_trial_data_reader.py
index f5d74640..2f577621 100644
--- a/hyperion/helpers/classif_trial_data_reader.py
+++
b/hyperion/helpers/classif_trial_data_reader.py @@ -3,18 +3,18 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -import logging import argparse -import time import copy +import logging +import os +import sys +import time import numpy as np from ..io import HypDataReader -from ..utils import TrialNdx, SCPList -from ..transforms import TransformList +from ..np.transforms import TransformList +from ..utils import SCPList, TrialNdx class ClassifTrialDataReader(object): diff --git a/hyperion/helpers/multi_test_trial_data_reader.py b/hyperion/helpers/multi_test_trial_data_reader.py index 57355cd0..bd2d5a35 100644 --- a/hyperion/helpers/multi_test_trial_data_reader.py +++ b/hyperion/helpers/multi_test_trial_data_reader.py @@ -3,17 +3,17 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import copy +import os +import sys +import time import numpy as np from ..io import RandomAccessDataReaderFactory as DRF -from ..utils import TrialNdx, TrialKey, Utt2Info -from ..transforms import TransformList +from ..np.transforms import TransformList +from ..utils import TrialKey, TrialNdx, Utt2Info class MultiTestTrialDataReader(object): diff --git a/hyperion/helpers/multi_test_trial_data_reader_v2.py b/hyperion/helpers/multi_test_trial_data_reader_v2.py index 306f75ae..226131bf 100644 --- a/hyperion/helpers/multi_test_trial_data_reader_v2.py +++ b/hyperion/helpers/multi_test_trial_data_reader_v2.py @@ -3,17 +3,17 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import copy +import os +import sys +import time import numpy as np from ..io import RandomAccessDataReaderFactory as DRF -from ..utils import Utt2Info, TrialNdx, TrialKey -from ..transforms import TransformList +from ..np.transforms import TransformList +from ..utils import TrialKey, TrialNdx, Utt2Info class MultiTestTrialDataReaderV2(object): diff --git a/hyperion/helpers/plda_factory.py b/hyperion/helpers/plda_factory.py index b9c2ec60..0b90b334 100644 --- a/hyperion/helpers/plda_factory.py +++ b/hyperion/helpers/plda_factory.py @@ -3,16 +3,30 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from enum import Enum + import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ..np.pdfs.plda import FRPLDA, PLDA, SPLDA +from ..utils.misc import filter_func_args + + +class PLDAType(str, Enum): + frplda = "frplda" + splda = "splda" + plda = "plda" -from ..pdfs.plda import FRPLDA, SPLDA, PLDA + @staticmethod + def choices(): + return [PLDAType.frplda, PLDAType.splda, PLDAType.plda] class PLDAFactory(object): """Class to create PLDA objects.""" @staticmethod - def create_plda( + def create( plda_type, y_dim=None, z_dim=None, @@ -27,8 +41,7 @@ def create_plda( name="plda", **kwargs ): - - if plda_type == "frplda": + if plda_type == PLDAType.frplda: return FRPLDA( fullcov_W=fullcov_W, update_mu=update_mu, @@ -37,7 +50,7 @@ def create_plda( name=name, **kwargs ) - if plda_type == "splda": + if plda_type == PLDAType.splda: return SPLDA( y_dim=y_dim, fullcov_W=fullcov_W, @@ -48,7 +61,7 @@ def create_plda( **kwargs ) - if plda_type == "plda": + if plda_type == PLDAType.plda: return PLDA( y_dim=y_dim, z_dim=z_dim, @@ -71,7 +84,9 @@ def load_plda(plda_type, model_file): return PLDA.load(model_file) @staticmethod - def filter_train_args(prefix=None, **kwargs): + def filter_args(**kwargs): + return filter_func_args(PLDAFactory.create, kwargs) + valid_args 
= (
            "plda_type",
            "y_dim",
@@ -109,7 +124,7 @@ def filter_train_args(prefix=None, **kwargs):
             "update_D",
         )
 
-        for a, b in zip(ne_args1, neg_args2):
+        for a, b in zip(neg_args1, neg_args2):
             d[b] = not d[a]
             del d[a]
 
@@ -117,63 +132,62 @@
 
     @staticmethod
     def add_class_args(parser, prefix=None):
-        if prefix is None:
-            p1 = "--"
-        else:
-            p1 = "--" + prefix + "."
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
 
         parser.add_argument(
-            p1 + "plda-type",
-            default="splda",
-            choices=["frplda", "splda", "plda"],
+            "--plda-type",
+            default=PLDAType.splda,
+            choices=PLDAType.choices(),
             help="PLDA type",
         )
         parser.add_argument(
-            p1 + "y-dim", type=int, default=150, help="num. of eigenvoices"
+            "--y-dim", type=int, default=150, help="num. of eigenvoices"
         )
         parser.add_argument(
-            p1 + "z-dim", type=int, default=400, help="num. of eigenchannels"
+            "--z-dim", type=int, default=400, help="num. of eigenchannels"
         )
         parser.add_argument(
-            p1 + "diag-W",
-            default=False,
-            action="store_false",
-            help="use diagonal covariance W",
+            "--fullcov-W",
+            default=True,
+            action=ActionYesNo,
+            help="use full covariance W",
         )
         parser.add_argument(
-            p1 + "no-update-mu",
-            default=False,
-            action="store_true",
-            help="not update mu",
+            "--update-mu",
+            default=True,
+            action=ActionYesNo,
+            help="update mu",
         )
         parser.add_argument(
-            p1 + "no-update-V", default=False, action="store_true", help="not update V"
+            "--update-V", default=True, action=ActionYesNo, help="update V"
         )
         parser.add_argument(
-            p1 + "no-update-U", default=False, action="store_true", help="not update U"
+            "--update-U", default=True, action=ActionYesNo, help="update U"
        )
         parser.add_argument(
-            p1 + "no-update-B", default=False, action="store_true", help="not update B"
+            "--update-B", default=True, action=ActionYesNo, help="update B"
         )
         parser.add_argument(
-            p1 + "no-update-W", default=False, action="store_true", help="not update W"
+            "--update-W", default=True, action=ActionYesNo, help="update W"
         )
         parser.add_argument(
-            p1 + "no-update-D", default=False, action="store_true", help="not update D"
+            "--update-D", default=True, action=ActionYesNo, help="update D"
         )
         parser.add_argument(
-            p1 + "floor-iD",
+            "--floor-iD",
             type=float,
             default=1e-5,
             help="floor for inverse of D matrix",
         )
 
-        parser.add_argument(p1 + "epochs", type=int, default=40, help="num. of epochs")
+        parser.add_argument("--epochs", type=int, default=40, help="num. of epochs")
         parser.add_argument(
-            p1 + "ml-md",
+            "--ml-md",
             default="ml+md",
             choices=["ml+md", "ml", "md"],
             help=("optimization type"),
@@ -187,7 +201,12 @@ def add_class_args(parser, prefix=None):
             help=("epochs in which we do MD, if None we do it in all the epochs"),
         )
 
-        parser.add_argument(p1 + "name", default="plda", help="model name")
+        parser.add_argument("--name", default="plda", help="model name")
+        if prefix is not None:
+            outer_parser.add_argument(
+                "--" + prefix,
+                action=ActionParser(parser=parser),
+            )
 
     @staticmethod
     def filter_eval_args(prefix=None, **kwargs):
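With `PLDAType` being a `str` Enum and the no-update flags converted to `ActionYesNo`, creating a back-end programmatically mirrors the CLI arguments registered above. A sketch; the keyword values are just the parser defaults, and only `plda_type`, `y_dim`, `fullcov_W`, `update_mu`, and `name` are visible in this diff:

```python
# Sketch of the refactored factory API (values are illustrative defaults).
from hyperion.helpers import PLDAFactory
from hyperion.helpers.plda_factory import PLDAType

plda = PLDAFactory.create(
    plda_type=PLDAType.splda,  # a str Enum, so plain "splda" compares equal
    y_dim=150,                 # num. of eigenvoices
    fullcov_W=True,            # full covariance W
    update_mu=True,
    name="plda",
)
```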
of epochs") parser.add_argument( - p1 + "ml-md", + "--ml-md", default="ml+md", choices=["ml+md", "ml", "md"], help=("optimization type"), @@ -187,7 +201,12 @@ def add_class_args(parser, prefix=None): help=("epochs in which we do MD, if None we do it in all the epochs"), ) - parser.add_argument(p1 + "name", default="plda", help="model name") + parser.add_argument("--name", default="plda", help="model name") + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) @staticmethod def filter_eval_args(prefix=None, **kwargs): diff --git a/hyperion/helpers/tracking_data_reader.py b/hyperion/helpers/tracking_data_reader.py index 6dfc9a19..f6741d9a 100644 --- a/hyperion/helpers/tracking_data_reader.py +++ b/hyperion/helpers/tracking_data_reader.py @@ -3,17 +3,17 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import copy +import os +import sys +import time import numpy as np from ..io import RandomAccessDataReaderFactory as DRF -from ..utils import Utt2Info, TrialNdx, ExtSegmentList -from ..transforms import TransformList +from ..np.transforms import TransformList +from ..utils import ExtSegmentList, TrialNdx, Utt2Info class TrackingDataReader(object): diff --git a/hyperion/helpers/trial_data_reader.py b/hyperion/helpers/trial_data_reader.py index 984cdb1f..85904eb2 100644 --- a/hyperion/helpers/trial_data_reader.py +++ b/hyperion/helpers/trial_data_reader.py @@ -2,21 +2,21 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import copy +import os +import sys +import time import numpy as np from ..io import RandomAccessDataReaderFactory as DRF +from ..np.transforms import TransformList +from ..utils import TrialKey, TrialNdx # , SparseTrialNdx, SparseTrialKey from ..utils.utt2info import Utt2Info -from ..utils import TrialNdx, TrialKey -from ..transforms import TransformList -class TrialDataReader(object): +class TrialDataReader: """ Loads Ndx, enroll file and x-vectors to evaluate PLDA. 
""" @@ -34,6 +34,7 @@ def __init__( num_seg_parts=1, eval_set="enroll-test", tlist_sep=" ", + sparse=False, ): self.r = DRF.create(v_file) @@ -45,10 +46,16 @@ def __init__( test = Utt2Info.load(test_file, sep=tlist_sep) ndx = None if ndx_file is not None: - try: - ndx = TrialNdx.load(ndx_file) - except: - ndx = TrialKey.load(ndx_file).to_ndx() + if sparse: + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() + else: + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() ndx, enroll = TrialNdx.parse_eval_set(ndx, enroll, test, eval_set) if num_model_parts > 1 or num_seg_parts > 1: diff --git a/hyperion/helpers/vector_class_reader.py b/hyperion/helpers/vector_class_reader.py index 4f893aac..a9993768 100644 --- a/hyperion/helpers/vector_class_reader.py +++ b/hyperion/helpers/vector_class_reader.py @@ -3,18 +3,18 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import copy +import os +import sys +import time import numpy as np from ..io import RandomAccessDataReaderFactory as DRF -from ..utils.utt2info import Utt2Info +from ..np.transforms import TransformList from ..utils.tensors import to3D_by_class -from ..transforms import TransformList +from ..utils.utt2info import Utt2Info class VectorClassReader(object): @@ -49,7 +49,7 @@ def __init__( v[0]: int(v[1]) for v in [line.rstrip().split() for line in f] } - self.rng = np.random.RandomState(vcr_seed) + self.rng = np.random.default_rng(vcr_seed) self.csplit_max_spc = csplit_max_spc self.csplit_min_spc = csplit_min_spc self.csplit_mode = csplit_mode diff --git a/hyperion/helpers/vector_reader.py b/hyperion/helpers/vector_reader.py index 3f0fa1d2..4f480d6d 100644 --- a/hyperion/helpers/vector_reader.py +++ b/hyperion/helpers/vector_reader.py @@ -2,18 +2,18 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser -import sys -import os import argparse -import time import copy +import os +import sys +import time import numpy as np +from jsonargparse import ActionParser, ArgumentParser from ..io import RandomAccessDataReaderFactory as DRF +from ..np.transforms import TransformList from ..utils.scp_list import SCPList -from ..transforms import TransformList class VectorReader(object): diff --git a/hyperion/hyp_model.py b/hyperion/hyp_model.py deleted file mode 100644 index 0ffd2285..00000000 --- a/hyperion/hyp_model.py +++ /dev/null @@ -1,139 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -from abc import ABCMeta, abstractmethod -import os -import json -from copy import deepcopy - -import numpy as np -import h5py - -from .hyp_defs import float_save, float_cpu - - -class HypModel(object): - __metaclass__ = ABCMeta - - def __init__(self, name=None, **kwargs): - self.name = name - self._is_init = False - - def copy(self): - return deepcopy(self) - - @property - def is_init(self): - return self._is_init - - def init_to_false(self): - self._is_init = False - - @abstractmethod - def initialize(self): - pass - - @abstractmethod - def fit(self, x, sample_weights=None, x_val=None, sample_weights_val=None): - pass - - @abstractmethod - def fit_generator(self, x, x_val=None): - pass - - @abstractmethod - def save(self, file_path): - file_dir = os.path.dirname(file_path) - if not (os.path.isdir(file_dir)): - 
os.makedirs(file_dir, exist_ok=True) - with h5py.File(file_path, "w") as f: - config = self.to_json() - f.create_dataset("config", data=np.array(config, dtype="S")) - self.save_params(f) - - @abstractmethod - def save_params(self, f): - assert True, "save_params method not defined for %s" % (self.__class__.__name__) - - def _save_params_from_dict(self, f, params, dtypes=None): - if dtypes is None: - dtypes = dict((k, float_save()) for k in params) - - if self.name is None: - prefix = "" - else: - prefix = self.name + "/" - for k, v in params.items(): - if v is None: - continue - if not isinstance(v, np.ndarray): - v = np.asarray(v) - p_name = prefix + k - f.create_dataset(p_name, data=v.astype(dtypes[k], copy=False)) - - @classmethod - def load_config(cls, file_path): - try: - with h5py.File(file_path, "r") as f: - json_str = str(np.asarray(f["config"]).astype("U")) - return cls.load_config_from_json(json_str) - except: - with open(file_path, "r") as f: - return cls.load_config_from_json(f.read()) - - @classmethod - def load(cls, file_path): - with h5py.File(file_path, "r") as f: - json_str = str(np.asarray(f["config"]).astype("U")) - config = cls.load_config_from_json(json_str) - return cls.load_params(f, config) - - @classmethod - def load_params(cls, f, config): - return cls(name=config["name"]) - - @staticmethod - def _load_params_to_dict(f, name, params, dtypes=None): - if dtypes is None: - dtypes = dict((k, float_cpu()) for k in params) - if name is None: - prefix = "" - else: - prefix = name + "/" - - param_dict = {} - for k in params: - p_name = prefix + k - if p_name in f: - param_dict[k] = np.asarray(f[p_name]).astype( - dtype=dtypes[k], copy=False - ) - else: - param_dict[k] = None - return param_dict - - @abstractmethod - def get_config(self): - config = {"class_name": self.__class__.__name__, "name": self.name} - return config - - def to_json(self, **kwargs): - # Piece of code borrowed from keras - def get_json_type(obj): - # if obj is any numpy type - if type(obj).__module__ == np.__name__: - return obj.item() - - # if obj is a python 'type' - if type(obj).__name__ == type.__name__: - return obj.__name__ - - raise TypeError("Not JSON Serializable:", obj) - - config = self.get_config() - return json.dumps(config, default=get_json_type, **kwargs) - - @staticmethod - def load_config_from_json(json_str): - return json.loads(json_str) diff --git a/hyperion/io/__init__.py b/hyperion/io/__init__.py index 5ddf131b..aa5ac653 100644 --- a/hyperion/io/__init__.py +++ b/hyperion/io/__init__.py @@ -5,29 +5,21 @@ from .ark_data_reader import * from .ark_data_writer import * -from .h5_data_reader import * -from .h5_data_writer import * -from .data_rw_factory import * -from .copy_feats import CopyFeats - - -from .bin_vad_reader import BinVADReader -from .segment_vad_reader import SegmentVADReader -from .vad_rw_factory import VADReaderFactory - from .audio_reader import * from .audio_writer import * +from .bin_vad_reader import BinVADReader +from .copy_feats import CopyFeats +from .data_rw_factory import * +from .h5_data_reader import * +from .h5_data_writer import * +from .h5_merger import * +from .hyp_data_reader import * +from .hyp_data_writer import * +from .kaldi_data_reader import * from .packed_audio_reader import ( - SequentialPackedAudioReader, RandomAccessPackedAudioReader, + SequentialPackedAudioReader, ) from .packed_audio_writer import PackedAudioWriter - - -from .hyp_data_reader import * -from .hyp_data_writer import * -from .h5_merger import * -from .kaldi_data_reader import * 
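Stepping back to the `hyperion/hyp_model.py` deletion above: the numpy model base class presumably lives on under `hyperion.np` (consistent with the `..np.transforms` and `..np.pdfs` imports used throughout this patch). For reference, the h5 persistence convention the deleted class implemented, and which its replacement most likely keeps, was a JSON config dataset plus one dataset per parameter. A self-contained sketch, with illustrative function names that are not hyperion API:

```python
# Sketch of the save/load convention from the deleted HypModel:
# a 0-d bytes dataset named "config" holding the JSON-serialized model
# config, plus one dataset per parameter array in the same h5 file.
import json

import h5py
import numpy as np

def save_model(file_path, config, params):
    with h5py.File(file_path, "w") as f:
        f.create_dataset("config", data=np.array(json.dumps(config), dtype="S"))
        for name, value in params.items():
            f.create_dataset(name, data=np.asarray(value))

def load_model_config(file_path):
    with h5py.File(file_path, "r") as f:
        # mirrors the deleted load_config: decode bytes dataset back to str
        return json.loads(str(np.asarray(f["config"]).astype("U")))
```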
- - -# from .queues import * +from .segment_vad_reader import SegmentVADReader +from .vad_rw_factory import VADReaderFactory diff --git a/hyperion/io/ark_data_reader.py b/hyperion/io/ark_data_reader.py index 7f6ec350..eaf76d49 100644 --- a/hyperion/io/ark_data_reader.py +++ b/hyperion/io/ark_data_reader.py @@ -3,15 +3,17 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import numpy as np import multiprocessing as threading +from typing import Union, Optional, List, Callable, Tuple + +import numpy as np from ..hyp_defs import float_cpu -from ..utils.scp_list import SCPList -from ..utils.kaldi_matrix import KaldiMatrix, KaldiCompressedMatrix -from ..utils.kaldi_io_funcs import is_token, read_token, peek, init_kaldi_input_stream -from .data_reader import SequentialDataReader, RandomAccessDataReader +from ..utils.kaldi_io_funcs import init_kaldi_input_stream, is_token, peek, read_token +from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix + +from ..utils import FeatureSet, PathLike +from .data_reader import RandomAccessDataReader, SequentialDataReader class SequentialArkDataReader(SequentialDataReader): @@ -25,10 +27,9 @@ class SequentialArkDataReader(SequentialDataReader): part_idx: It splits the input into num_parts and writes only part part_idx, where part_idx=1,...,num_parts. num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, **kwargs) self.f = None self.lock = threading.Lock() @@ -40,7 +41,7 @@ def close(self): self.f.close() self.f = None - def _seek(self, offset): + def _seek(self, offset: int): """Moves the pointer of the input file. Args: @@ -50,7 +51,7 @@ def _seek(self, offset): delta = offset - cur_pos self.f.seek(delta, 1) - def _open_archive(self, file_path, offset=0): + def _open_archive(self, file_path: PathLike, offset: int = 0): """Opens the current file if it is not open and moves the file pointer to a given position. Closes previous open Ark files. @@ -67,7 +68,7 @@ def _open_archive(self, file_path, offset=0): if offset > 0: self._seek(offset) - def read_num_rows(self, num_records=0, assert_same_dim=True): + def read_num_rows(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -84,7 +85,7 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return keys, num_rows - def read_dims(self, num_records=0, assert_same_dim=True): + def read_dims(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -118,10 +119,8 @@ class SequentialArkFileDataReader(SequentialArkDataReader): split_by_key: If True, all the elements with the same key go to the same part. 
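The script-based ark readers below replace `SCPList` with `FeatureSet`. Based on the columns the new code reads (`id`, `storage_path`, `storage_byte`, plus optional `start`/`num_frames` for row ranges), the manifest can be sketched as an indexed table; treat the exact `FeatureSet` API as hyperion-internal, this only shows the shape of the data:

```python
# Hypothetical feature manifest consistent with the columns consumed by the
# FeatureSet-based readers in this diff; values are made up.
import pandas as pd

feature_set = pd.DataFrame(
    {
        "id": ["utt1", "utt2"],
        "storage_path": ["feats/arch.1.ark", "feats/arch.1.ark"],
        "storage_byte": [16, 52414],  # byte offset of each matrix in the ark
        # optional row-range columns, used to build range_spec below:
        # "start": [0, 0], "num_frames": [498, 312],
    }
)
print(feature_set.iloc[0]["storage_path"])  # -> feats/arch.1.ark
```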
""" - def __init__(self, file_path, **kwargs): - super(SequentialArkFileDataReader, self).__init__( - file_path, permissive=False, **kwargs - ) + def __init__(self, file_path: PathLike, **kwargs): + super().__init__(file_path, permissive=False, **kwargs) self._open_archive(self.file_path) self._eof = False self._keys = None @@ -149,7 +148,7 @@ def keys(self): return self._keys - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -186,7 +185,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -204,12 +209,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): key: List of recording names. data: List of feature matrices/vectors or 3D/2D numpy array. """ - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] count = 0 @@ -222,8 +223,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): self._eof = True break - row_offset_i = row_offset[i] if row_offset_is_list else row_offset - num_rows_i = num_rows[i] if num_rows_is_list else num_rows + row_offset_i = row_offset[count] if row_offset_is_list else row_offset + num_rows_i = num_rows[count] if num_rows_is_list else num_rows binary = init_kaldi_input_stream(self.f) data_i = KaldiMatrix.read( @@ -262,28 +263,25 @@ class SequentialArkScriptDataReader(SequentialArkDataReader): part_idx: It splits the input into num_parts and writes only part part_idx, where part_idx=1,...,num_parts. num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. 
""" - def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): - super(SequentialArkScriptDataReader, self).__init__( - file_path, permissive=False, **kwargs - ) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + def __init__( + self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs + ): + super().__init__(file_path, permissive=False, **kwargs) + self.feature_set = FeatureSet.load(self.file_path) if self.num_parts > 1: - self.scp = self.scp.split( - self.part_idx, self.num_parts, group_by_key=self.split_by_key - ) + self.feature_set = self.feature_set.split(self.part_idx, self.num_parts) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) self.cur_item = 0 @property def keys(self): - return self.scp.key + return self.feature_set["id"] def reset(self): """Closes all the open Ark files and puts the read pointer pointing @@ -293,9 +291,9 @@ def reset(self): def eof(self): """Returns True when all the elements in the scp have been read.""" - return self.cur_item == len(self.scp) + return self.cur_item == len(self.feature_set) - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -316,15 +314,18 @@ def read_shapes(self, num_records=0, assert_same_dim=True): for i in range(num_records): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + offset = feature_spec["storage_byte"] + file_path = feature_spec["storage_path"] self._open_archive(file_path, offset) binary = init_kaldi_input_stream(self.f) shape_i = KaldiMatrix.read_shape(self.f, binary, sequential_mode=True) - - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) keys.append(key) shapes.append(shape_i) @@ -336,7 +337,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. 
Args: @@ -357,12 +364,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if num_records == 0: - num_records = len(self.scp) - self.cur_item + num_records = len(self.feature_set) - self.cur_item - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] @@ -371,7 +374,14 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + offset = feature_spec["storage_byte"] + file_path = feature_spec["storage_path"] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows @@ -415,21 +425,24 @@ class RandomAccessArkDataReader(RandomAccessDataReader): features after reading them from disk. permissive: If True, if the data that we want to read is not in the file it returns an empty matrix, if False it raises an exception. - scp_sep: Separator for scp files (default ' '). """ def __init__( - self, file_path, path_prefix=None, transform=None, permissive=False, scp_sep=" " + self, + file_path: PathLike, + path_prefix: Optional[PathLike] = None, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, ): - super(RandomAccessArkDataReader, self).__init__( - file_path, transform, permissive - ) + super().__init__(file_path, transform, permissive) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) - archives, archive_idx = np.unique(self.scp.file_path, return_inverse=True) + archives, archive_idx = np.unique( + self.feature_set["storage_path"], return_inverse=True + ) self.archives = archives self.archive_idx = archive_idx self.f = [None] * len(self.archives) @@ -446,7 +459,7 @@ def close(self): f.close() self.f = [None] * len(self.f) - def _open_archive(self, key_idx, offset=0): + def _open_archive(self, key_idx: int, offset: int = 0): """Opens the Ark file corresponding to a given feature/matrix if it is not already open and moves the file pointer to the point where we can read that feature matrix. @@ -471,7 +484,9 @@ def _open_archive(self, key_idx, offset=0): return f, self.locks[archive_idx] - def read_num_rows(self, keys, assert_same_dim=True): + def read_num_rows( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -487,7 +502,9 @@ def read_num_rows(self, keys, assert_same_dim=True): - num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=np.int) + num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return num_rows - def read_dims(self, keys, assert_same_dim=True): + def read_dims( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of columns in the feature matrices of the dataset.
Args: @@ -505,7 +522,9 @@ def read_dims(self, keys, assert_same_dim=True): assert np.all(dims == dims[0]) return dims - def read_shapes(self, keys, assert_same_dim=True): + def read_shapes( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the shapes in the feature matrices of the dataset. Args: @@ -523,25 +542,26 @@ def read_shapes(self, keys, assert_same_dim=True): shapes = [] for key in keys: - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: shapes.append((0,)) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) - + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] + offset = feature_spec["storage_byte"] f, lock = self._open_archive(index) with lock: f.seek(offset, 0) binary = init_kaldi_input_stream(f) shape_i = KaldiMatrix.read_shape(f, binary, sequential_mode=False) - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) shapes.append(shape_i) @@ -551,7 +571,13 @@ def read_shapes(self, keys, assert_same_dim=True): return shapes - def read(self, keys, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str], np.array], + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: @@ -572,12 +598,8 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: @@ -586,15 +608,20 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): data = [] for i, key in enumerate(keys): - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: data.append(np.array([], dtype=float_cpu())) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] + offset = feature_spec["storage_byte"] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows diff --git a/hyperion/io/ark_data_writer.py b/hyperion/io/ark_data_writer.py index 50fdd3f6..26f77112 100644 --- a/hyperion/io/ark_data_writer.py +++ b/hyperion/io/ark_data_writer.py @@ -3,13 +3,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import numpy as np +from typing import Union, Optional, List, Dict +import numpy as np +import pandas as pd from ..hyp_defs import float_save -from ..utils.scp_list 
import SCPList -from ..utils.kaldi_io_funcs import is_token, write_token, init_kaldi_output_stream -from ..utils.kaldi_matrix import KaldiMatrix, KaldiCompressedMatrix +from ..utils.kaldi_io_funcs import init_kaldi_output_stream, is_token, write_token +from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix +from ..utils import PathLike from .data_writer import DataWriter @@ -26,11 +27,17 @@ class ArkDataWriter(DataWriter): {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 1byte-unsigned-integer, 1byte-0-1}. - scp_sep: Separator for scp files (default ' '). + """ - def __init__(self, archive_path, script_path=None, binary=True, **kwargs): - super(ArkDataWriter, self).__init__(archive_path, script_path, **kwargs) + def __init__( + self, + archive_path: PathLike, + script_path: Optional[PathLike] = None, + binary: bool = True, + **kwargs, + ): + super().__init__(archive_path, script_path, **kwargs) self.binary = binary if binary: @@ -38,10 +45,12 @@ def __init__(self, archive_path, script_path=None, binary=True, **kwargs): else: self.f = open(archive_path, "w") - if script_path is not None: - self.f_script = open(script_path, "w") - else: - self.f_script = None + if script_path is not None and not self.script_is_scp: + columns = ["id", "storage_path", "storage_byte"] + if self.metadata_columns is not None: + columns += self.metadata_columns + row = self.script_sep.join(columns) + self.f_script.write(f"{row}\n") def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type @@ -65,7 +74,7 @@ def flush(self): if self.f_script is not None: self.f_script.flush() - def _convert_data(self, data): + def _convert_data(self, data: np.array): """Converts the feature matrix from numpy array to KaldiMatrix or KaldiCompressedMatrix. """ @@ -87,7 +96,12 @@ def _convert_data(self, data): raise ValueError("Data is not ndarray or KaldiMatrix") - def write(self, keys, data): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, + ): """Writes data to file. Args: @@ -97,9 +111,7 @@ def write(self, keys, data): it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. 
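Below, `ArkDataWriter.write` gains an optional `metadata` argument and the script output can be a headed csv/tsv (the manifest format sketched earlier) instead of a Kaldi scp. A usage sketch, assuming `standardize_write_args` normalizes a dict of per-key values into column lists as the type hints suggest, and that `metadata_columns` is forwarded through `**kwargs` to the `DataWriter` base class:

```python
# Sketch only: writing one feature matrix plus a metadata column to a
# csv script file alongside the binary ark archive.
import numpy as np

from hyperion.io.ark_data_writer import ArkDataWriter

feats = np.random.randn(100, 40).astype("float32")
with ArkDataWriter("feats.ark", script_path="feats.csv",
                   metadata_columns=["num_frames"]) as writer:
    writer.write(["utt1"], [feats], metadata={"num_frames": [feats.shape[0]]})
```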
""" - if isinstance(keys, str): - keys = [keys] - data = [data] + keys, data, metadata = self.standardize_write_args(keys, data, metadata) for i, key_i in enumerate(keys): assert is_token(key_i), "Token %s not valid" % key_i @@ -112,9 +124,15 @@ def write(self, keys, data): data_i.write(self.f, self.binary) if self.f_script is not None: - self.f_script.write( - "%s%s%s:%d\n" % (key_i, self.scp_sep, self.archive_path, pos) - ) + if self.script_is_scp: + self.f_script.write(f"{key_i} {self.archive_path}:{pos}\n") + else: + columns = [key_i, str(self.archive_path), str(pos)] + if metadata is not None: + metadata_i = [str(m[i]) for m in metadata] + columns += metadata_i + row = self.script_sep.join(columns) + self.f_script.write(f"{row}\n") if self._flush: self.flush() diff --git a/hyperion/io/audio_reader.py b/hyperion/io/audio_reader.py index c6bdeab8..a1adaef0 100644 --- a/hyperion/io/audio_reader.py +++ b/hyperion/io/audio_reader.py @@ -3,17 +3,20 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os -import logging import io +import logging import math +import os import subprocess -import soundfile as sf import numpy as np +import pandas as pd +import soundfile as sf +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from typing import Union, Optional, List from ..hyp_defs import float_cpu -from ..utils import SCPList, SegmentList +from ..utils import RecordingSet, SegmentSet, PathLike valid_ext = [ ".wav", @@ -33,7 +36,7 @@ ".sds", ".sf", ".voc", - "w64", + ".w64", ".wve", ".xi", ] @@ -43,38 +46,36 @@ class AudioReader(object): """Class to read audio files from wav, flac or pipe Attributes: - file_path: scp file with formant file_key wavspecifier (audio_file/pipe) or SCPList object. - segments_path: segments file with format: segment_id file_id tbeg tend + recordings: RecordingSet or file path to RecordingSet + segments: SegmentSet or file path to SegmentSet wav_scale: multiplies signal by scale factor """ - def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1): - self.file_path = file_path - if isinstance(file_path, SCPList): - self.scp = file_path - else: - self.scp = SCPList.load(file_path, sep=" ", is_wav=True) - - self.segments_path = segments_path - if segments_path is None: - self.segments = None - self.with_segments = False - else: + def __init__( + self, + recordings: Union[RecordingSet, PathLike], + segments: Union[SegmentSet, PathLike, None] = None, + wav_scale: float = 1.0, + ): + if not isinstance(recordings, RecordingSet): + recordings = RecordingSet.load(recordings) + + self.recordings = recordings + + self.with_segments = False + if segments is not None: self.with_segments = True - if isinstance(file_path, SegmentList): - self.segments = segments_path - else: - self.segments = SegmentList.load( - segments_path, sep=" ", index_by_file=False - ) + if not isinstance(segments, SegmentSet): + segments = SegmentSet.load(segments) + self.segments = segments self.wav_scale = wav_scale @property def keys(self): if self.with_segments: - return np.asarray(self.segments["segment_id"]) - return self.scp.key + return self.segments["id"].values + return self.recordings["id"].values def __enter__(self): """Function required when entering contructions of type @@ -93,7 +94,12 @@ def __exit__(self, exc_type, exc_value, traceback): pass @staticmethod - def read_wavspecifier(wavspecifier, scale=2 ** 15, time_offset=0, time_dur=0): + def read_wavspecifier( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0.0, + 
time_dur: float = 0.0, + ): """Reads an audiospecifier (audio_file/pipe) It reads from pipe or from all the files that can be read by `libsndfile ` @@ -109,47 +115,29 @@ def read_wavspecifier(wavspecifier, scale=2 ** 15, time_offset=0, time_dur=0): wavspecifier = wavspecifier.strip() if wavspecifier[-1] == "|": wavspecifier = wavspecifier[:-1] - x, fs = AudioReader.read_pipe(wavspecifier, scale) - if time_offset == 0 and time_dur == 0: - return x, fs - - start_sample = int(math.floor(time_offset * fs)) - num_samples = int(math.floor(time_dur * fs)) - if num_samples == 0: - return x[start_sample:], fs - - end_sample = start_sample + num_samples - assert end_sample <= len(x) - return x[start_sample:end_sample], fs + return AudioReader.read_pipe(wavspecifier, scale, time_offset, time_dur) ext = os.path.splitext(wavspecifier)[1] if ext in valid_ext: - if time_offset == 0 and time_dur == 0: - x, fs = sf.read(wavspecifier, dtype=float_cpu()) - x *= scale - return x, fs - - with sf.SoundFile(wavspecifier, "r") as f: - fs = f.samplerate - start_sample = int(math.floor(time_offset * fs)) - num_samples = int(math.floor(time_dur * fs)) - f.seek(start_sample) - if num_samples > 0: - x = scale * f.read(num_samples, dtype=float_cpu()) - else: - x = scale * f.read(dtype=float_cpu()) - return x, fs + return AudioReader.read_file(wavspecifier, scale, time_offset, time_dur) raise Exception("Unknown format for %s" % (wavspecifier)) @staticmethod - def read_pipe(wavspecifier, scale=2 ** 15): + def read_pipe( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0, + time_dur: float = 0, + ): """Reads wave file from a pipe Args: wavspecifier: Shell command with pipe output scale: Multiplies signal by scale factor """ - # proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + if wavspecifier[-1] == "|": + wavspecifier = wavspecifier[:-1] + proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE) pipe = proc.communicate()[0] if proc.returncode != 0: @@ -159,9 +147,91 @@ def read_pipe(wavspecifier, scale=2 ** 15): ) x, fs = sf.read(io.BytesIO(pipe), dtype=float_cpu()) x *= scale - return x, fs + if time_offset == 0 and time_dur == 0: + return x, fs + + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + if num_samples == 0: + return x[start_sample:], fs - def _read_segment(self, segment, time_offset=0, time_dur=0): + end_sample = start_sample + num_samples + assert end_sample <= len(x) + return x[start_sample:end_sample], fs + + @staticmethod + def read_file_sf( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0, + time_dur: float = 0, + ): + if time_offset == 0 and time_dur == 0: + x, fs = sf.read(wavspecifier, dtype=float_cpu()) + x *= scale + return x, fs + + with sf.SoundFile(wavspecifier, "r") as f: + fs = f.samplerate + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + f.seek(start_sample) + if num_samples > 0: + x = scale * f.read(num_samples, dtype=float_cpu()) + else: + x = scale * f.read(dtype=float_cpu()) + + return x, fs + + @staticmethod + def read_file( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0, + time_dur: float = 0, + ): + try: + return AudioReader.read_file_sf(wavspecifier, scale, time_offset, time_dur) + except: + # some files produce error in the fseek after reading the data, + # this seems an issue from pysoundfile or soundfile lib itself + # we try to read 
from + # time-offset to the end of the file, and remove the extra frames later, + # this solves the problem in most cases + logging.info( + ( + "error-1 reading keys=%s offset=%f duration=%f " + "retrying reading until end-of-file ..." + ), + wavspecifier, + time_offset, + time_dur, + ) + try: + x, fs = AudioReader.read_file_sf(wavspecifier, scale, time_offset) + num_samples = int(math.floor(time_dur * fs)) + x = x[:num_samples] + return x, fs + except: + logging.info( + ( + "error-2 reading keys=%s offset=%f duration=%f " + "retrying reading full file ..." + ), + wavspecifier, + time_offset, + time_dur, + ) + + x, fs = AudioReader.read_file_sf(wavspecifier, scale) + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + x = x[start_sample : start_sample + num_samples] + return x, fs + + def _read_segment( + self, segment: pd.Series, time_offset: float = 0, time_dur: float = 0 + ): """Reads a wave segment Args: @@ -169,29 +239,14 @@ def _read_segment(self, segment, time_offset=0, time_dur=0): Returns: Wave, sampling frequency """ - file_id = segment["file_id"] - t_beg = segment["tbeg"] + time_offset - t_end = segment["tend"] - if time_dur > 0: - t_end_new = t_beg + time_dur - assert t_end_new <= t_end - t_end = t_end_new - - file_path, _, _ = self.scp[file_id] - x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale) - num_samples_i = len(x_i) - s_beg = int(t_beg * fs_i) - if s_beg >= num_samples_i: - raise Exception( - "segment %s tbeg=%.2f (num_sample=%d) longer that wav file %s (num_samples=%d)" - % (key, tbeg, sbeg, file_id, num_samples_i) - ) - - s_end = int(t_end * fs_i) - if s_end > num_samples_i or t_end < 0: - s_end = num_samples_i - - x_i = x_i[s_beg:s_end] + recording_id = segment["recording_id"] + t_start = segment["start"] + time_offset + t_dur = segment["duration"] - time_offset + if time_dur > 0: + assert time_dur <= t_dur + t_dur = time_dur + storage_path = self.recordings.loc[recording_id, "storage_path"] + x_i, fs_i = self.read_wavspecifier(storage_path, self.wav_scale, t_start, t_dur) return x_i, fs_i def read(self): @@ -201,13 +253,13 @@ def read(self): class SequentialAudioReader(AudioReader): def __init__( self, - file_path, - segments_path=None, - wav_scale=2 ** 15 - 1, - part_idx=1, - num_parts=1, + recordings: Union[RecordingSet, PathLike], + segments: Union[SegmentSet, PathLike, None] = None, + wav_scale: float = 1.0, + part_idx: int = 1, + num_parts: int = 1, ): - super().__init__(file_path, segments_path, wav_scale=wav_scale) + super().__init__(recordings, segments, wav_scale=wav_scale) self.cur_item = 0 self.part_idx = part_idx self.num_parts = num_parts @@ -215,9 +267,7 @@ def __init__( if self.with_segments: self.segments = self.segments.split(self.part_idx, self.num_parts) else: - self.scp = self.scp.split( - self.part_idx, self.num_parts, group_by_key=False - ) + self.recordings = self.recordings.split(self.part_idx, self.num_parts) def __iter__(self): """Needed to build an iterator, e.g.: @@ -257,9 +307,9 @@ def eof(self): """ if self.with_segments: return self.cur_item == len(self.segments) - return self.cur_item == len(self.scp) + return self.cur_item == len(self.recordings) - def read(self, num_records=0, time_offset=0, time_durs=0): + def read(self, num_records: int = 0, time_offset: float = 0, time_durs: float = 0): """Reads next num_records audio files Args: @@ -276,7 +326,7 @@ def read(self, num_records=0, time_offset=0, time_durs=0): if self.with_segments: num_records = len(self.segments) - self.cur_item else: - num_records = len(self.scp) - self.cur_item + num_records = 
len(self.recordings) - self.cur_item offset_is_list = isinstance(time_offset, (list, np.ndarray)) dur_is_list = isinstance(time_durs, (list, np.ndarray)) @@ -292,11 +342,13 @@ def read(self, num_records=0, time_offset=0, time_durs=0): dur_i = time_durs[i] if dur_is_list else time_durs if self.with_segments: - segment = self.segments[self.cur_item] - key = segment["segment_id"] + segment = self.segments.iloc[self.cur_item] + key = segment["id"] x_i, fs_i = self._read_segment(segment, offset_i, dur_i) else: - key, file_path, _, _ = self.scp[self.cur_item] + segment = self.recordings.iloc[self.cur_item] + key = segment["id"] + file_path = segment["storage_path"] x_i, fs_i = self.read_wavspecifier( file_path, self.wav_scale, offset_i, dur_i ) @@ -314,21 +366,21 @@ def filter_args(**kwargs): return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." + def add_class_args(parser, prefix: Optional[str] = None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "wav-scale", - default=2 ** 15 - 1, + "--wav-scale", + default=1.0, + # default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) try: parser.add_argument( - p1 + "part-idx", + "--part-idx", type=int, default=1, help=( @@ -336,7 +388,7 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "num-parts", + "--num-parts", type=int, default=1, help=( @@ -346,18 +398,35 @@ def add_class_args(parser, prefix=None): except: pass + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + add_argparse_args = add_class_args class RandomAccessAudioReader(AudioReader): - def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1): - super().__init__(file_path, segments_path, wav_scale) + def __init__( + self, + recordings: Union[RecordingSet, PathLike], + segments: Union[SegmentSet, PathLike, None] = None, + wav_scale: float = 1.0, + ): + super().__init__(recordings, segments, wav_scale) - def _read(self, keys, time_offset=0, time_durs=0): + def read( + self, + keys: Union[str, List, np.array], + time_offset: float = 0, + time_durs: float = 0, + ): """Reads the waveforms for the recordings in keys. Args: keys: List of recording/segment_ids names. + time_offset: float or float list with time-offsets + time_durs: float or float list with durations Returns: data: List of waveforms @@ -376,16 +445,16 @@ def _read(self, keys, time_offset=0, time_durs=0): dur_i = time_durs[i] if dur_is_list else time_durs if self.with_segments: - if not (key in self.segments): + if not (key in self.segments.index): raise Exception("Key %s not found" % key) - segment = self.segments[key] + segment = self.segments.loc[key] x_i, fs_i = self._read_segment(segment, offset_i, dur_i) else: - if not (key in self.scp): + if not (key in self.recordings.index): raise Exception("Key %s not found" % key) - file_path, _, _ = self.scp[key] + file_path = self.recordings.loc[key, "storage_path"] x_i, fs_i = self.read_wavspecifier( file_path, self.wav_scale, offset_i, dur_i ) @@ -395,57 +464,52 @@ def _read(self, keys, time_offset=0, time_durs=0): return data, fs - def read(self, keys, time_offset=0, time_durs=0): - """Reads the waveforms for the recordings in keys. - - Args: - keys: List of recording/segment_ids names. - - Returns: - data: List of waveforms - fs: List of sampling freq. 
- """ - try: - x, fs = self._read(keys, time_offset=time_offset, time_durs=time_durs) - except: - if isinstance(keys, str): - keys = [keys] - - if not isinstance(time_offset, (list, np.ndarray)): - time_offset = [time_offset] * len(keys) - if not isinstance(time_durs, (list, np.ndarray)): - time_durs = [time_durs] * len(keys) - - try: - # some files produce error in the fseek after reading the data, - # this seems an issue from pysoundfile or soundfile lib itself - # we try to read from - # time-offset to the end of the file, and remove the extra frames later, - # this solves the problem in most cases - logging.info( - ( - "error-1 reading at keys={} offset={} " - "retrying reading until end-of-file ..." - ).format(keys, time_offset) - ) - x, fs = self._read(keys, time_offset=time_offset) - for i in range(len(x)): - end_sample = int(time_durs[i] * fs[i]) - x[i] = x[i][:end_sample] - except: - # try to read the full file - logging.info( - ( - "error-2 reading at key={}, " "retrying reading full file ..." - ).format(keys) - ) - x, fs = self._read(keys) - for i in range(len(x)): - start_sample = int(time_offset[i] * fs[i]) - end_sample = start_sample + int(time_durs[i] * fs[i]) - x[i] = x[i][start_sample:end_sample] - - return x, fs + # def read(self, keys, time_offset=0, time_durs=0): + # """Reads the waveforms for the recordings in keys. + + # Args: + # keys: List of recording/segment_ids names. + + # Returns: + # data: List of waveforms + # fs: List of sampling freq. + # """ + # try: + # x, fs = self._read(keys, time_offset=time_offset, time_durs=time_durs) + # except: + # if isinstance(keys, str): + # keys = [keys] + + # if not isinstance(time_offset, (list, np.ndarray)): + # time_offset = [time_offset] * len(keys) + # if not isinstance(time_durs, (list, np.ndarray)): + # time_durs = [time_durs] * len(keys) + + # try: + # logging.info( + # ( + # "error-1 reading at keys={} offset={} " + # "retrying reading until end-of-file ..." + # ).format(keys, time_offset) + # ) + # x, fs = self._read(keys, time_offset=time_offset) + # for i in range(len(x)): + # end_sample = int(time_durs[i] * fs[i]) + # x[i] = x[i][:end_sample] + # except: + # # try to read the full file + # logging.info( + # ( + # "error-2 reading at key={}, " "retrying reading full file ..." + # ).format(keys) + # ) + # x, fs = self._read(keys) + # for i in range(len(x)): + # start_sample = int(time_offset[i] * fs[i]) + # end_sample = start_sample + int(time_durs[i] * fs[i]) + # x[i] = x[i][start_sample:end_sample] + + # return x, fs @staticmethod def filter_args(**kwargs): @@ -453,17 +517,21 @@ def filter_args(**kwargs): return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
+ def add_class_args(parser, prefix: Optional[str] = None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "wav-scale", - default=2 ** 15 - 1, + "--wav-scale", + default=1.0, + # default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) add_argparse_args = add_class_args diff --git a/hyperion/io/audio_writer.py b/hyperion/io/audio_writer.py index 2fb9ce3c..ca0dde9f 100644 --- a/hyperion/io/audio_writer.py +++ b/hyperion/io/audio_writer.py @@ -5,15 +5,19 @@ import os import re -import soundfile as sf import numpy as np +import soundfile as sf +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from typing import Union, Optional, List +from pathlib import Path from ..hyp_defs import float_cpu -from ..utils.scp_list import SCPList from ..utils.kaldi_io_funcs import is_token +from ..utils import PathLike from .audio_reader import valid_ext + subtype_to_npdtype = { "PCM_32": "int32", "ALAW": "int16", @@ -23,12 +27,33 @@ "DOUBLE": "float64", "MS_ADPCM": "int16", "ULAW": "int16", - "PCM_U8": "uint8", - "PCM_S8": "int8", + "PCM_S8": "int16", "VORBIS": "float32", "GSM610": "int16", "G721_32": "int16", - "PCM_24": "int24", + "PCM_24": "int32", +} + +scale_32 = 2 ** 31 - 1 +scale_24 = 2 ** 23 - 1 +scale_16 = 2 ** 15 - 1 +scale_8 = 2 ** 7 - 1 + + +subtype_to_scale = { + "PCM_32": scale_32, + "ALAW": scale_16, + "IMA_ADPCM": scale_16, + "FLOAT": 1, + "PCM_16": scale_16, + "DOUBLE": 1, + "MS_ADPCM": scale_16, + "ULAW": scale_16, + "PCM_S8": scale_8, + "VORBIS": 1, + "GSM610": scale_16, + "G721_32": scale_16, + "PCM_24": scale_24, } @@ -37,43 +62,56 @@ class AudioWriter(object): Attributes: output_path: output data file path. - script_path: optional output scp file. + script_path: optional output kaldi .scp or pandas .csv file. audio_format: audio file format audio_subtype: subtype of audio in [PCM_16, PCM_32, FLOAT, DOUBLE, ...], if None, it uses soundfile defaults (recommended) - scp_sep: Separator for scp files (default ' '). + wav_scale: scale of the input waveform """ def __init__( self, - output_path, - script_path=None, - audio_format="wav", - audio_subtype=None, - scp_sep=" ", + output_path: PathLike, + script_path: Optional[PathLike] = None, + audio_format: str = "wav", + audio_subtype: Optional[str] = None, + wav_scale: float = 1.0, ): - self.output_path = output_path - self.script_path = script_path + self.output_path = Path(output_path) + self.script_path = Path(script_path) if script_path is not None else None self.audio_format = audio_format - self.scp_sep = scp_sep + self.output_path.mkdir(exist_ok=True, parents=True) assert "." + self.audio_format in valid_ext if audio_subtype is None: self.subtype = sf.default_subtype(self.audio_format) else: - self.subtype = audio_subtype + self.subtype = audio_subtype.upper() assert sf.check_format(self.audio_format, self.subtype) - if not os.path.exists(output_path): - try: - os.makedirs(output_path) - except FileExistsError: - pass + self._dtype = subtype_to_npdtype[self.subtype] + + self.wav_scale = wav_scale + # we multiply the audio for this number before saving it. 
+ self._output_wav_scale = subtype_to_scale[self.subtype] / wav_scale + self.script_is_scp = False + self.script_sep = None + self.f_script = None if script_path is not None: - self.f_script = open(script_path, "w") - else: - self.f_script = None + self.script_path.parent.mkdir(exist_ok=True, parents=True) + script_ext = self.script_path.suffix + self.script_is_scp = script_ext == ".scp" + + if self.script_is_scp: + self.f_script = open(self.script_path, "w") + else: + self.script_sep = "," if script_ext == ".csv" else "\t" + self.f_script = open(self.script_path, "w", encoding="utf-8") + row = self.script_sep.join( + ["id", "storage_path", "duration", "sample_freq"] + ) + self.f_script.write(f"{row}\n") def __enter__(self): """Function required when entering contructions of type @@ -96,7 +134,12 @@ def close(self): if self.f_script is not None: self.f_script.close() - def write(self, keys, data, fs): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + fs: Union[int, float, List[int], List[float], np.array], + ): """Writes waveform to audio file. Args: @@ -109,8 +152,7 @@ def write(self, keys, data, fs): data = [data] fs_is_list = isinstance(fs, (list, np.ndarray)) - assert self.subtype in subtype_to_npdtype - dtype = subtype_to_npdtype[self.subtype] + output_files = [] for i, key_i in enumerate(keys): assert is_token(key_i), "Token %s not valid" % key_i @@ -120,14 +162,21 @@ def write(self, keys, data, fs): file_basename, self.audio_format, ) - fs_i = fs[i] if fs_is_list else fs - data_i = data[i].astype(dtype, copy=False) + fs_i = int(fs[i]) if fs_is_list else fs + data_i = (self._output_wav_scale * data[i]).astype(self._dtype, copy=False) sf.write(output_file, data_i, fs_i, subtype=self.subtype) output_files.append(output_file) if self.f_script is not None: - self.f_script.write("%s%s%s\n" % (key_i, self.scp_sep, output_file)) + if self.script_is_scp: + self.f_script.write(f"{key_i} {output_file}\n") + else: + duration_i = data_i.shape[-1] / fs_i + row = self.script_sep.join( + [key_i, output_file, str(duration_i), str(fs_i)] + ) + self.f_script.write(f"{row}\n") self.f_script.flush() return output_files @@ -135,40 +184,42 @@ def write(self, keys, data, fs): @staticmethod def filter_args(**kwargs): valid_args = ( - "output_fs", - "output_wav_scale", - "output_audio_format", - "output_audio_subtype", - ) - return dict( - (re.sub("output_", "", k), kwargs[k]) for k in valid_args if k in kwargs + "wav_scale", + "audio_format", + "audio_subtype", ) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None): - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
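The new `subtype_to_scale` table above drives output scaling: samples are multiplied by `subtype_to_scale[subtype] / wav_scale` before the dtype cast. A worked example for PCM_16 with unit-scale input:

```python
# Worked example of AudioWriter's scaling: with wav_scale=1.0 (input in
# [-1, 1]) and subtype PCM_16, _output_wav_scale = (2**15 - 1) / 1.0 = 32767.
import numpy as np

scale_16 = 2 ** 15 - 1
wav_scale = 1.0
x = np.array([-1.0, 0.0, 0.5, 1.0])
x_int16 = (scale_16 / wav_scale * x).astype("int16")
print(x_int16)  # -> [-32767      0  16383  32767]
```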
- - # parser.add_argument(p1+'output-wav-scale', default=1, type=float, - # help=('scale to divide the waveform before writing')) + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "output-audio-format", + "--audio-format", default="flac", choices=["flac", "ogg", "wav"], help=("output audio format"), ) parser.add_argument( - p1 + "output-audio-subtype", + "--audio-subtype", default=None, - choices=["pcm_16", "pcm_24", "float", "double", "vorbis"], + choices=["pcm_16", "pcm_24", "pcm_32", "float", "double", "vorbis"], help=("coding format for audio file"), ) - # parser.add_argument(p1+'output-fs', default=16000, type=int, - # help=('output sample frequency')) + try: + parser.add_argument( + "--wav-scale", default=1.0, type=float, help=("input waveform scale wrt 1"), ) + except: + pass + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), ) add_argparse_args = add_class_args diff --git a/hyperion/io/bin_vad_reader.py b/hyperion/io/bin_vad_reader.py index 452eb106..8ce91d15 100644 --- a/hyperion/io/bin_vad_reader.py +++ b/hyperion/io/bin_vad_reader.py @@ -4,12 +4,13 @@ """ import logging + import numpy as np from ..hyp_defs import float_cpu from ..utils.vad_utils import bin_vad_to_timestamps -from .vad_reader import VADReader from .data_rw_factory import RandomAccessDataReaderFactory as DRF +from .vad_reader import VADReader class BinVADReader(VADReader): @@ -17,13 +18,12 @@ def __init__( self, rspecifier, path_prefix=None, - scp_sep=" ", frame_length=25, frame_shift=10, snip_edges=False, ): - r = DRF.create(rspecifier, path_prefix, scp_sep=scp_sep) + r = DRF.create(rspecifier, path_prefix) super().__init__(r.file_path, r.permissive) self.r = r self.frame_shift = frame_shift @@ -59,7 +59,7 @@ def read( vad = self.r.read(keys) output_vad = [] for i in range(len(keys)): - vad_i = vad[i].astype(np.bool, copy=False) + vad_i = vad[i].astype(bool, copy=False) offset_i = offset[i] if offset_is_list else offset num_frames_i = num_frames[i] if num_frames_is_list else num_frames vad_i = self._get_bin_vad_slice(vad_i, offset_i, num_frames_i) @@ -77,7 +77,7 @@ def read_timestamps(self, keys, merge_tol=0.001): vad = self.r.read(keys) ts = [] for i in range(len(keys)): - vad_i = vad[i].astype(np.bool, copy=False) + vad_i = vad[i].astype(bool, copy=False) ts_i = bin_vad_to_timestamps( vad_i, self.frame_length / 1000, diff --git a/hyperion/io/data_reader.py b/hyperion/io/data_reader.py index f0c61d3a..73c120b5 100644 --- a/hyperion/io/data_reader.py +++ b/hyperion/io/data_reader.py @@ -4,19 +4,26 @@ """ import logging +import multiprocessing from abc import ABCMeta, abstractmethod +from typing import Union, Optional, List, Callable, Tuple + import numpy as np -import multiprocessing from ..hyp_defs import float_cpu -from ..utils.scp_list import SCPList -from ..transforms import TransformList +from ..np.transforms import TransformList +from ..utils import PathLike class DataReader(object): __metaclass__ = ABCMeta - def __init__(self, file_path, transform=None, permissive=False): + def __init__( + self, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + ): """Abstract base class to read Ark or hdf5 feature files. 
Attributes: @@ -56,7 +63,7 @@ def close(self): pass @staticmethod - def _squeeze(data, permissive=False): + def _squeeze(data: np.array, permissive: bool = False): """Converts list of matrices to 3D numpy array or list of vectors to 2D numpy array. @@ -120,7 +127,7 @@ def _combine_ranges(read_range, row_offset, num_rows): return row_offset, num_rows @staticmethod - def _apply_range_to_shape(shape, row_offset, num_rows): + def _apply_range_to_shape(shape: Tuple[int, int], row_offset: int, num_rows: int): """Modifies shape given the user defined row_offset and num_rows to read. If we are reading a matrix of shape (100,4) and row_offset=10, num_rows=20, it returns (20,4). @@ -157,25 +164,22 @@ class SequentialDataReader(DataReader): part_idx: It splits the input into num_parts and writes only part part_idx, where part_idx=1,...,num_parts. num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. """ __metaclass__ = ABCMeta def __init__( self, - file_path, - transform=None, - permissive=False, - part_idx=1, - num_parts=1, - split_by_key=False, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + part_idx: int = 1, + num_parts: int = 1, ): super().__init__(file_path, transform, permissive) self.lock = multiprocessing.Lock() self.part_idx = part_idx self.num_parts = num_parts - self.split_by_key = split_by_key def __iter__(self): """Needed to build an iterator, e.g.: @@ -217,7 +221,7 @@ def eof(self): return False @abstractmethod - def read_num_rows(self, num_records=0, assert_same_dim=True): + def read_num_rows(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -233,7 +237,7 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): pass @abstractmethod - def read_dims(self, num_records=0, assert_same_dim=True): + def read_dims(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -249,7 +253,7 @@ def read_dims(self, num_records=0, assert_same_dim=True): pass @abstractmethod - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -265,7 +269,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): pass @abstractmethod - def read(self, num_records=0, squeeze=False, offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -289,7 +299,12 @@ def read(self, num_records=0, squeeze=False, offset=0, num_rows=0): class RandomAccessDataReader(DataReader): __metaclass__ = ABCMeta - def __init__(self, file_path, transform=None, permissive=False): + def __init__( + self, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + ): """Abstract base class to read Ark or hdf5 feature files in random order. 
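Since `scp_sep` disappears from the factory signatures in the hunks that follow, a current-style call reduces to the rspecifier plus an optional transform. A sketch tying the pieces together (the `scp:`/`ark:` rspecifier syntax follows the Kaldi-style convention hyperion already uses; the file names are made up, and `TransformList.load` is assumed to behave as in the helper classes above):

```python
# Sketch: random-access feature reading with a preprocessing transform,
# using the factory updated below. TransformList now lives in hyperion.np.
from hyperion.io import RandomAccessDataReaderFactory as DRF
from hyperion.np.transforms import TransformList

transform = TransformList.load("lda_lnorm.h5")  # assumed transform file
reader = DRF.create("scp:xvector.scp", transform=transform)
x = reader.read(["spk1-utt1"], squeeze=True)
```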
@@ -304,7 +319,7 @@ def __init__(self, file_path, transform=None, permissive=False): super().__init__(file_path, transform, permissive) @abstractmethod - def read_num_rows(self, keys=None, assert_same_dim=True): + def read_num_rows(self, keys: Union[str, List[str]], assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -319,7 +334,7 @@ def read_num_rows(self, keys=None, assert_same_dim=True): pass @abstractmethod - def read_dims(self, keys=None, assert_same_dim=True): + def read_dims(self, keys: Union[str, List[str]], assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -334,7 +349,7 @@ def read_dims(self, keys=None, assert_same_dim=True): pass @abstractmethod - def read_shapes(self, keys=None, assert_same_dim=True): + def read_shapes(self, keys: Union[str, List[str]], assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -349,7 +364,13 @@ def read_shapes(self, keys=None, assert_same_dim=True): pass @abstractmethod - def read(self, keys, squeeze=False, offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str]], + squeeze: bool = False, + offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: diff --git a/hyperion/io/data_rw_factory.py b/hyperion/io/data_rw_factory.py index ed408156..092f5549 100644 --- a/hyperion/io/data_rw_factory.py +++ b/hyperion/io/data_rw_factory.py @@ -4,20 +4,23 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser +from typing import Union, Optional, List, Callable, Tuple + +from jsonargparse import ActionParser, ArgumentParser +import numpy as np from ..utils.kaldi_matrix import compression_methods -from .rw_specifiers import ArchiveType, WSpecifier, RSpecifier, WSpecType, RSpecType -from .h5_data_writer import H5DataWriter as H5DW -from .ark_data_writer import ArkDataWriter as ADW +from ..utils import PathLike +from .ark_data_reader import RandomAccessArkDataReader as RADR from .ark_data_reader import SequentialArkFileDataReader as SAFDR from .ark_data_reader import SequentialArkScriptDataReader as SASDR -from .ark_data_reader import RandomAccessArkDataReader as RADR -from .h5_data_writer import H5DataWriter as H5DW -from .h5_data_reader import SequentialH5FileDataReader as SH5FDR -from .h5_data_reader import SequentialH5ScriptDataReader as SH5SDR +from .ark_data_writer import ArkDataWriter as ADW from .h5_data_reader import RandomAccessH5FileDataReader as RH5FDR from .h5_data_reader import RandomAccessH5ScriptDataReader as RH5SDR +from .h5_data_reader import SequentialH5FileDataReader as SH5FDR +from .h5_data_reader import SequentialH5ScriptDataReader as SH5SDR +from .h5_data_writer import H5DataWriter as H5DW +from .rw_specifiers import ArchiveType, RSpecifier, RSpecType, WSpecifier, WSpecType class DataWriterFactory(object): @@ -26,7 +29,12 @@ class DataWriterFactory(object): """ @staticmethod - def create(wspecifier, compress=False, compression_method="auto", scp_sep=" "): + def create( + wspecifier: PathLike, + compress: bool = False, + compression_method: str = "auto", + metadata_columns: Optional[List[str]] = None, + ): if isinstance(wspecifier, str): wspecifier = WSpecifier.create(wspecifier) @@ -42,7 +50,7 @@ def create(wspecifier, compress=False, compression_method="auto", scp_sep=" "): flush=wspecifier.flush, compress=compress, compression_method=compression_method, - scp_sep=scp_sep, + 
metadata_columns=metadata_columns, ) else: return ADW( @@ -52,21 +60,20 @@ def create(wspecifier, compress=False, compression_method="auto", scp_sep=" "): flush=wspecifier.flush, compress=compress, compression_method=compression_method, - scp_sep=scp_sep, + metadata_columns=metadata_columns, ) @staticmethod def filter_args(**kwargs): - valid_args = ("scp_sep", "compress", "compression_method") + valid_args = ("compress", "compression_method") return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[PathLike] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) parser.add_argument("--compress", default=False, action="store_true") parser.add_argument( "--compression-method", default="auto", choices=compression_methods @@ -74,12 +81,11 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='data writer options') class SequentialDataReaderFactory(object): @staticmethod - def create(rspecifier, path_prefix=None, scp_sep=" ", **kwargs): + def create(rspecifier: PathLike, path_prefix: Optional[PathLike] = None, **kwargs): if isinstance(rspecifier, str): rspecifier = RSpecifier.create(rspecifier) @@ -91,27 +97,21 @@ def create(rspecifier, path_prefix=None, scp_sep=" ", **kwargs): return SAFDR(rspecifier.archive, **kwargs) else: if rspecifier.archive_type == ArchiveType.H5: - return SH5SDR(rspecifier.script, path_prefix, scp_sep=scp_sep, **kwargs) + return SH5SDR(rspecifier.script, path_prefix, **kwargs) else: - return SASDR(rspecifier.script, path_prefix, scp_sep=scp_sep, **kwargs) + return SASDR(rspecifier.script, path_prefix, **kwargs) @staticmethod def filter_args(**kwargs): - valid_args = ("scp_sep", "path_prefix", "part_idx", "num_parts") + valid_args = ("path_prefix", "part_idx", "num_parts") return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[PathLike] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") - try: - parser.add_argument( - "--scp-sep", default=" ", help=("scp file field separator") - ) - except: - pass parser.add_argument( "--path-prefix", default=None, help=("scp file_path prefix") ) @@ -138,7 +138,11 @@ def add_class_args(parser, prefix=None): class RandomAccessDataReaderFactory(object): @staticmethod - def create(rspecifier, path_prefix=None, transform=None, scp_sep=" "): + def create( + rspecifier: PathLike, + path_prefix: Optional[PathLike] = None, + transform: Optional[Callable[[np.array], np.array]] = None, + ): if isinstance(rspecifier, str): rspecifier = RSpecifier.create(rspecifier) logging.debug(rspecifier.__dict__) @@ -161,7 +165,6 @@ def create(rspecifier, path_prefix=None, transform=None, scp_sep=" "): path_prefix, transform=transform, permissive=rspecifier.permissive, - scp_sep=scp_sep, ) else: return RADR( @@ -169,26 +172,19 @@ def create(rspecifier, path_prefix=None, transform=None, scp_sep=" "): path_prefix, transform=transform, permissive=rspecifier.permissive, - scp_sep=scp_sep, ) @staticmethod def filter_args(**kwargs): - valid_args = ("scp_sep", "path_prefix") + valid_args = ("path_prefix",) return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser,
prefix=None): + def add_class_args(parser, prefix: Optional[PathLike] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") - try: - parser.add_argument( - "--scp-sep", default=" ", help=("scp file field separator") - ) - except: - pass parser.add_argument( "--path-prefix", default=None, help=("scp file_path prefix") ) diff --git a/hyperion/io/data_writer.py b/hyperion/io/data_writer.py index cf2bb4f9..ff35ef2a 100644 --- a/hyperion/io/data_writer.py +++ b/hyperion/io/data_writer.py @@ -5,9 +5,14 @@ import os from abc import ABCMeta, abstractmethod +from typing import Union, Optional, List, Dict +from pathlib import Path +import numpy as np +import pandas as pd +from ..utils import PathLike -class DataWriter(object): +class DataWriter: """Abstract base class to write Ark or hdf5 feature files. Attributes: @@ -19,35 +24,42 @@ class DataWriter(object): {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 1byte-unsigned-integer, 1byte-0-1}. - scp_sep: Separator for scp files (default ' '). """ __metaclass__ = ABCMeta def __init__( self, - archive_path, - script_path=None, - flush=False, - compress=False, - compression_method="auto", - scp_sep=" ", + archive_path: PathLike, + script_path: Optional[PathLike] = None, + flush: bool = False, + compress: bool = False, + compression_method: str = "auto", + metadata_columns: Optional[List[str]] = None, ): - self.archive_path = archive_path - self.script_path = script_path + self.archive_path = Path(archive_path) + self.script_path = Path(script_path) if script_path is not None else None self._flush = flush self.compress = compress self.compression_method = compression_method - self.scp_sep = scp_sep + self.metadata_columns = metadata_columns - archive_dir = os.path.dirname(archive_path) - if not os.path.exists(archive_dir): - os.makedirs(archive_dir) + archive_dir = self.archive_path.parent + archive_dir.mkdir(exist_ok=True, parents=True) + self.script_is_scp = False + self.script_sep = None + self.f_script = None if script_path is not None: - script_dir = os.path.dirname(script_path) - if not os.path.exists(script_dir): - os.makedirs(script_dir) + self.script_path.parent.mkdir(exist_ok=True, parents=True) + script_ext = self.script_path.suffix + self.script_is_scp = script_ext == ".scp" + + if self.script_is_scp: + self.f_script = open(self.script_path, "w") + else: + self.script_sep = "," if script_ext == ".csv" else "\t" + self.f_script = open(self.script_path, "w", encoding="utf-8") def __enter__(self): """Function required when entering constructions of type @@ -76,8 +88,38 @@ def flush(self): """Flushes the file""" pass + def standardize_write_args( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, + ): + if isinstance(keys, str): + keys = [keys] + data = [data] + + if metadata is not None: + if isinstance(metadata, pd.DataFrame): + metadata = metadata.to_dict() + + metadata_list = [] + for c in self.metadata_columns: + m_c = metadata[c] + if not isinstance(m_c, (list, np.ndarray)): + m_c = [m_c] + metadata_list.append(m_c) + + metadata = metadata_list + + return keys, data, metadata + @abstractmethod - def write(self, key, data): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, + ): """Writes data to file.
Args: @@ -86,5 +128,6 @@ def write(self, key, data): If all the matrices have the same dimension it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. + metadata: dictionary/DataFrame with metadata """ pass diff --git a/hyperion/io/h5_data_reader.py b/hyperion/io/h5_data_reader.py index 7ade2549..d509504d 100644 --- a/hyperion/io/h5_data_reader.py +++ b/hyperion/io/h5_data_reader.py @@ -5,21 +5,29 @@ Classes to read data from hdf5 files. """ -import sys +import multiprocessing import time -import numpy as np +from typing import Union, Optional, List, Callable, Tuple + import h5py -import multiprocessing +import numpy as np from ..hyp_defs import float_cpu -from ..utils.list_utils import split_list, split_list_group_by_key -from ..utils.scp_list import SCPList -from ..utils.kaldi_matrix import KaldiMatrix, KaldiCompressedMatrix from ..utils.kaldi_io_funcs import is_token -from .data_reader import SequentialDataReader, RandomAccessDataReader +from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix +from ..utils.list_utils import split_list, split_list_group_by_key +# from ..utils.scp_list import SCPList +from ..utils import FeatureSet, PathLike +from .data_reader import RandomAccessDataReader, SequentialDataReader -def _read_h5_data(dset, row_offset=0, num_rows=0, transform=None): + +def _read_h5_data( + dset, + row_offset: int = 0, + num_rows: int = 0, + transform: Optional[Callable[[np.array], np.array]] = None, +): """Auxiliary function to read the feature matrix from hdf5 dataset. It decompresses the data if it was compressed. @@ -73,7 +81,7 @@ class SequentialH5DataReader(SequentialDataReader): split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, **kwargs) self.f = None self.cur_file = None @@ -85,7 +93,7 @@ def close(self): self.f.close() self.f = None - def _open_archive(self, file_path): + def _open_archive(self, file_path: PathLike): """Opens the hdf5 file where the next matrix/vector is if it is not open. If there was another hdf5 file open, it closes it. @@ -95,7 +103,7 @@ def _open_archive(self, file_path): self.cur_file = file_path self.f = h5py.File(file_path, "r") - def read_num_rows(self, num_records=0, assert_same_dim=True): + def read_num_rows(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -112,7 +120,7 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return keys, num_rows - def read_dims(self, num_records=0, assert_same_dim=True): + def read_dims(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -146,7 +154,7 @@ class SequentialH5FileDataReader(SequentialH5DataReader): split_by_key: If True, all the elements with the same key go to the same part. 
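Example of use (a minimal sketch; feats.h5 stands for an existing hdf5 feature archive):

    r = SequentialH5FileDataReader('feats.h5')
    while not r.eof():
        keys, data = r.read(num_records=1)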
""" - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, permissive=False, **kwargs) self._open_archive(self.file_path) self._keys = list(self.f.keys()) @@ -171,7 +179,7 @@ def eof(self): """Returns True when it reaches the end of the ark file.""" return self.cur_item == len(self._keys) - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -203,7 +211,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -224,12 +238,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if num_records == 0: num_records = len(self._keys) - self.cur_item - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] with self.lock: @@ -267,7 +277,6 @@ class SequentialH5ScriptDataReader(SequentialH5DataReader): the scp file. This is useful when data is read from a different directory of that it was created. - scp_sep: Separator for scp files (default ' '). transform: TransformList object, applies a transformation to the features after reading them from disk. part_idx: It splits the input into num_parts and writes only @@ -276,20 +285,20 @@ class SequentialH5ScriptDataReader(SequentialH5DataReader): split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): + def __init__( + self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs + ): super().__init__(file_path, permissive=False, **kwargs) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if self.num_parts > 1: - self.scp = self.scp.split( - self.part_idx, self.num_parts, group_by_key=self.split_by_key - ) + self.feature_set = self.feature_set.split(self.part_idx, self.num_parts) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) @property def keys(self): - return self.scp.key + return self.feature_set["id"] def reset(self): """Closes all the open hdf5 files and puts the read pointer pointing @@ -299,9 +308,9 @@ def reset(self): def eof(self): """Returns True when all the elements in the scp have been read.""" - return self.cur_item == len(self.scp) + return self.cur_item == len(self.feature_set) - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -315,7 +324,7 @@ def read_shapes(self, num_records=0, assert_same_dim=True): List of tuples with num_records shapes. 
""" if num_records == 0: - num_records = len(self.scp) - self.cur_item + num_records = len(self.feature_set) - self.cur_item keys = [] shapes = [] @@ -323,14 +332,15 @@ def read_shapes(self, num_records=0, assert_same_dim=True): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) - - self._open_archive(file_path) + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + self._open_archive(feature_spec["storage_path"]) shape_i = self.f[key].shape - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) keys.append(key) shapes.append(shape_i) @@ -342,7 +352,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -361,14 +377,10 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): data: List of feature matrices/vectors or 3D/2D numpy array. """ if num_records == 0: - num_records = len(self.scp) - self.cur_item + num_records = len(self.feature_set) - self.cur_item - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] @@ -377,7 +389,13 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + file_path = feature_spec["storage_path"] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows @@ -412,11 +430,18 @@ class RandomAccessH5DataReader(RandomAccessDataReader): it returns an empty matrix, if False it raises an exception. """ - def __init__(self, file_path, transform=None, permissive=False): + def __init__( + self, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + ): super().__init__(file_path, transform, permissive) self.f = None - def read_num_rows(self, keys, assert_same_dim=True): + def read_num_rows( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -432,7 +457,9 @@ def read_num_rows(self, keys, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return num_rows - def read_dims(self, keys, assert_same_dim=True): + def read_dims( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of columns in the feature matrices of the dataset. 
Args: @@ -462,7 +489,7 @@ class RandomAccessH5FileDataReader(RandomAccessH5DataReader): it returns an empty matrix, if False it raises an exception. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, **kwargs) self.lock = multiprocessing.Lock() self._open_archive(file_path) @@ -473,7 +500,7 @@ def close(self): self.f.close() self.f = None - def _open_archive(self, file_path): + def _open_archive(self, file_path: PathLike): """Opens the hdf5 file if it is not open.""" if self.f is None: self.close() @@ -483,7 +510,9 @@ def _open_archive(self, file_path): def keys(self): return list(self.f.keys()) - def read_shapes(self, keys, assert_same_dim=True): + def read_shapes( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the shapes in the feature matrices of the dataset. Args: @@ -517,7 +546,13 @@ def read_shapes(self, keys, assert_same_dim=True): return shapes - def read(self, keys, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str], np.array], + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: @@ -538,12 +573,8 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: @@ -588,17 +619,20 @@ class RandomAccessH5ScriptDataReader(RandomAccessH5DataReader): features after reading them from disk. permissive: If True, if the data that we want to read is not in the file it returns an empty matrix, if False it raises an exception. - scp_sep: Separator for scp files (default ' '). """ - def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): + def __init__( + self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs + ): super().__init__(file_path, **kwargs) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) - archives, archive_idx = np.unique(self.scp.file_path, return_inverse=True) + archives, archive_idx = np.unique( + self.feature_set["storage_path"], return_inverse=True + ) self.archives = archives self.archive_idx = archive_idx self.f = [None] * len(self.archives) @@ -613,9 +647,9 @@ def close(self): @property def keys(self): - return self.scp.key + return self.feature_set["id"] - def _open_archive(self, key_idx): + def _open_archive(self, key_idx: int): """Opens the hdf5 file corresponding to a given feature/matrix if it is not already open. @@ -632,7 +666,9 @@ def _open_archive(self, key_idx): return self.f[archive_idx], self.locks[archive_idx] - def read_shapes(self, keys, assert_same_dim=True): + def read_shapes( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the shapes in the feature matrices of the dataset.
Args: @@ -650,18 +686,15 @@ def read_shapes(self, keys, assert_same_dim=True): shapes = [] for key in keys: - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: shapes.append((0,)) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) - + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] f, lock = self._open_archive(index) if not (key in f): if self.permissive: @@ -672,8 +705,12 @@ def read_shapes(self, keys, assert_same_dim=True): with lock: shape_i = f[key].shape - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) - # print('%s %d %.2f' % (key,time.time()-t1, len(shapes)/len(keys)*100.)) + + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + shapes.append(shape_i) if assert_same_dim: @@ -682,7 +719,13 @@ def read_shapes(self, keys, assert_same_dim=True): return shapes - def read(self, keys, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str], np.array], + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: @@ -703,12 +746,8 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: @@ -717,15 +756,19 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): data = [] for i, key in enumerate(keys): - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: data.append(np.array([], dtype=float_cpu())) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows diff --git a/hyperion/io/h5_data_writer.py b/hyperion/io/h5_data_writer.py index 0685d9b8..4d05f963 100644 --- a/hyperion/io/h5_data_writer.py +++ b/hyperion/io/h5_data_writer.py @@ -3,14 +3,16 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import numpy as np +from typing import Union, Optional, List, Dict + import h5py +import numpy as np +import pandas as pd from ..hyp_defs import float_save -from ..utils.scp_list import SCPList -from ..utils.kaldi_matrix import KaldiMatrix, KaldiCompressedMatrix from ..utils.kaldi_io_funcs import is_token +from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix +from ..utils import PathLike from .data_writer import DataWriter @@ -26,18 +28,21 @@ class 
H5DataWriter(DataWriter): {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 1byte-unsigned-integer, 1byte-0-1}. - scp_sep: Separator for scp files (default ' '). """ - def __init__(self, archive_path, script_path=None, **kwargs): + def __init__( + self, archive_path: PathLike, script_path: Optional[PathLike] = None, **kwargs + ): super().__init__(archive_path, script_path, **kwargs) self.f = h5py.File(archive_path, "w") - if script_path is None: - self.f_script = None - else: - self.f_script = open(script_path, "w") + if script_path is not None and not self.script_is_scp: + columns = ["id", "storage_path"] + if self.metadata_columns is not None: + columns += self.metadata_columns + row = self.script_sep.join(columns) + self.f_script.write(f"{row}\n") def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from constructions of type @@ -63,7 +68,7 @@ def flush(self): if self.f_script is not None: self.f_script.flush() - def _convert_data(self, data): + def _convert_data(self, data: np.array): """Converts data to the format for saving. Compresses the data if needed. Args: @@ -84,7 +89,12 @@ def _convert_data(self, data): else: raise ValueError("Data is not ndarray") - def write(self, keys, data): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, + ): """Writes data to file. Args: @@ -94,9 +104,7 @@ def write(self, keys, data): it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. """ - if isinstance(keys, str): - keys = [keys] - data = [data] + keys, data, metadata = self.standardize_write_args(keys, data, metadata) for i, key_i in enumerate(keys): assert is_token(key_i), "Token %s not valid" % key_i @@ -107,9 +115,15 @@ def write(self, keys, data): dset.attrs[k] = v if self.f_script is not None: - self.f_script.write( - "%s%s%s\n" % (key_i, self.scp_sep, self.archive_path) - ) + if self.script_is_scp: + self.f_script.write(f"{key_i} {self.archive_path}\n") + else: + columns = [key_i, str(self.archive_path)] + if metadata is not None: + metadata_i = [str(m[i]) for m in metadata] + columns += metadata_i + row = self.script_sep.join(columns) + self.f_script.write(f"{row}\n") if self._flush: self.flush() diff --git a/hyperion/io/h5_merger.py b/hyperion/io/h5_merger.py index f1b408e7..3e73608e 100644 --- a/hyperion/io/h5_merger.py +++ b/hyperion/io/h5_merger.py @@ -4,6 +4,7 @@ """ import sys + import numpy as np from .hyp_data_reader import HypDataReader as HR diff --git a/hyperion/io/hyp_data_reader.py b/hyperion/io/hyp_data_reader.py index 9219187a..63d463fb 100644 --- a/hyperion/io/hyp_data_reader.py +++ b/hyperion/io/hyp_data_reader.py @@ -4,11 +4,12 @@ """ import sys -import numpy as np + import h5py +import numpy as np from ..hyp_defs import float_cpu -from ..utils.list_utils import list2ndarray, ismember +from ..utils.list_utils import ismember, list2ndarray class HypDataReader(object): @@ -75,9 +76,8 @@ def read_random_slice(self, key, num_samples, rng, field=""): dataset = key + field assert dataset in self.f, "Dataset %s not found" % dataset num_rows = self.f[dataset].shape[0] - # print('hola',num_rows,num_samples,num_rows-num_samples) - # index = rng.random_integers(low=0, high=num_rows-num_samples, size=1)[0] - index = rng.randint(low=0, high=num_rows - num_samples + 1) + + index = rng.integers(low=0, high=num_rows - num_samples + 1) X = self.f[dataset][index : index + num_samples] return X, index
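The net effect of the data_writer/h5_data_writer changes above: a `.scp` script keeps the old Kaldi format, while a `.csv`/`.tsv` script gets a header with `id,storage_path` plus any `metadata_columns`. A minimal usage sketch (assuming the new `csv` wspecifier option combines with `h5` the same way `scp` did; the paths and utterance ids are placeholders):

```python
import numpy as np

from hyperion.io.data_rw_factory import DataWriterFactory

# Write two feature matrices to feats.h5 and index them in feats.csv;
# the csv script begins with the header row "id,storage_path".
x1 = np.random.randn(100, 40)
x2 = np.random.randn(80, 40)
with DataWriterFactory.create("h5,csv:feats.h5,feats.csv") as writer:
    writer.write(["utt1", "utt2"], [x1, x2])
```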
diff --git a/hyperion/io/hyp_data_writer.py b/hyperion/io/hyp_data_writer.py index 9a5b5906..81ad2501 100644 --- a/hyperion/io/hyp_data_writer.py +++ b/hyperion/io/hyp_data_writer.py @@ -4,11 +4,12 @@ """ import sys -import numpy as np + import h5py +import numpy as np from ..hyp_defs import float_save -from ..utils.list_utils import list2ndarray, ismember +from ..utils.list_utils import ismember, list2ndarray class HypDataWriter(object): diff --git a/hyperion/io/int32_writer.py b/hyperion/io/int32_writer.py index c823dc0e..d881fb16 100644 --- a/hyperion/io/int32_writer.py +++ b/hyperion/io/int32_writer.py @@ -12,4 +12,4 @@ class Int32Writer(DataWriter): """Class to write data to int32 files.""" def __init__(self, wspecifier): - super(Int32Writer, self).__init__(wspecifier) + super().__init__(wspecifier) diff --git a/hyperion/io/kaldi_data_reader.py b/hyperion/io/kaldi_data_reader.py index 712941ec..60b55bfd 100644 --- a/hyperion/io/kaldi_data_reader.py +++ b/hyperion/io/kaldi_data_reader.py @@ -3,9 +3,12 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import gzip +import re +import struct import sys -import gzip, struct, re from collections import OrderedDict + import numpy as np from ..hyp_defs import float_cpu @@ -133,7 +136,7 @@ def _read_ascii_matrix(f): while 1: line = f.readline() if len(line) == 0: - raise BadInputFormat # eof, should not happen! + raise ValueError() # eof, should not happen! if len(line.strip()) == 0: continue # skip empty line arr = line.strip().split() diff --git a/hyperion/io/old_audio_reader.py b/hyperion/io/old_audio_reader.py new file mode 100644 index 00000000..341f04a4 --- /dev/null +++ b/hyperion/io/old_audio_reader.py @@ -0,0 +1,477 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import io +import logging +import math +import os +import subprocess + +import numpy as np +import soundfile as sf +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ..hyp_defs import float_cpu +from ..utils import SCPList, SegmentList + +valid_ext = [ + ".wav", + ".flac", + ".ogg", + ".au", + ".avr", + ".caf", + ".htk", + ".iff", + ".mat", + ".mpc", + ".oga", + ".pvf", + ".rf64", + ".sd2", + ".sds", + ".sf", + ".voc", + ".w64", + ".wve", + ".xi", +] + + +class AudioReader(object): + """Class to read audio files from wav, flac or pipe + + Attributes: + file_path: scp file with format file_key wavspecifier (audio_file/pipe) or SCPList object.
+ segments_path: segments file with format: segment_id file_id tbeg tend + wav_scale: multiplies signal by scale factor + """ + + def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1): + self.file_path = file_path + if isinstance(file_path, SCPList): + self.scp = file_path + else: + self.scp = SCPList.load(file_path, sep=" ", is_wav=True) + + self.segments_path = segments_path + if segments_path is None: + self.segments = None + self.with_segments = False + else: + self.with_segments = True + if isinstance(segments_path, SegmentList): + self.segments = segments_path + else: + self.segments = SegmentList.load( + segments_path, sep=" ", index_by_file=False + ) + + self.wav_scale = wav_scale + + @property + def keys(self): + if self.with_segments: + return np.asarray(self.segments["segment_id"]) + return self.scp.key + + def __enter__(self): + """Function required when entering constructions of type + + with AudioReader('file.h5') as f: + keys, data = f.read() + """ + return self + + def __exit__(self, exc_type, exc_value, traceback): + """Function required when exiting from constructions of type + + with AudioReader('file.h5') as f: + keys, data = f.read() + """ + pass + + @staticmethod + def read_wavspecifier(wavspecifier, scale=2 ** 15, time_offset=0, time_dur=0): + """Reads an audiospecifier (audio_file/pipe) + It reads from pipe or from all the files that can be read + by `libsndfile` + + Args: + wavspecifier: A pipe, wav, flac, ogg file etc. + scale: Multiplies signal by scale factor + time_offset: float indicating the start time to read in the utterance. + time_dur: float indicating the number of seconds to read from the utterance, + if 0 it reads until the end + + """ + wavspecifier = wavspecifier.strip() + if wavspecifier[-1] == "|": + wavspecifier = wavspecifier[:-1] + x, fs = AudioReader.read_pipe(wavspecifier, scale) + if time_offset == 0 and time_dur == 0: + return x, fs + + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + if num_samples == 0: + return x[start_sample:], fs + + end_sample = start_sample + num_samples + assert end_sample <= len(x) + return x[start_sample:end_sample], fs + + ext = os.path.splitext(wavspecifier)[1] + if ext in valid_ext: + if time_offset == 0 and time_dur == 0: + x, fs = sf.read(wavspecifier, dtype=float_cpu()) + x *= scale + return x, fs + + with sf.SoundFile(wavspecifier, "r") as f: + fs = f.samplerate + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + f.seek(start_sample) + if num_samples > 0: + x = scale * f.read(num_samples, dtype=float_cpu()) + else: + x = scale * f.read(dtype=float_cpu()) + return x, fs + + raise Exception("Unknown format for %s" % (wavspecifier)) + + @staticmethod + def read_pipe(wavspecifier, scale=2 ** 15): + """Reads wave file from a pipe + Args: + wavspecifier: Shell command with pipe output + scale: Multiplies signal by scale factor + """ + # proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE) + pipe = proc.communicate()[0] + if proc.returncode != 0: + raise Exception( + "Wave read pipe command %s returned code %d" + % (wavspecifier, proc.returncode) + ) + x, fs = sf.read(io.BytesIO(pipe), dtype=float_cpu()) + x *= scale + return x, fs + + def _read_segment(self, segment, time_offset=0, time_dur=0): + """Reads a wave segment + + Args: + segment: pandas DataFrame (segment_id, file_id,
tbeg, tend) + Returns: + Wave, sampling frequency + """ + file_id = segment["file_id"] + t_beg = segment["tbeg"] + time_offset + t_end = segment["tend"] + if time_dur > 0: + t_end_new = t_beg + time_dur + assert t_end_new <= t_end + t_end = t_end_new + + file_path, _, _ = self.scp[file_id] + x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale) + num_samples_i = len(x_i) + s_beg = int(t_beg * fs_i) + if s_beg >= num_samples_i: + raise Exception( + "segment %s tbeg=%.2f (num_sample=%d) longer than wav file %s (num_samples=%d)" + % (file_id, t_beg, s_beg, file_id, num_samples_i) + ) + + s_end = int(t_end * fs_i) + if s_end > num_samples_i or t_end < 0: + s_end = num_samples_i + + x_i = x_i[s_beg:s_end] + return x_i, fs_i + + def read(self): + pass + + +class SequentialAudioReader(AudioReader): + def __init__( + self, + file_path, + segments_path=None, + wav_scale=2 ** 15 - 1, + part_idx=1, + num_parts=1, + ): + super().__init__(file_path, segments_path, wav_scale=wav_scale) + self.cur_item = 0 + self.part_idx = part_idx + self.num_parts = num_parts + if self.num_parts > 1: + if self.with_segments: + self.segments = self.segments.split(self.part_idx, self.num_parts) + else: + self.scp = self.scp.split( + self.part_idx, self.num_parts, group_by_key=False + ) + + def __iter__(self): + """Needed to build an iterator, e.g.: + r = SequentialAudioReader(...) + for key, s, fs in r: + print(key) + process(s) + """ + return self + + def __next__(self): + """Needed to build an iterator, e.g.: + r = SequentialAudioReader(...) + for key, s, fs in r: + process(s) + """ + key, x, fs = self.read(1) + if len(key) == 0: + raise StopIteration + return key[0], x[0], fs[0] + + def next(self): + """__next__ for Python 2""" + return self.__next__() + + def reset(self): + """Returns the file pointer to the beginning of the dataset, + then we can start reading the features again. + """ + self.cur_item = 0 + + def eof(self): + """End of file. + + Returns: + True, when we have read all the recordings in the dataset. + """ + if self.with_segments: + return self.cur_item == len(self.segments) + return self.cur_item == len(self.scp) + + def read(self, num_records=0, time_offset=0, time_durs=0): + """Reads next num_records audio files + + Args: + num_records: Number of audio files to read. + time_offset: List of floats indicating the start time to read in the utterance. + time_durs: List of floats indicating the number of seconds to read from each utterance + + Returns: + key: List of recording names.
+ data: List of waveforms + fs: list of sample freqs + """ + if num_records == 0: + if self.with_segments: + num_records = len(self.segments) - self.cur_item + else: + num_records = len(self.scp) - self.cur_item + + offset_is_list = isinstance(time_offset, (list, np.ndarray)) + dur_is_list = isinstance(time_durs, (list, np.ndarray)) + + keys = [] + data = [] + fs = [] + for i in range(num_records): + if self.eof(): + break + + offset_i = time_offset[i] if offset_is_list else time_offset + dur_i = time_durs[i] if dur_is_list else time_durs + + if self.with_segments: + segment = self.segments[self.cur_item] + key = segment["segment_id"] + x_i, fs_i = self._read_segment(segment, offset_i, dur_i) + else: + key, file_path, _, _ = self.scp[self.cur_item] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) + + keys.append(key) + data.append(x_i) + fs.append(fs_i) + self.cur_item += 1 + + return keys, data, fs + + @staticmethod + def filter_args(**kwargs): + valid_args = ("part_idx", "num_parts", "wav_scale") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--wav-scale", + default=2 ** 15 - 1, + type=float, + help=("multiplicative factor for waveform"), + ) + try: + parser.add_argument( + "--part-idx", + type=int, + default=1, + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), + ) + parser.add_argument( + "--num-parts", + type=int, + default=1, + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), + ) + except: + pass + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + + add_argparse_args = add_class_args + + +class RandomAccessAudioReader(AudioReader): + def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1): + super().__init__(file_path, segments_path, wav_scale) + + def _read(self, keys, time_offset=0, time_durs=0): + """Reads the waveforms for the recordings in keys. + + Args: + keys: List of recording/segment_ids names. + + Returns: + data: List of waveforms + """ + if isinstance(keys, str): + keys = [keys] + + offset_is_list = isinstance(time_offset, (list, np.ndarray)) + dur_is_list = isinstance(time_durs, (list, np.ndarray)) + + data = [] + fs = [] + for i, key in enumerate(keys): + + offset_i = time_offset[i] if offset_is_list else time_offset + dur_i = time_durs[i] if dur_is_list else time_durs + + if self.with_segments: + if not (key in self.segments): + raise Exception("Key %s not found" % key) + + segment = self.segments[key] + x_i, fs_i = self._read_segment(segment, offset_i, dur_i) + else: + if not (key in self.scp): + raise Exception("Key %s not found" % key) + + file_path, _, _ = self.scp[key] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) + + data.append(x_i) + fs.append(fs_i) + + return data, fs + + def read(self, keys, time_offset=0, time_durs=0): + """Reads the waveforms for the recordings in keys. + + Args: + keys: List of recording/segment_ids names. + + Returns: + data: List of waveforms + fs: List of sampling freq. 
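+
+ Example (a minimal sketch; wav.scp maps keys to audio files or pipes):
+
+ r = RandomAccessAudioReader('wav.scp')
+ x, fs = r.read(['utt1'], time_offset=1.0, time_durs=2.0)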
+ """ + try: + x, fs = self._read(keys, time_offset=time_offset, time_durs=time_durs) + except: + if isinstance(keys, str): + keys = [keys] + + if not isinstance(time_offset, (list, np.ndarray)): + time_offset = [time_offset] * len(keys) + if not isinstance(time_durs, (list, np.ndarray)): + time_durs = [time_durs] * len(keys) + + try: + # some files produce error in the fseek after reading the data, + # this seems an issue from pysoundfile or soundfile lib itself + # we try to read from + # time-offset to the end of the file, and remove the extra frames later, + # this solves the problem in most cases + logging.info( + ( + "error-1 reading at keys={} offset={} " + "retrying reading until end-of-file ..." + ).format(keys, time_offset) + ) + x, fs = self._read(keys, time_offset=time_offset) + for i in range(len(x)): + end_sample = int(time_durs[i] * fs[i]) + x[i] = x[i][:end_sample] + except: + # try to read the full file + logging.info( + ( + "error-2 reading at key={}, " "retrying reading full file ..." + ).format(keys) + ) + x, fs = self._read(keys) + for i in range(len(x)): + start_sample = int(time_offset[i] * fs[i]) + end_sample = start_sample + int(time_durs[i] * fs[i]) + x[i] = x[i][start_sample:end_sample] + + return x, fs + + @staticmethod + def filter_args(**kwargs): + valid_args = ("wav_scale",) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--wav-scale", + default=2 ** 15 - 1, + type=float, + help=("multiplicative factor for waveform"), + ) + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + + add_argparse_args = add_class_args diff --git a/hyperion/io/packed_audio_reader.py b/hyperion/io/packed_audio_reader.py index 61ebbd65..fb17cb18 100644 --- a/hyperion/io/packed_audio_reader.py +++ b/hyperion/io/packed_audio_reader.py @@ -2,15 +2,15 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import soundfile as sf - -import time -import math import logging -import numpy as np +import math import multiprocessing +import time from copy import deepcopy +import numpy as np +import soundfile as sf + from ..hyp_defs import float_cpu from ..utils import SCPList, SegmentList @@ -378,7 +378,8 @@ def add_class_args(parser, prefix=None): parser.add_argument( p1 + "wav-scale", - default=2 ** 15 - 1, + default=1.0, + # default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) @@ -633,7 +634,8 @@ def add_class_args(parser, prefix=None): parser.add_argument( p1 + "wav-scale", - default=2 ** 15, + default=1.0, + # default=2 ** 15, type=float, help=("multiplicative factor for waveform"), ) diff --git a/hyperion/io/packed_audio_writer.py b/hyperion/io/packed_audio_writer.py index 3a15227a..ceda0d90 100644 --- a/hyperion/io/packed_audio_writer.py +++ b/hyperion/io/packed_audio_writer.py @@ -4,9 +4,9 @@ """ import os import re -import soundfile as sf import numpy as np +import soundfile as sf from ..utils.kaldi_io_funcs import is_token from .audio_reader import valid_ext diff --git a/hyperion/io/rw_specifiers.py b/hyperion/io/rw_specifiers.py index 37f579b4..60e01ef1 100644 --- a/hyperion/io/rw_specifiers.py +++ b/hyperion/io/rw_specifiers.py @@ -7,6 +7,8 @@ import re from enum import Enum +from pathlib import Path +import pandas as pd class 
ArchiveType(Enum): @@ -174,6 +176,11 @@ def create(cls, wspecifier): archive_type = ArchiveType.AUDIO archive = archives[cur_archive] cur_archive += 1 + elif option == "csv": + assert script is None, "Repeated csv in wspecifier %s" % script + assert len(archives) > cur_archive + script = archives[cur_archive] + cur_archive += 1 elif option == "scp": assert script is None, "Repeated scp in wspecifier %s" % script assert len(archives) > cur_archive @@ -332,7 +339,7 @@ def create(cls, rspecifier): assert len(archives) == 1 spec_type = None - archive = archives[0] + archive = Path(archives[0]) archive_type = None once = False is_sorted = False @@ -361,6 +368,9 @@ def create(cls, rspecifier): assert spec_type is None spec_type = RSpecType.ARCHIVE archive_type = ArchiveType.RTTM + elif option == "csv": + assert spec_type is None + spec_type = RSpecType.SCRIPT elif option == "scp": assert spec_type is None spec_type = RSpecType.SCRIPT @@ -374,24 +384,31 @@ def create(cls, rspecifier): assert spec_type is not None, "Wrong wspecifier options %s" % fields[0] if spec_type == RSpecType.SCRIPT: - with open(archive, "r") as f: - scp_f2 = f.readline().strip().split(" ")[1] - if re.match(r".*\.h5(?:.[0-9]+:[0-9]+.)?$", scp_f2) is not None: + if archive.suffix == ".csv": + df = pd.read_csv(archive, nrows=2) + storage_path = df["storage_path"].values[0] + if re.match(r".*\.h5$", storage_path) is not None: archive_type = ArchiveType.H5 - elif re.match(r".*\.ark:.*$", scp_f2) is not None: + elif re.match(r".*\.ark$", storage_path) is not None: archive_type = ArchiveType.ARK - elif ( - re.match(r".*[cvg]:[0-9]+.[0-9]+:[0-9]+.$", scp_f2) is not None - ): + elif re.match(r".*[cvg]$", storage_path) is not None: archive_type = ArchiveType.AUDIO else: - archive_type = ArchiveType.ARK - - # .split('[')[0].split(':') - # if len(scp) == 1: - # archive_type = ArchiveType.H5 - # else: - # archive_type = ArchiveType.ARK + raise ValueError(f"Unknown format for {storage_path}") + else: + with open(archive, "r") as f: + scp_f2 = f.readline().strip().split(" ")[1] + if re.match(r".*\.h5(?:.[0-9]+:[0-9]+.)?$", scp_f2) is not None: + archive_type = ArchiveType.H5 + elif re.match(r".*\.ark:.*$", scp_f2) is not None: + archive_type = ArchiveType.ARK + elif ( + re.match(r".*[cvg]:[0-9]+.[0-9]+:[0-9]+.$", scp_f2) + is not None + ): + archive_type = ArchiveType.AUDIO + else: + archive_type = ArchiveType.ARK if archive_type == ArchiveType.ARK: for option in options: diff --git a/hyperion/io/segment_vad_reader.py b/hyperion/io/segment_vad_reader.py index df8d39e5..01bf413e 100644 --- a/hyperion/io/segment_vad_reader.py +++ b/hyperion/io/segment_vad_reader.py @@ -3,13 +3,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging + import numpy as np from ..hyp_defs import float_cpu from ..utils import SegmentList from ..utils.vad_utils import vad_timestamps_to_bin -from .vad_reader import VADReader from .data_reader import DataReader +from .vad_reader import VADReader class SegmentVADReader(VADReader): diff --git a/hyperion/io/vad_reader.py b/hyperion/io/vad_reader.py index c56a8ffe..40e2dda2 100644 --- a/hyperion/io/vad_reader.py +++ b/hyperion/io/vad_reader.py @@ -4,6 +4,7 @@ """ import logging + import numpy as np from ..hyp_defs import float_cpu diff --git a/hyperion/io/vad_rw_factory.py b/hyperion/io/vad_rw_factory.py index 7b855b07..fff1ab4a 100644 --- a/hyperion/io/vad_rw_factory.py +++ b/hyperion/io/vad_rw_factory.py @@ -5,8 +5,8 @@ import logging -from .rw_specifiers import ArchiveType, WSpecifier, 
RSpecifier, WSpecType, RSpecType from .bin_vad_reader import BinVADReader as BVR +from .rw_specifiers import ArchiveType, RSpecifier, RSpecType, WSpecifier, WSpecType from .segment_vad_reader import SegmentVADReader as SVR @@ -15,7 +15,6 @@ class VADReaderFactory(object): def create( rspecifier, path_prefix=None, - scp_sep=" ", frame_length=25, frame_shift=10, snip_edges=False, @@ -32,7 +31,6 @@ def create( return BVR( rspecifier, path_prefix, - scp_sep, frame_length=frame_length, frame_shift=frame_shift, snip_edges=snip_edges, @@ -47,7 +45,6 @@ def create( return BVR( rspecifier, path_prefix, - scp_sep, frame_length=frame_length, frame_shift=frame_shift, snip_edges=snip_edges, @@ -56,7 +53,6 @@ def create( @staticmethod def filter_args(**kwargs): valid_args = ( - "scp_sep", "path_prefix", "frame_shift", "frame_length", @@ -71,9 +67,6 @@ def add_class_args(parser, prefix=None): else: p1 = "--" + prefix + "." - parser.add_argument( - p1 + "scp-sep", default=" ", help=("scp file field separator") - ) parser.add_argument( p1 + "path-prefix", default=None, help=("scp file_path prefix") ) diff --git a/hyperion/metrics/__init__.py b/hyperion/metrics/__init__.py deleted file mode 100644 index 6725621a..00000000 --- a/hyperion/metrics/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -from .utils import effective_prior -from .acc import compute_accuracy -from .confusion_matrix import * -from .eer import compute_eer, compute_prbep -from .dcf import compute_dcf, compute_min_dcf, compute_act_dcf, fast_eval_dcf_eer diff --git a/hyperion/metrics/utils.py b/hyperion/metrics/utils.py deleted file mode 100644 index 8a764c3d..00000000 --- a/hyperion/metrics/utils.py +++ /dev/null @@ -1,149 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - - Utility functions to evaluate performance -""" - -import numpy as np - -from ..hyp_defs import float_cpu - - -def effective_prior(p_tar, c_miss, c_fa): - """This function adjusts a given prior probability of target p_targ, - to incorporate the effects of a cost of miss, cmiss, and a cost of false-alarm, cfa. - - Args: - p_tar: target prior - c_miss: cost of miss - c_fa: cost of false alarm - Returns: - Effective prior - - """ - beta = p_tar * c_miss / (1 - p_tar) / c_fa - p_eff = beta / (1 + beta) - return p_eff - - -def pavx(y): - """PAV: Pool Adjacent Violators algorithm. Non-paramtetric optimization subject to monotonicity. - - ghat = pav(y) - fits a vector ghat with nondecreasing components to the - data vector y such that sum((y - ghat).^2) is minimal. - (Pool-adjacent-violators algorithm). 
- - Author: This code is and adaptation from Bosaris Toolkit and - it is a simplified version of the 'IsoMeans.m' code made available - by Lutz Duembgen at: - http://www.imsv.unibe.ch/~duembgen/software - - Args: - y: uncalibrated scores - - Returns: - Calibrated scores - Width of pav bins, from left to right - (the number of bins is data dependent) - Height: corresponding heights of bins (in increasing order) - - """ - assert isinstance(y, np.ndarray) - - n = len(y) - assert n > 0 - index = np.zeros(y.shape, dtype=int) - l = np.zeros(y.shape, dtype=int) - # An interval of indices is represented by its left endpoint - # ("index") and its length "len" - ghat = np.zeros_like(y) - - ci = 0 - index[ci] = 0 - l[ci] = 1 - ghat[ci] = y[0] - # ci is the number of the interval considered currently. - # ghat[ci] is the mean of y-values within this interval. - for j in range(1, n): - # a new index intervall, {j}, is created: - ci = ci + 1 - index[ci] = j - l[ci] = 1 - ghat[ci] = y[j] - # while ci >= 1 and ghat[np.maximum(ci-1,0)] >= ghat[ci]: - while ci >= 1 and ghat[ci - 1] >= ghat[ci]: - # "pool adjacent violators": - nw = l[ci - 1] + l[ci] - ghat[ci - 1] = ghat[ci - 1] + (l[ci] / nw) * (ghat[ci] - ghat[ci - 1]) - l[ci - 1] = nw - ci = ci - 1 - - height = np.copy(ghat[: ci + 1]) - width = l[: ci + 1] - - # Now define ghat for all indices: - while n >= 1: - for j in range(index[ci], n): - ghat[j] = ghat[ci] - - n = index[ci] - ci = ci - 1 - - return ghat, width, height - - -def opt_loglr(tar, non, method="laplace"): - """Non-parametric optimization of score to log-likelihood-ratio mapping. - - Taken from Bosaris toolkit. - Niko Brummer and Johan du Preez, Application-Independent Evaluation of Speaker Detection, Computer Speech and Language, 2005 - - Args: - tar: target scores. - non: non-target scores. - method: laplace(default, avoids inf log-LR)/raw - - Returns: - Calibrated tar and non-tar log-LR - """ - ntar = len(tar) - nnon = len(non) - n = ntar + nnon - - scores = np.concatenate((tar, non)) - p_ideal = np.zeros((n,), dtype=float_cpu()) - p_ideal[:ntar] = 1 - - sort_idx = np.argsort(scores, kind="mergesort") - # print(scores) - # print(sort_idx) - p_ideal = p_ideal[sort_idx] - - if method == "laplace": - # The extra targets and non-targets at scores of -inf and +inf effectively - # implement Laplace's rule of succession to avoid log LRs of infinite magnitudes. - p_ideal = np.concatenate(([1, 0], p_ideal, [1, 0])) - - p_opt, _, _ = pavx(p_ideal) - - if method == "laplace": - p_opt = p_opt[2:-2] - - # Posterior to loglr - # This LR is prior-independent in the sense that if we weight the data with a synthetic prior, - # it makes no difference to the optimizing LR mapping. - # (A synthetic prior DOES change Popt: The posterior log-odds changes by an additive term. But this - # this cancels again when converting to log LR. 
) - # print(p_opt) - post_log_odds = np.log(p_opt) - np.log(1 - p_opt) - prior_log_odds = np.log(ntar / nnon) - llr = post_log_odds - prior_log_odds - llr += 1e-6 * np.arange(n) / n - - llr[sort_idx] = llr - tar_llr = llr[:ntar] - non_llr = llr[ntar:] - - return tar_llr, non_llr diff --git a/hyperion/model_loader.py b/hyperion/model_loader.py deleted file mode 100644 index 30780d7b..00000000 --- a/hyperion/model_loader.py +++ /dev/null @@ -1,36 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -from .hyp_model import HypModel -from .pdfs import * -from .transforms import * - - -class ModelLoader(object): - @staticmethod - def get_object(): - obj_dict = { - "DiagNormal": DiagNormal, - "Normal": Normal, - "DiagGMM": DiagGMM, - "GMM": GMM, - "FRPLDA": FRPLDA, - "SPLDA": SPLDA, - "CentWhiten": CentWhiten, - "LNorm": LNorm, - "PCA": PCA, - "LDA": LDA, - "NAP": NAP, - "SbSw": SbSw, - "MVN": MVN, - "TransformList": TransformList, - } - return obj_dict - - @staticmethod - def load(file_path): - class_name = HypModel.load_config(file_path)["class_name"] - class_obj = ModelLoader.get_object()[class_name] - return class_obj.load(file_path) diff --git a/hyperion/np/__init__.py b/hyperion/np/__init__.py new file mode 100644 index 00000000..86fff349 --- /dev/null +++ b/hyperion/np/__init__.py @@ -0,0 +1,8 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + + +from .np_model import NPModel +from .np_model_loader import NPModelLoader diff --git a/hyperion/np/augment/__init__.py b/hyperion/np/augment/__init__.py new file mode 100644 index 00000000..1f99ffb0 --- /dev/null +++ b/hyperion/np/augment/__init__.py @@ -0,0 +1,9 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from .noise_augment import NoiseAugment +from .reverb_augment import ReverbAugment +from .speech_augment import SpeechAugment +from .speed_augment import SpeedAugment diff --git a/hyperion/augment/noise_augment.py b/hyperion/np/augment/noise_augment.py similarity index 88% rename from hyperion/augment/noise_augment.py rename to hyperion/np/augment/noise_augment.py index ad88ff08..92bd57dd 100644 --- a/hyperion/augment/noise_augment.py +++ b/hyperion/np/augment/noise_augment.py @@ -6,13 +6,13 @@ import logging import math import multiprocessing -import yaml from copy import deepcopy import numpy as np +import yaml -from ..hyp_defs import float_cpu -from ..io import RandomAccessAudioReader as AR +from ...hyp_defs import float_cpu +from ...io import RandomAccessAudioReader as AR class SingleNoiseAugment(object): @@ -26,7 +26,7 @@ class SingleNoiseAugment(object): min_snr: mininimum SNR(dB) to sample from. max_snr: maximum SNR(dB) to sample from. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). 
""" def __init__( @@ -46,7 +46,7 @@ def __init__( self.cache = None self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -55,7 +55,7 @@ def __init__( @staticmethod def _power(x): """Computes power of x in dB.""" - return 10 * np.log10((x ** 2).sum()) + return 10 * np.log10((x**2).sum() + 1e-10) @staticmethod def snr(x, n): @@ -96,7 +96,7 @@ def forward(self, x): while noise is None or noise.shape[0] < num_samples: with self.lock: - noise_idx = self.rng.randint(len(self.noise_keys)) + noise_idx = self.rng.integers(len(self.noise_keys)) key = self.noise_keys[noise_idx] noise_k, fs_k = self.r.read([key]) noise_k = noise_k[0] @@ -112,12 +112,22 @@ def forward(self, x): with self.lock: self.cache = noise_k[need_samples:] + num_zeros = np.sum(noise == 0) with self.lock: + # add dither for noises files with many 0s. + if num_zeros > len(noise) // 3: + noise += 0.0001 * self.rng.standard_normal( + noise.shape, dtype=noise.dtype + ) + target_snr = self.rng.uniform(self.min_snr, self.max_snr) + scale = self._compute_noise_scale(x, noise, target_snr) info = {"noise_type": self.noise_type, "snr": target_snr} - return x + scale * noise, info + y = x + scale * noise + + return y, info def __call__(self, x): return self.forward(x) @@ -136,7 +146,7 @@ class NoiseAugment(object): is proportional to how often we want to sample a given noise type. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__(self, noise_prob, noise_types, random_seed=112358, rng=None): @@ -166,7 +176,7 @@ def __init__(self, noise_prob, noise_types, random_seed=112358, rng=None): self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -177,7 +187,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with noise options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: NoiseAugment object @@ -208,7 +218,7 @@ def forward(self, x): # decide whether to add noise or not with self.lock: - p = self.rng.random_sample() + p = self.rng.random() if p > self.noise_prob: # we don't add noise diff --git a/hyperion/augment/reverb_augment.py b/hyperion/np/augment/reverb_augment.py similarity index 94% rename from hyperion/augment/reverb_augment.py rename to hyperion/np/augment/reverb_augment.py index 9f80c168..0b1f3596 100644 --- a/hyperion/augment/reverb_augment.py +++ b/hyperion/np/augment/reverb_augment.py @@ -3,19 +3,19 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import time import logging import math import multiprocessing -import yaml +import time from copy import deepcopy from enum import Enum import numpy as np +import yaml from scipy import signal -from ..hyp_defs import float_cpu -from ..io import RandomAccessDataReaderFactory as DRF +from ...hyp_defs import float_cpu +from ...io import RandomAccessDataReaderFactory as DRF class RIRNormType(Enum): @@ -39,7 +39,7 @@ class SingleReverbAugment(object): its first sample. preload_rirs: if True all RIRS are loaded into RAM. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). 
""" def __init__( @@ -80,7 +80,7 @@ def __init__( self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -129,7 +129,7 @@ def forward(self, x): num_samples = x.shape[0] with self.lock: - rir_idx = self.rng.randint(len(self.rir_keys)) + rir_idx = self.rng.integers(len(self.rir_keys)) if self.preload_rirs: h = self.rirs[rir_idx] @@ -155,6 +155,7 @@ def forward(self, x): "h_max": h_max, "h_delay": h_delay, } + return y, info def __call__(self, x): @@ -176,7 +177,7 @@ class ReverbAugment(object): max_reverb_context: number of samples required as left context for the convolution operation. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -210,7 +211,7 @@ def __init__( self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -221,7 +222,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with reverb options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: ReverbAugment object. @@ -267,7 +268,7 @@ def forward(self, x): # decide whether to add reverb or not with self.lock: - p = self.rng.random_sample() + p = self.rng.random() if p > self.reverb_prob: # we don't add reverb diff --git a/hyperion/augment/speech_augment.py b/hyperion/np/augment/speech_augment.py similarity index 97% rename from hyperion/augment/speech_augment.py rename to hyperion/np/augment/speech_augment.py index b6756ce7..c27ca321 100644 --- a/hyperion/augment/speech_augment.py +++ b/hyperion/np/augment/speech_augment.py @@ -5,12 +5,11 @@ import logging import math -import yaml import numpy as np +import yaml -from ..hyp_defs import float_cpu - +from ...hyp_defs import float_cpu from .noise_augment import NoiseAugment from .reverb_augment import ReverbAugment from .speed_augment import SpeedAugment @@ -38,7 +37,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with noise options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: SpeechAugment object. diff --git a/hyperion/augment/speed_augment.py b/hyperion/np/augment/speed_augment.py similarity index 83% rename from hyperion/augment/speed_augment.py rename to hyperion/np/augment/speed_augment.py index b72bf338..95127084 100644 --- a/hyperion/augment/speed_augment.py +++ b/hyperion/np/augment/speed_augment.py @@ -5,11 +5,12 @@ import logging from copy import deepcopy -import multiprocessing + import numpy as np +import yaml from librosa.effects import time_stretch -from ..hyp_defs import float_cpu +from ...hyp_defs import float_cpu class SpeedAugment(object): @@ -21,7 +22,7 @@ class SpeedAugment(object): keep_length: applies padding or cropping to keep the lenght of the signal. random_seed: random seed for random number generator. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). 
""" def __init__( @@ -42,7 +43,7 @@ def __init__( self.keep_length = keep_length if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -53,7 +54,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with noise options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: NoiseAugment object. @@ -62,12 +63,12 @@ def create(cls, cfg, random_seed=112358, rng=None): with open(cfg, "r") as f: cfg = yaml.load(f, Loader=yaml.FullLoader) - assert isinstance(cfg, dict), "wrong object type for cfg={}".format(cfg) + assert isinstance(cfg, dict), f"wrong object type for cfg={cfg}" return cls( speed_prob=cfg["speed_prob"], speed_ratios=cfg["speed_ratios"], - keep_length=cfg["keep_length"], + keep_length=cfg["keep_length"] if "keep_length" in cfg else False, random_seed=random_seed, rng=rng, ) @@ -85,7 +86,7 @@ def forward(self, x): """ # decide whether to add noise or not - p = self.rng.random_sample() + p = self.rng.random() if p > self.speed_prob: # we don't add speed perturbation info = {"speed_ratio": 1} @@ -95,17 +96,16 @@ def forward(self, x): # change speed r = self.speed_ratios[speed_idx] info = {"speed_ratio": r} - y = time_stretch(x, r) + y = time_stretch(x, rate=r) # print(f"1 r={r} {x.shape} {y.shape}", flush=True) if self.keep_length: if r > 1: - dither = np.max(x) / 2 ** 15 # we add some dither in the padding + dither = np.max(x) / 2**15 # we add some dither in the padding pad_y = dither * np.ones((x.shape[-1] - y.shape[-1],), dtype=y.dtype) y = np.concatenate((y, pad_y), axis=-1) elif r < 1: y = y[: x.shape[-1]] - # print(f"2 r={r} {x.shape} {y.shape}", flush=True) return y, info def __call__(self, x): diff --git a/hyperion/calibration/__init__.py b/hyperion/np/calibration/__init__.py similarity index 100% rename from hyperion/calibration/__init__.py rename to hyperion/np/calibration/__init__.py diff --git a/hyperion/calibration/gauss_calibration.py b/hyperion/np/calibration/gauss_calibration.py similarity index 98% rename from hyperion/calibration/gauss_calibration.py rename to hyperion/np/calibration/gauss_calibration.py index 07d882ed..630d5e95 100644 --- a/hyperion/calibration/gauss_calibration.py +++ b/hyperion/np/calibration/gauss_calibration.py @@ -4,10 +4,10 @@ """ import numpy as np -from ..hyp_model import HypModel +from ..np_model import NPModel -class GaussCalibration(HypModel): +class GaussCalibration(NPModel): """Class for supervised Gaussian calibration. The model assumes that targer and non-target score distributions are Gaussians with shared covariance. 
diff --git a/hyperion/calibration/unsup_gauss_calibration.py b/hyperion/np/calibration/unsup_gauss_calibration.py similarity index 99% rename from hyperion/calibration/unsup_gauss_calibration.py rename to hyperion/np/calibration/unsup_gauss_calibration.py index 5f368a71..fd440995 100644 --- a/hyperion/calibration/unsup_gauss_calibration.py +++ b/hyperion/np/calibration/unsup_gauss_calibration.py @@ -3,6 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import sys + import numpy as np from ..pdfs.mixtures.diag_gmm_tiedcovs import DiagGMMTiedCovs as GMM diff --git a/hyperion/np/classifiers/__init__.py b/hyperion/np/classifiers/__init__.py new file mode 100644 index 00000000..60582016 --- /dev/null +++ b/hyperion/np/classifiers/__init__.py @@ -0,0 +1,13 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from .binary_logistic_regression import BinaryLogisticRegression +from .greedy_fusion import GreedyFusionBinaryLR +from .linear_gbe import LinearGBE +from .linear_gbe_up import LinearGBEUP +from .linear_svmc import LinearSVMC +from .logistic_regression import LogisticRegression +from .q_scoring_homo_gbe import QScoringHomoGBE +from .svmc import SVMC diff --git a/hyperion/classifiers/binary_logistic_regression.py b/hyperion/np/classifiers/binary_logistic_regression.py similarity index 99% rename from hyperion/classifiers/binary_logistic_regression.py rename to hyperion/np/classifiers/binary_logistic_regression.py index c144105f..e77115cd 100644 --- a/hyperion/classifiers/binary_logistic_regression.py +++ b/hyperion/np/classifiers/binary_logistic_regression.py @@ -29,7 +29,7 @@ class BinaryLogisticRegression(LogisticRegression): In this case, x becomes [x, bias_scaling], i.e. a “synthetic” feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight. Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) bias_scaling has to be increased. priors: prior prob for having a positive sample. - random_state: RandomState instance or None, optional, default: None + random_state: default_rng instance or None, optional, default: None Used when solver == ‘sag’ or ‘liblinear’. solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’ Algorithm to use in the optimization problem. @@ -91,7 +91,7 @@ def __init__( verbose=verbose, warm_start=warm_start, multi_class="ovr", - lr_seed=1024, + lr_seed=lr_seed, **kwargs ) diff --git a/hyperion/classifiers/greedy_fusion.py b/hyperion/np/classifiers/greedy_fusion.py similarity index 97% rename from hyperion/classifiers/greedy_fusion.py rename to hyperion/np/classifiers/greedy_fusion.py index 6eff32ad..646af8d3 100644 --- a/hyperion/classifiers/greedy_fusion.py +++ b/hyperion/np/classifiers/greedy_fusion.py @@ -4,16 +4,16 @@ """ import logging + import numpy as np -from ..hyp_defs import float_cpu, float_save -from ..hyp_model import HypModel +from ...hyp_defs import float_cpu, float_save from ..metrics import dcf - +from ..np_model import NPModel from .binary_logistic_regression import BinaryLogisticRegression as BLR -class GreedyFusionBinaryLR(HypModel): +class GreedyFusionBinaryLR(NPModel): """Greedy score fusion based on binary logistic regression. It computes ``max_systmes`` fusions. 
The best system, the best fusion of two, @@ -42,8 +42,8 @@ class GreedyFusionBinaryLR(HypModel): In this case, x becomes [x, bias_scaling], i.e. a “synthetic” feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight. Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) bias_scaling has to be increased. priors: prior prob for having a positive sample. - random_state: int, RandomState instance or None, optional, default: None - The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; . Used when solver == ‘sag’ or ‘liblinear’. + random_state: int, default_rng instance or None, optional, default: None + The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If default_rng instance, random_state is the random number generator; . Used when solver == ‘sag’ or ‘liblinear’. solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’ Algorithm to use in the optimization problem. For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and @@ -226,7 +226,7 @@ def fit(self, x, class_ids, sample_weights=None): num_cands = len(cand_systems) cand_min_dcf = np.zeros((num_cands,), dtype=float_cpu()) cand_act_dcf = np.zeros((num_cands,), dtype=float_cpu()) - all_pos = np.zeros((num_cands,), dtype=np.bool) + all_pos = np.zeros((num_cands,), dtype=bool) cand_weights = [] for j in range(num_cands): system_idx_ij = np.concatenate( diff --git a/hyperion/classifiers/linear_gbe.py b/hyperion/np/classifiers/linear_gbe.py similarity index 88% rename from hyperion/classifiers/linear_gbe.py rename to hyperion/np/classifiers/linear_gbe.py index 075ea751..f551af14 100644 --- a/hyperion/classifiers/linear_gbe.py +++ b/hyperion/np/classifiers/linear_gbe.py @@ -4,15 +4,17 @@ """ import logging + import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from scipy.special import gammaln -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel -from ..utils.math import int2onehot, logdet_pdmat, invert_pdmat, softmax +from ...hyp_defs import float_cpu +from ...utils.math_funcs import int2onehot, invert_pdmat, logdet_pdmat, softmax +from ..np_model import NPModel -class LinearGBE(HypModel): +class LinearGBE(NPModel): """Linear Gaussian Back-end. Attributes: @@ -30,6 +32,7 @@ class LinearGBE(HypModel): prior_nu: if given, it overwrites nu in the prior object. post_beta: if given, it fixes the value of beta in the posterior, overwriting the beta computed by the fit function. post_nu: if given, it fixes the value of nu in the posterior, overwriting the beta computed by the fit function. + labels: list of class labels. 
""" def __init__( @@ -48,6 +51,7 @@ def __init__( prior_nu=None, post_beta=None, post_nu=None, + labels=None, **kwargs ): @@ -73,8 +77,15 @@ def __init__( self.post_beta = post_beta self.post_nu = post_nu + self.set_labels(labels) self._compute_Ab() + def set_labels(self, labels): + if isinstance(labels, np.ndarray): + labels = list(labels) + + self.labels = labels + def get_config(self): """ Returns: @@ -90,6 +101,7 @@ def get_config(self): "prior_nu": self.prior_nu, "post_beta": self.post_beta, "post_nu": self.post_nu, + "labels": self.labels, } base_config = super().get_config() @@ -259,7 +271,6 @@ def fit(self, x, class_ids=None, p_theta=None, sample_weight=None): p_theta = sample_weight[:, None] * p_theta N = np.sum(p_theta, axis=0) - F = np.dot(p_theta.T, x) if self.update_mu: @@ -337,8 +348,8 @@ def filter_class_args(**kwargs): valid_args = ( "update_mu", "update_W", - "no_update_mu", - "no_update_W", + "update_mu", + "update_W", "balance_class_weight", "prior", "prior_beta", @@ -348,11 +359,6 @@ def filter_class_args(**kwargs): "name", ) d = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - if "no_update_mu" in d: - d["update_mu"] = not d["no_update_mu"] - if "no_update_W" in d: - d["update_W"] = not d["no_update_W"] - return d filter_train_args = filter_class_args @@ -364,61 +370,68 @@ def add_class_args(parser, prefix=None): parser: jsonargparse object prefix: argument prefix. """ - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "no-update-mu", - default=False, - action="store_true", + "--update-mu", + default=True, + action=ActionYesNo, + nargs="?", help="do not update mu", ) parser.add_argument( - p1 + "no-update-W", - default=False, - action="store_true", + "--update-W", + default=True, + action=ActionYesNo, + nargs="?", help="do not update W", ) parser.add_argument( - p1 + "balance-class-weight", + "--balance-class-weight", default=False, - action="store_true", + action=ActionYesNo, + nargs="?", help="Balances the weight of each class when computing W", ) parser.add_argument( - p1 + "prior", default=None, help="prior file for MAP adaptation" + "--prior", default=None, help="prior file for MAP adaptation" ) parser.add_argument( - p1 + "prior-beta", + "--prior-beta", default=16, type=float, help="relevance factor for the means", ) parser.add_argument( - p1 + "prior-nu", + "--prior-nu", default=16, type=float, help="relevance factor for the variances", ) parser.add_argument( - p1 + "post-beta", + "--post-beta", default=None, type=float, help="relevance factor for the means", ) parser.add_argument( - p1 + "post-nu", + "--post-nu", default=None, type=float, help="relevance factor for the variances", ) - parser.add_argument(p1 + "name", default="lgbe", help="model name") + parser.add_argument("--name", default="lgbe", help="model name") + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) @staticmethod - def filter_eval_args(prefix, **kwargs): + def filter_eval_args(**kwargs): """Extracts the evaluation time hyperparams of the class from a dictionary. Returns: @@ -434,20 +447,19 @@ def add_eval_args(parser, prefix=None): parser: jsonargparse object prefix: argument prefix. """ - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
+ if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") - parser.add_argument(p1 + "model-file", required=True, help=("model file")) parser.add_argument( - p1 + "normalize", + "--normalize", default=False, - action="store_true", + action=ActionYesNo, + nargs="?", help=("normalizes the ouput probabilities to sum to one"), ) parser.add_argument( - p1 + "eval-method", + "--eval-method", default="linear", choices=["linear", "llk", "predictive"], help=( @@ -455,6 +467,11 @@ def add_eval_args(parser, prefix=None): "or predictive distribution" ), ) + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) add_argparse_args = add_class_args add_argparse_train_args = add_class_args diff --git a/hyperion/classifiers/linear_gbe_up.py b/hyperion/np/classifiers/linear_gbe_up.py similarity index 98% rename from hyperion/classifiers/linear_gbe_up.py rename to hyperion/np/classifiers/linear_gbe_up.py index 8c855dfa..37ac9656 100644 --- a/hyperion/classifiers/linear_gbe_up.py +++ b/hyperion/np/classifiers/linear_gbe_up.py @@ -4,18 +4,19 @@ """ import logging + import numpy as np from scipy.special import gammaln -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel -from ..utils.math import ( +from ...hyp_defs import float_cpu +from ...utils.math_funcs import ( + fullcov_varfloor, int2onehot, - logdet_pdmat, invert_pdmat, + logdet_pdmat, softmax, - fullcov_varfloor, ) +from ..np_model import NPModel from .linear_gbe import LinearGBE diff --git a/hyperion/classifiers/linear_svmc.py b/hyperion/np/classifiers/linear_svmc.py similarity index 83% rename from hyperion/classifiers/linear_svmc.py rename to hyperion/np/classifiers/linear_svmc.py index 244e0dc0..6a977df9 100644 --- a/hyperion/classifiers/linear_svmc.py +++ b/hyperion/np/classifiers/linear_svmc.py @@ -4,16 +4,17 @@ """ import logging -import numpy as np +import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from sklearn.svm import LinearSVC as SVC -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel -from ..utils.math import softmax +from ...hyp_defs import float_cpu +from ...utils.math_funcs import softmax +from ..np_model import NPModel -class LinearSVMC(HypModel): +class LinearSVMC(NPModel): """Linear Support Vector Machine for Classification. Attributes: @@ -40,7 +41,7 @@ class LinearSVMC(HypModel): The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). - random_state: RandomState instance or None, optional, default: None + random_state: default_rng instance or None, optional, default: None max_iter: int, default: 100 Useful only for the newton-cg, sag and lbfgs solvers. Maximum number of iterations taken for the solvers to converge. @@ -60,7 +61,8 @@ class LinearSVMC(HypModel): penalty and dual will be ignored. verbose: int, default: 0 balance_class_weight: if True and class_weight is None, it makes class_weight="balanced". - lr_seed: seed form RandomState, used when random_state is None. + lr_seed: seed form default_rng, used when random_state is None. 
+ labels: list of class labels """ def __init__( @@ -81,7 +83,8 @@ def __init__( verbose=0, balance_class_weight=True, lr_seed=1024, - **kwargs + labels=None, + **kwargs, ): super().__init__(**kwargs) @@ -90,12 +93,11 @@ def __init__( class_weight = "balanced" if random_state is None: - random_state = np.random.RandomState(seed=lr_seed) + random_state = np.random.default_rng(seed=lr_seed) self.use_bias = use_bias self.bias_scaling = bias_scaling self.balance_class_weight = balance_class_weight - logging.debug(class_weight) self.svm = SVC( penalty=penalty, C=C, @@ -117,6 +119,8 @@ def __init__( if b is not None: self.svm.intercept_ = b + self.set_labels(labels) + @property def A(self): return self.svm.coef_.T @@ -125,6 +129,12 @@ def A(self): def b(self): return self.svm.intercept_ * self.bias_scaling + def set_labels(self, labels): + if isinstance(labels, np.ndarray): + labels = list(labels) + + self.labels = labels + def get_config(self): """Gets configuration hyperparams. Returns: @@ -134,8 +144,9 @@ def get_config(self): "use_bias": self.use_bias, "bias_scaling": self.bias_scaling, "balance_class_weight": self.balance_class_weight, + "labels": self.labels, } - base_config = super(LinearSVMC, self).get_config() + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) def predict(self, x, eval_type="logit"): @@ -203,7 +214,7 @@ def load_params(cls, f, config): return cls(**kwargs) @staticmethod - def filter_class_args(prefix=None, **kwargs): + def filter_class_args(**kwargs): """Extracts the hyperparams of the class from a dictionary. Returns: @@ -236,42 +247,35 @@ def add_class_args(parser, prefix=None): parser: jsonargparse object prefix: argument prefix. """ - if prefix is None: - p1 = "--" - p2 = "" - else: - p1 = "--" + prefix + "." - p2 = prefix + "." + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "penalty", + "--penalty", default="l2", choices=["l2", "l1"], help="used to specify the norm used in the penalization", ) parser.add_argument( - p1 + "c", - dest=(p2 + "C"), + "--c", + dest="C", default=1.0, type=float, help="inverse of regularization strength", ) parser.add_argument( - p1 + "loss", + "--loss", default="squared_hinge", choices=["hinge", "squared_hinge"], help="type of loss", ) parser.add_argument( - p1 + "no-use-bias", - dest=(p2 + "use_bias"), - default=True, - action="store_false", - help="Not use bias", + "--use-bias", default=True, action=ActionYesNo, nargs="?", help="Use bias", ) parser.add_argument( - p1 + "bias-scaling", + "--bias-scaling", default=1.0, type=float, help=( @@ -280,19 +284,19 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "lr-seed", default=1024, type=int, help="random number generator seed" + "--lr-seed", default=1024, type=int, help="random number generator seed" ) parser.add_argument( - p1 + "max-iter", + "--max-iter", default=100, type=int, help="only for the newton-cg, sag and lbfgs solvers", ) parser.add_argument( - p1 + "no-dual", - dest=(p2 + "dual"), + "--dual", default=True, - action="store_false", + action=ActionYesNo, + nargs="?", help=( "dual or primal formulation. 
" "Dual formulation is only implemented for " @@ -300,10 +304,10 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "tol", default=1e-4, type=float, help="tolerance for stopping criteria" + "--tol", default=1e-4, type=float, help="tolerance for stopping criteria" ) parser.add_argument( - p1 + "multi-class", + "--multi-class", default="ovr", choices=["ovr", "crammer_singer"], help=( @@ -312,29 +316,33 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "verbose", + "--verbose", default=0, type=int, help="For the liblinear and lbfgs solvers", ) parser.add_argument( - p1 + "balance-class-weight", + "--balance-class-weight", default=False, - action="store_true", + action=ActionYesNo, help="Balances the weight of each class when computing W", ) - parser.add_argument(p1 + "name", default="svc", help="model name") + parser.add_argument("--name", default="svc", help="model name") + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) @staticmethod - def filter_eval_args(prefix, **kwargs): + def filter_eval_args(**kwargs): """Extracts the evaluation time hyperparams of the class from a dictionary. Returns: Hyperparameters to evaluate the class. """ - valid_args = ("model_file", "eval_type") + valid_args = "eval_type" return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod @@ -344,21 +352,22 @@ def add_eval_args(parser, prefix=None): parser: jsonargparse object prefix: argument prefix. """ - if prefix is None: - p1 = "--" - p2 = "" - else: - p1 = "--" + prefix + "." - p2 = prefix + "." - - parser.add_argument(p1 + "model-file", required=True, help=("model file")) + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + parser.add_argument( - p1 + "eval-type", + "--eval-type", default="logit", - choices=["logit", "bin-logpost", "bin-post", "cat-logpost", "cat-post"], + choices=["logit", "bin-log-post", "bin-post", "cat-log-post", "cat-post"], help=("type of evaluation"), ) + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + # for backward compatibility filter_train_args = filter_class_args add_argparse_args = add_class_args diff --git a/hyperion/classifiers/logistic_regression.py b/hyperion/np/classifiers/logistic_regression.py similarity index 98% rename from hyperion/classifiers/logistic_regression.py rename to hyperion/np/classifiers/logistic_regression.py index ad845170..03d9fd13 100644 --- a/hyperion/classifiers/logistic_regression.py +++ b/hyperion/np/classifiers/logistic_regression.py @@ -4,16 +4,16 @@ """ import logging -import numpy as np +import numpy as np from sklearn.linear_model import LogisticRegression as LR -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel -from ..utils.math import softmax +from ...hyp_defs import float_cpu +from ...utils.math_funcs import softmax +from ..np_model import NPModel -class LogisticRegression(HypModel): +class LogisticRegression(NPModel): """Multi-class logistic regression. This is a wrapper that add functionalities to sklearn logistic regression. @@ -36,7 +36,7 @@ class LogisticRegression(HypModel): Weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). 
Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. - random_state: RandomState instance or None, optional, default: None + random_state: default_rng instance or None, optional, default: None Used when solver == ‘sag’ or ‘liblinear’. solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’ Algorithm to use in the optimization problem. @@ -93,6 +93,7 @@ def __init__( super().__init__(**kwargs) if random_state is None: + # random_state = np.random.default_rng(seed=lr_seed) random_state = np.random.RandomState(seed=lr_seed) if bias_scaling is None: diff --git a/hyperion/classifiers/q_scoring_homo_gbe.py b/hyperion/np/classifiers/q_scoring_homo_gbe.py similarity index 97% rename from hyperion/classifiers/q_scoring_homo_gbe.py rename to hyperion/np/classifiers/q_scoring_homo_gbe.py index 83f2408b..3345dd72 100644 --- a/hyperion/classifiers/q_scoring_homo_gbe.py +++ b/hyperion/np/classifiers/q_scoring_homo_gbe.py @@ -4,15 +4,16 @@ """ import logging + import numpy as np from scipy.special import gammaln -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel -from ..utils.math import int2onehot, logdet_pdmat, invert_pdmat, softmax +from ...hyp_defs import float_cpu +from ...utils.math_funcs import int2onehot, invert_pdmat, logdet_pdmat, softmax +from ..np_model import NPModel -class QScoringHomoGBE(HypModel): +class QScoringHomoGBE(NPModel): def __init__( self, mu=None, diff --git a/hyperion/np/classifiers/svmc.py b/hyperion/np/classifiers/svmc.py new file mode 100644 index 00000000..ac5211ef --- /dev/null +++ b/hyperion/np/classifiers/svmc.py @@ -0,0 +1,360 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import pickle + +import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from sklearn.svm import SVC + +from ...hyp_defs import float_cpu +from ...utils.math_funcs import softmax +from ...utils.misc import filter_func_args +from ..np_model import NPModel + + +class SVMC(NPModel): + """Gaussian Support Vector Machine for Classification.""" + + def __init__( + self, + C=1.0, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + probability=True, + tol=0.0001, + cache_size=600, + multi_class="ovr", + break_ties=True, + class_weight=None, + random_state=None, + max_iter=100, + verbose=0, + balance_class_weight=True, + lr_seed=1024, + labels=None, + **kwargs, + ): + + super().__init__(**kwargs) + + if class_weight is None and balance_class_weight: + class_weight = "balanced" + + if random_state is None: + random_state = np.random.default_rng(seed=lr_seed) + + self.C = C + self.kernel = kernel + self.degree = degree + self.gamma = gamma + self.coef0 = coef0 + self.shrinking = shrinking + self.probability = probability + self.tol = tol + self.cache_size = cache_size + self.multi_class = multi_class + self.break_ties = break_ties + self.class_weight = class_weight + + self.balance_class_weight = balance_class_weight + self.svm = SVC( + C=C, + kernel=kernel, + gamma=gamma, + degree=degree, + coef0=coef0, + shrinking=shrinking, + probability=probability, + tol=tol, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=multi_class, + break_ties=break_ties, + random_state=random_state, + ) + + self.set_labels(labels) + + @property + def model_params(self): + 
return self.svm.get_params() + + def set_labels(self, labels): + if isinstance(labels, np.ndarray): + labels = list(labels) + self.labels = labels + + def get_config(self): + """Gets configuration hyperparams. + Returns: + Dictionary with config hyperparams. + """ + config = { + "C": self.C, + "kernel": self.kernel, + "gamma": self.gamma, + "degree": self.degree, + "coef0": self.coef0, + "shrinking": self.shrinking, + "probability": self.probability, + "tol": self.tol, + "cache_size": self.cache_size, + "multi_class": self.multi_class, + "break_ties": self.break_ties, + "class_weight": self.class_weight, + "balance_class_weight": self.balance_class_weight, + "labels": self.labels, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def predict(self, x, eval_type="decision-func"): + """Evaluates the SVM + + Args: + x: input features (num_samples, feat_dim), + it can be (num_samples,) if feat_dim=1. + eval_type: evaluationg method: logit (log-likelihood ratio), + bin-log-post (binary log-posteriors), + bin-post (binary posteriors) + cat-log-post (categorical log-posteriors), + cat-post (categorical posteriors) + Returns: + Ouput scores (num_samples, num_classes) + """ + if eval_type == "cat-post": + return self.svm.predict_proba(x) + if eval_type == "cat-log-post": + return self.svm.predict_log_proba(x) + + return self.svm.decision_function(x) + + def __call__(self, x, eval_type="decision-func"): + """Evaluates the SVM + + Args: + x: input features (num_samples, feat_dim), + it can be (num_samples,) if feat_dim=1. + eval_type: evaluationg method: logit (log-likelihood ratio), + bin-log-post (binary log-posteriors), + bin-post (binary posteriors) + cat-log-post (categorical log-posteriors), + cat-post (categorical posteriors) + Returns: + Ouput scores (num_samples, num_classes) + """ + return self.predict(x, eval_type) + + def fit(self, x, class_ids, sample_weight=None): + """Estimates the parameters of the model. + + Args: + x: input features (num_samples, feat_dim), it can be (num_samples,) if feat_dim=1. + class_ids: class integer [0, num_classes-1] identifier (num_samples,) + sample_weight: weight of each sample in the estimation (num_samples,) + """ + self.svm.fit(x, class_ids) + if self.svm.fit_status_: + logging.warning("SVM did not converge") + + def save(self, file_path): + """Saves the model to file. + + Args: + file_path: filename path. + """ + file_dir = os.path.dirname(file_path) + if not (os.path.isdir(file_dir)): + os.makedirs(file_dir, exist_ok=True) + split_path = os.path.splitext(file_path) + if not split_path[-1] == "sav": + file_path = "".join(split_path[0] + ".sav") + with open(file_path, "wb") as f: + self.save_params(f) + + @classmethod + def load(cls, file_path): + """Loads the model from file. + + Args: + file_path: path to the file where the model is stored. + + Returns: + Model object. + """ + split_path = os.path.splitext(file_path) + if not split_path[-1] == "pkl": + file_path = "".join(split_path[0] + ".pkl") + + with open(file_path, "rb") as f: + return pickle.load(f) + + def save_params(self, f): + pickle.dump(self, f) + + @classmethod + def load_params(cls, f): + svmc = pickle.load(f) + return svmc + + @staticmethod + def filter_class_args(**kwargs): + """Extracts the hyperparams of the class from a dictionary. + + Returns: + Hyperparamter dictionary to initialize the class. 
+ """ + return filter_func_args(SVMC.__init__, **kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + """It adds the arguments corresponding to the class to jsonarparse. + Args: + parser: jsonargparse object + prefix: argument prefix. + """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--c", + dest="C", + default=1.0, + type=float, + help="inverse of regularization strength", + ) + parser.add_argument( + "--kernel", + default="rbf", + choices=["linear", "poly", "rbf", "sigmoid", "precomputed"], + help="kernel for svm", + ) + parser.add_argument( + "--degree", defaut=3, type=int, help="degree of polynomial kernel" + ) + parser.add_argument( + "--gamma", + default="scale", + choices=["scale", "auto"], + help="Kernel coefficient for ‘rbf’", + ) + parser.add_argument( + "--coef0", + default=0.0, + type=float, + help="independent term of poly and sigmoid kernels", + ) + parser.add_argument( + "--shrinking", + default=True, + type=bool, + help="Whether to use the shrinking heuristic", + ) + parser.add_argument( + "--probability", + default=True, + type=bool, + help="Whether to enable probability estimates", + ) + parser.add_argument( + "--break-ties", + default=True, + type=bool, + help="If true, predict will break ties according to the confidence values of decision_function; otherwise \ + the first class among the tied classes is returned", + ) + parser.add_argument( + "--lr-seed", default=1024, type=int, help="random number generator seed" + ) + parser.add_argument( + "--max-iter", + dest="max_iter", + default=100, + type=int, + help="only for the newton-cg, sag and lbfgs solvers", + ) + parser.add_argument( + "--tol", default=1e-4, type=float, help="tolerance for stopping criteria" + ) + parser.add_argument( + "--multi-class", + default="ovr", + choices=["ovr", "ovo"], + help=( + "ovr fits a binary problem for each class else " + "it minimizes the multinomial loss." + ), + ) + parser.add_argument( + "--cache-size", + default=600, + type=int, + help="Specify the size of the kernel cache (in MB)", + ) + parser.add_argument( + "--verbose", + default=0, + type=int, + help="For the liblinear and lbfgs solvers", + ) + parser.add_argument( + "--balance-class-weight", + default=False, + action=ActionYesNo, + help="Balances the weight of each class when computing W", + ) + parser.add_argument("--name", default="svc", help="model name") + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + + @staticmethod + def filter_eval_args(**kwargs): + """Extracts the evaluation time hyperparams of the class from a dictionary. + + Returns: + Hyperparameters to evaluate the class. + """ + valid_args = "eval_type" + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_eval_args(parser, prefix=None): + """It adds the arguments needed to evaluate the class to jsonarparse. + Args: + parser: jsonargparse object + prefix: argument prefix. 
+ """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--eval-type", + default="decision-func", + choices=["cat-log-post", "cat-post", "decision-func"], + help=("type of evaluation"), + ) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + + # for backward compatibility + filter_train_args = filter_class_args + add_argparse_args = add_class_args + add_argparse_train_args = add_class_args + add_argparse_eval_args = add_eval_args diff --git a/hyperion/np/clustering/__init__.py b/hyperion/np/clustering/__init__.py new file mode 100644 index 00000000..80cfaa2c --- /dev/null +++ b/hyperion/np/clustering/__init__.py @@ -0,0 +1,8 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from .ahc import AHC +from .kmeans import KMeans, KMeansInitMethod +from .spectral_clustering import SpectralClustering diff --git a/hyperion/clustering/ahc.py b/hyperion/np/clustering/ahc.py similarity index 97% rename from hyperion/clustering/ahc.py rename to hyperion/np/clustering/ahc.py index c99cfa2e..e6e0d81b 100644 --- a/hyperion/clustering/ahc.py +++ b/hyperion/np/clustering/ahc.py @@ -3,18 +3,18 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np -import h5py from copy import copy +import h5py +import numpy as np from scipy.cluster.hierarchy import linkage -from sklearn.metrics import homogeneity_score, completeness_score +from sklearn.metrics import completeness_score, homogeneity_score -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel +from ...hyp_defs import float_cpu +from ..np_model import NPModel -class AHC(HypModel): +class AHC(NPModel): """Agglomerative Hierarchical Clustering class. Attributes: diff --git a/hyperion/np/clustering/kmeans.py b/hyperion/np/clustering/kmeans.py new file mode 100644 index 00000000..59983cae --- /dev/null +++ b/hyperion/np/clustering/kmeans.py @@ -0,0 +1,270 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import sys +from concurrent.futures import ThreadPoolExecutor, as_completed +from enum import Enum + +import h5py +import numpy as np + +from ...hyp_defs import float_cpu +from ..np_model import NPModel + + +class KMeansInitMethod(str, Enum): + max_dist = "max_dist" + random = "random" + + @staticmethod + def choices(): + return [KMeansInitMethod.max_dist, KMeansInitMethod.random] + + +class KMeans(NPModel): + """K-Means clustering class. + + Attributes: + num_clusters: number of clusters. + mu: cluster centers. + rtol: minimum delta in loss function used as stopping criterion. + """ + + def __init__( + self, + num_clusters, + mu=None, + rtol=0.001, + epochs=100, + init_method=KMeansInitMethod.max_dist, + num_workers=1, + verbose=True, + rng_seed=11235813, + **kwargs + ): + super().__init__(**kwargs) + self.num_clusters = num_clusters + self.mu = mu + self.rtol = rtol + self.epochs = epochs + self.verbose = verbose + self.num_workers = num_workers + self.init_method = init_method + if self.init_method == KMeansInitMethod.random: + self.rng = np.random.default_rng(seed=rng_seed) + + def fit(self, x): + """Performs the clustering. + + Args: + x: input data (num_samples, feat_dim). + epochs: max. number of epochs. + + Returns: + loss: value of loss function (num_epochs,). 
+ cluster_index: clustering labels as int numpy array with shape=(num_samples,) + """ + loss = np.zeros((self.epochs,), dtype=float_cpu()) + if self.init_method == KMeansInitMethod.max_dist: + if self.num_workers == 1: + self.mu = self._choose_seeds_max_dist(x) + else: + self.mu = self._choose_seeds_max_dist_multithread(x) + else: + self.mu = self._choose_seeds_random(x) + + cluster_index, err2 = self(x) + for epoch in range(self.epochs): + if self.num_workers == 1: + self.mu = self._compute_centroids(x, cluster_index) + else: + self.mu = self._compute_centroids_multithread(x, cluster_index) + cluster_index, err2 = self(x) + loss[epoch] = np.mean(err2) + if epoch > 0: + delta = np.abs(loss[epoch - 1] - loss[epoch]) / ( + loss[epoch - 1] + 1e-10 + ) + if self.verbose: + logging.info( + "epoch: %d loss: %f rdelta: %f", epoch, loss[epoch], delta + ) + if delta < self.rtol: + loss = loss[: epoch + 1] + break + else: + if self.verbose: + logging.info("epoch: %d loss: %f", epoch, loss[epoch]) + + return loss, cluster_index + + def _choose_seeds_random(self, x): + """Chooses the initial seeds for the clustering randomly. + + Args: + x: input data (num_samples, feat_dim). + + Returns: + Initial centers (num_clusters, feat_dim) + """ + if self.verbose: + logging.info("choosing seeds") + + mu = self.rng.choice(x, size=(self.num_clusters,), replace=False, shuffle=False) + if self.verbose: + logging.info("%d seeds chosen", self.num_clusters) + + return mu + + def _choose_seeds_max_dist(self, x): + """Chooses the initial seeds for the clustering. + + Args: + x: input data (num_samples, feat_dim). + + Returns: + Initial centers (num_clusters, feat_dim) + """ + if self.verbose: + logging.info("choosing seeds") + mu = np.zeros((self.num_clusters, x.shape[-1]), dtype=float_cpu()) + mu[0] = x[0] + for i in range(1, self.num_clusters): + d = np.zeros((x.shape[0],), dtype=float_cpu()) + for j in range(i): + d += np.sum(np.square(x - mu[j]), axis=-1) + index = np.argmax(d) + mu[i] = x[index] + return mu + + @staticmethod + def _compute_d2(x, mu): + return np.sum(np.square(x - mu), axis=-1) + + def _choose_seeds_max_dist_multithread(self, x): + """Chooses the initial seeds for the clustering. + + Args: + x: input data (num_samples, feat_dim). + + Returns: + Initial centers (num_clusters, feat_dim) + """ + if self.verbose: + logging.info("choosing seeds") + + mu = np.zeros((self.num_clusters, x.shape[-1]), dtype=float_cpu()) + with ThreadPoolExecutor(max_workers=self.num_workers) as executor: + mu[0] = x[0] + for i in range(1, self.num_clusters): + d = np.zeros((x.shape[0],), dtype=float_cpu()) + + futures = { + executor.submit(KMeans._compute_d2, x, mu[j]): j for j in range(i) + } + for future in as_completed(futures): + d += future.result() + + index = np.argmax(d) + mu[i] = x[index] + if self.verbose and (i % 10 == 0 or i == self.num_clusters - 1): + logging.info("%d seeds chosen", i + 1) + return mu + + def _compute_centroids(self, x, index): + """Compute the centroids given cluster assigments. 
+ + Args: + x: input data (num_samples, feat_dim) + index: cluster assignments as integers with shape=(num_samples,) + + Returns: + Cluster centroids (num_clusters, feat_dim) + """ + mu = np.zeros((self.num_clusters, x.shape[-1]), dtype=float_cpu()) + for k in range(self.num_clusters): + r = index == k + if np.sum(r) > 0: + mu[k] = np.mean(x[r], axis=0) + return mu + + @staticmethod + def _compute_centroid(x, index, k): + r = index == k + if np.sum(r) > 0: + return np.mean(x[r], axis=0) + else: + return None + + def _compute_centroids_multithread(self, x, index): + """Compute the centroids given cluster assigments. + + Args: + x: input data (num_samples, feat_dim) + index: cluster assignments as integers with shape=(num_samples,) + + Returns: + Cluster centroids (num_clusters, feat_dim) + """ + mu = np.zeros((self.num_clusters, x.shape[-1]), dtype=float_cpu()) + with ThreadPoolExecutor(max_workers=self.num_workers) as executor: + futures = { + executor.submit(KMeans._compute_centroid, x, index, k): k + for k in range(self.num_clusters) + } + for future in as_completed(futures): + k = futures[future] + mu_k = future.result() + if mu_k is not None: + mu[k] = mu_k + + return mu + + def predict(self, x): + """Compute the cluster labels for new data. + + Args: + x: input data (num_samples, feat_dim) + + Returns: + Cluster assignments as integer array (num_samples,) + Square distance of each element to the center of its cluster. + """ + err2 = np.zeros((x.shape[0], self.num_clusters), dtype=float_cpu()) + for k in range(self.num_clusters): + err2[:, k] = np.sum(np.square(x - self.mu[k]), axis=-1) + + index = np.argmin(err2, axis=-1) + return index, err2[np.arange(x.shape[0]), index] + + def predict_multithread(self, x): + """Compute the cluster labels for new data. + + Args: + x: input data (num_samples, feat_dim) + + Returns: + Cluster assignments as integer array (num_samples,) + Square distance of each element to the center of its cluster. 
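The new `KMeans` returns the per-epoch loss and the final assignments from `fit`, and scores new data through `__call__`; setting `num_workers > 1` switches seeding, centroid updates, and prediction to the threaded code paths. A small usage sketch on synthetic data:

```python
import numpy as np
from hyperion.np.clustering import KMeans, KMeansInitMethod

rng = np.random.default_rng(0)
x = np.concatenate([rng.normal(0, 1, (200, 2)), rng.normal(5, 1, (200, 2))])

kmeans = KMeans(num_clusters=2, epochs=100, rtol=0.001,
                init_method=KMeansInitMethod.max_dist)
loss, cluster_index = kmeans.fit(x)  # loss per epoch, label per sample
index, err2 = kmeans(x)              # assignments + squared distances
```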
+ """ + err2 = np.zeros((x.shape[0], self.num_clusters), dtype=float_cpu()) + with ThreadPoolExecutor(max_workers=self.num_workers) as executor: + futures = { + executor.submit(KMeans._compute_d2, x, self.mu[k]): k + for k in range(self.num_clusters) + } + for future in as_completed(futures): + k = futures[future] + err2[:, k] = future.result() + + index = np.argmin(err2, axis=-1) + return index, err2[np.arange(x.shape[0]), index] + + def __call__(self, x): + if self.num_workers == 1: + return self.predict(x) + else: + return self.predict_multithread(x) diff --git a/hyperion/np/clustering/spectral_clustering.py b/hyperion/np/clustering/spectral_clustering.py new file mode 100644 index 00000000..ab2fad26 --- /dev/null +++ b/hyperion/np/clustering/spectral_clustering.py @@ -0,0 +1,312 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from copy import copy +from enum import Enum +from typing import Any, Dict, Optional + +import h5py +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from scipy import sparse +from scipy.linalg import eigh +from scipy.sparse.csgraph import laplacian as csgraph_laplacian +from scipy.sparse.linalg import eigsh +from sklearn.metrics import completeness_score, homogeneity_score +from sklearn.preprocessing import normalize + +from ...hyp_defs import float_cpu +from ...utils import PathLike +from ..np_model import NPModel +from .kmeans import KMeans, KMeansInitMethod + + +class LaplacianType(str, Enum): + unnormalized = "unnormalized" + norm_sym = "norm_sym" + norm_rw = "norm_rw" + + @staticmethod + def choices(): + return [ + LaplacianType.unnormalized, + LaplacianType.norm_sym, + LaplacianType.norm_rw, + ] + + +class SpectralClusteringNumClassCriterion(str, Enum): + max_eigengap = "max_eigengap" + max_d_eig_vals = "max_d_eig_vals" + thr_eigengap = "thr_eigengap" + thr_d_eig_vals = "thr_d_eig_vals" + + @staticmethod + def choices(): + return [ + SpectralClusteringNumClassCriterion.max_eigengap, + SpectralClusteringNumClassCriterion.max_d_eig_vals, + SpectralClusteringNumClassCriterion.thr_eigengap, + SpectralClusteringNumClassCriterion.thr_d_eig_vals, + ] + + +class SpectralClustering(NPModel): + """Spectral Clustering class""" + + def __init__( + self, + laplacian: str = "norm_sym", + num_clusters: Optional[int] = None, + max_num_clusters: Optional[int] = None, + criterion: SpectralClusteringNumClassCriterion = SpectralClusteringNumClassCriterion.max_eigengap, + thr_eigengap: float = 1e-3, + kmeans_epochs: int = 100, + kmeans_init_method: KMeansInitMethod = KMeansInitMethod.max_dist, + num_workers: int = 1, + ): + self.laplacian = laplacian + self.num_clusters = num_clusters + self.max_num_clusters = max_num_clusters + self.criterion = criterion + self.kmeans_epochs = kmeans_epochs + self.thr_eigengap = thr_eigengap + self.kmeans_init_method = kmeans_init_method + self.num_workers = num_workers + + def spectral_embedding(self, x: np.ndarray): + num_nodes = x.shape[0] + if not sparse.issparse(x): + x.flat[:: num_nodes + 1] = 0 + r = num_nodes**2 / np.sum(x > 0) + if r > 4: + x = sparse.csr_matrix(x) + + D = None + if self.laplacian in LaplacianType.unnormalized: + L = csgraph_laplacian(x, normed=False) + elif self.laplacian == LaplacianType.norm_sym: + L = csgraph_laplacian(x, normed=True) + elif self.laplacian == LaplacianType.norm_rw: + L, dd = csgraph_laplacian(x, 
normed=False, return_diag=True) + if sparse.issparse(L): + D = sparse.diags(dd) + else: + D = np.diag(dd) + + max_num_clusters = num_nodes - 1 + if self.max_num_clusters is not None: + max_num_clusters = min(max_num_clusters, self.max_num_clusters) + if self.num_clusters is not None: + max_num_clusters = min(max_num_clusters, self.num_clusters) + + eig_vals, eig_vecs = eigsh(L, k=max_num_clusters, M=D, which="SM") + eig_vals = eig_vals[1:] + eig_vecs = eig_vecs[:, 1:] + return eig_vals, eig_vecs + + def spectral_embedding_0(self, x: np.ndarray): + num_nodes = x.shape[0] + x.flat[:: num_nodes + 1] = 0 + d = np.sum(x, axis=1) + D = None + if self.laplacian in LaplacianType.unnormalized: + L = np.diag(d) - x + elif self.laplacian == LaplacianType.norm_sym: + idsqrt = 1 / np.sqrt(d) + L = np.eye(num_nodes) - idsqrt[:, None] * x * idsqrt + elif self.laplacian == LaplacianType.norm_rw: + D = np.diag(d) + L = D - x + + max_num_clusters = num_nodes + if self.max_num_clusters is not None: + max_num_clusters = min(max_num_clusters, self.max_num_clusters) + if self.num_clusters is not None: + max_num_clusters = min(max_num_clusters, self.num_clusters) + + eig_vals, eig_vecs = eigh( + L, b=D, overwrite_a=True, subset_by_index=[1, max_num_clusters - 1] + ) + + return eig_vals, eig_vecs + + def compute_eigengap(self, eig_vals: np.ndarray): + eig_vals = np.concatenate(([0.0], eig_vals)) + eigengap = np.diff(np.concatenate(([0.0], eig_vals))) + filter = np.array([1 / 60, -3 / 20, 3 / 4, 0.0, -3 / 4, 3 / 20, -1 / 60]) + eig_vals_ext = np.concatenate((eig_vals, [eig_vals[-1]] * 3)) + d_eig_vals = np.convolve(eig_vals, filter)[3:-6] + k_max = np.argmax(eigengap) + gap_max = eigengap[k_max] + # k_relmax = [] + # gap_relmax = [] + # gap_norm_relmax = [] + # for k in range(len(eigengap)): + # if k == 0 and eigengap[k] > eigengap[k + 1]: + # k_relmax.append(k) + # gap_relmax.append(eigengap[k]) + # gap_norm_relmax.append(eigengap[k] / eigengap[k + 1]) + # elif k == len(eigengap) - 1 and eigengap[k] > eigengap[k - 1]: + # k_relmax.append(k) + # gap_relmax.append(eigengap[k]) + # gap_norm_relmax.append(eigengap[k] / eigengap[k - 1]) + # elif eigengap[k] > eigengap[k - 1] and eigengap[k] > eigengap[k + 1]: + # k_relmax.append(k) + # gap_relmax.append(eigengap[k]) + # gap_norm_relmax.append( + # 2 * eigengap[k] / (eigengap[k - 1] + eigengap[k + 1]) + # ) + + # idx = np.argmax(gap_norm_relmax) + # gap_norm_relmax_max = gap_norm_relmax[idx] + # k_relmax_max = k_relmax[idx] + eigengap_stats = { + "eig_vals": eig_vals, + "eigengap": eigengap, + "gap_max": gap_max, + "k_max": k_max, + # "gap_relmax": gap_relmax, + # "k_relmax": k_relmax, + # "gap_norm_relmax": gap_norm_relmax, + # "gap_norm_relmax_max": gap_norm_relmax_max, + # "k_relmax_max": k_relmax_max, + "d_eig_vals": d_eig_vals, + } + return eigengap_stats + + def predict_num_clusters(self, eigengap_stats: np.ndarray): + if self.num_clusters is not None: + num_clusters = self.num_clusters + + elif self.criterion == SpectralClusteringNumClassCriterion.max_eigengap: + num_clusters = eigengap_stats["k_max"] + 1 + elif self.criterion == SpectralClusteringNumClassCriterion.max_d_eig_vals: + num_clusters = np.argmax(eigengap_stats["d_eig_vals"]) + 1 + elif self.criterion == SpectralClusteringNumClassCriterion.thr_eigengap: + nz = (eigengap_stats["eigengap"] < self.thr_eigengap).nonzero()[0] + num_clusters = nz[nz > eigengap_stats["k_max"]][0] + 1 + elif self.criterion == SpectralClusteringNumClassCriterion.thr_d_eig_vals: + nz = (eigengap_stats["d_eig_vals"] < 
self.thr_eigengap).nonzero()[0] + num_clusters = nz[nz > eigengap_stats["k_max"]][0] + 1 + else: + raise ValueError(f"invalid num clusters criterion {self.criterion}") + return num_clusters + + def normalize_eigvecs(self, eig_vecs: np.ndarray): + if self.laplacian == LaplacianType.norm_sym: + return normalize(eig_vecs, axis=1) + else: + return eig_vecs + + def do_kmeans(self, x: np.ndarray, num_clusters: Optional[int] = None): + if num_clusters is None: + num_clusters = x.shape[1] + 1 + kmeans = KMeans( + num_clusters=num_clusters, + epochs=self.kmeans_epochs, + init_method=self.kmeans_init_method, + num_workers=self.num_workers, + ) + kmeans.fit(x) + y, _ = kmeans(x) + return y + + def fit(self, x: np.ndarray): + logging.info("compute spectral embeddings") + + eig_vals, eig_vecs = self.spectral_embedding(x) + if self.num_clusters is None: + logging.info("compute eigengap stats") + eigengap_stats = self.compute_eigengap(eig_vals) + else: + eigengap_stats = None + + logging.info("predicting number of clusters") + num_clusters = self.predict_num_clusters(eigengap_stats) + logging.info("predicted num_clusters=%d", num_clusters) + if num_clusters == 1: + return np.zeros((x.shape[0]), dtype=int), num_clusters, eigengap_stats + # minus one because we already removed the first eig vector + logging.info("normalizing embeddings") + eig_vecs = eig_vecs[:, : num_clusters - 1] + eig_vecs = self.normalize_eigvecs(eig_vecs) + logging.info("running k-means") + y = self.do_kmeans(eig_vecs, num_clusters) + return y, num_clusters, eigengap_stats + + def plot_eigengap_stats( + self, + eigengap_stats: Dict[str, Any], + num_clusters: int, + fig_file: Optional[PathLike] = None, + ): + fig, (ax0, ax1, ax2) = plt.subplots( + nrows=1, ncols=3, sharex=True, figsize=(12, 6) + ) + eig_vals = eigengap_stats["eig_vals"] + ax0.plot(np.arange(1, len(eig_vals) + 1), eig_vals, "b") + ax0.vlines( + num_clusters, ymin=np.min(eig_vals), ymax=np.max(eig_vals), colors="r" + ) + ax0.grid() + ax0.set_title("eigen_vals") + eigengap = eigengap_stats["eigengap"] + ax1.plot(np.arange(1, len(eigengap) + 1), eigengap, "b") + ax1.vlines( + num_clusters, ymin=np.min(eigengap), ymax=np.max(eigengap), colors="r" + ) + ax1.grid() + ax1.set_title("eigengap") + d_eig_vals = eigengap_stats["d_eig_vals"] + ax2.plot(np.arange(1, len(d_eig_vals) + 1), d_eig_vals, "b") + ax2.vlines( + num_clusters, ymin=np.min(d_eig_vals), ymax=np.max(d_eig_vals), colors="r" + ) + ax2.grid() + ax2.set_title("d_eigen_val") + if fig_file is not None: + fig.savefig(fig_file) + + @staticmethod + def add_class_args(parser, prefix=None): + """It adds the arguments corresponding to the class to jsonarparse. + Args: + parser: jsonargparse object + prefix: argument prefix. 
+ """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--laplacian", + default=LaplacianType.norm_sym, + choices=LaplacianType.choices(), + ) + parser.add_argument("--num-clusters", default=None, type=int) + parser.add_argument("--max-num-clusters", default=None, type=int) + parser.add_argument( + "--criterion", + default=SpectralClusteringNumClassCriterion.max_eigengap, + choices=SpectralClusteringNumClassCriterion.choices(), + ) + parser.add_argument("--thr-eigengap", default=1e-3, type=float) + parser.add_argument("--kmeans-epochs", default=100, type=int) + parser.add_argument( + "--kmeans-init-method", + default=KMeansInitMethod.max_dist, + choices=KMeansInitMethod.choices(), + ) + parser.add_argument("--num-workers", default=1, type=int) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) diff --git a/hyperion/diarization/__init__.py b/hyperion/np/diarization/__init__.py similarity index 100% rename from hyperion/diarization/__init__.py rename to hyperion/np/diarization/__init__.py diff --git a/hyperion/diarization/diar_ahc_plda.py b/hyperion/np/diarization/diar_ahc_plda.py similarity index 98% rename from hyperion/diarization/diar_ahc_plda.py rename to hyperion/np/diarization/diar_ahc_plda.py index b8fb0fa6..7bffa633 100644 --- a/hyperion/diarization/diar_ahc_plda.py +++ b/hyperion/np/diarization/diar_ahc_plda.py @@ -5,14 +5,13 @@ import logging from pathlib import Path -import numpy as np import h5py import matplotlib +import numpy as np matplotlib.use("Agg") import matplotlib.pyplot as plt - from ..clustering import AHC from ..pdfs import GMMTiedDiagCov as GMM from ..transforms import PCA, LNorm @@ -67,7 +66,7 @@ def _plot_score_hist(scores, output_file, thr=None, gmm=None): output_dir = Path(output_file).parent output_dir.mkdir(parents=True, exist_ok=True) - mask = np.triu(np.ones(scores.shape, dtype=np.bool), 1) + mask = np.triu(np.ones(scores.shape, dtype=bool), 1) scores_r = scores[mask].ravel() _, bins, _ = plt.hist( @@ -97,7 +96,7 @@ def _plot_score_hist(scores, output_file, thr=None, gmm=None): @staticmethod def _unsup_gmm_calibration(scores): """Performs unsupervised calibration on the scores by training a GMM.""" - mask = np.triu(np.ones(scores.shape, dtype=np.bool), 1) + mask = np.triu(np.ones(scores.shape, dtype=bool), 1) scores_r = scores[mask].ravel()[:, None] # N x 1 gmm_1c = GMM(num_comp=1) gmm_1c.fit(scores_r, epochs=1) diff --git a/hyperion/np/feats/__init__.py b/hyperion/np/feats/__init__.py new file mode 100644 index 00000000..5173bf4b --- /dev/null +++ b/hyperion/np/feats/__init__.py @@ -0,0 +1,13 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +# + +from .energy_vad import EnergyVAD +from .feature_normalization import MeanVarianceNorm +from .feature_windows import FeatureWindowFactory +from .filter_banks import FilterBankFactory +from .frame_selector import FrameSelector +from .mfcc import MFCC +from .stft import * diff --git a/hyperion/feats/energy_vad.py b/hyperion/np/feats/energy_vad.py similarity index 89% rename from hyperion/feats/energy_vad.py rename to hyperion/np/feats/energy_vad.py index 7e17acfc..1d578c68 100644 --- a/hyperion/feats/energy_vad.py +++ b/hyperion/np/feats/energy_vad.py @@ -5,10 +5,11 @@ import logging import numpy as np +from jsonargparse import ActionParser, ArgumentParser from scipy.signal import lfilter -from ..hyp_defs 
import float_cpu
-from ..utils.misc import str2bool
+from ...hyp_defs import float_cpu
+from ...utils.misc import str2bool
 from .stft import st_logE
@@ -19,7 +20,7 @@ class EnergyVAD(object):
     sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (default = 16000)
     frame_length: Frame length in milliseconds (default = 25)
     frame_shift: Frame shift in milliseconds (default = 10)
-    dither: Dithering constant (0.0 means no dither) (default = 1)
+    dither: Dithering constant (0.0 means no dither) (default = 2^(-15))
     snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (default = True)
     vad_energy_mean_scale: If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s*m + vad-energy-threshold (float, default = 0.5)
     vad_energy_threshold: Constant term in energy threshold for MFCC0 for VAD (also see --vad-energy-mean-scale) (float, default = 5)
@@ -32,7 +33,7 @@ def __init__(
         sample_frequency=16000,
         frame_length=25,
         frame_shift=10,
-        dither=1,
+        dither=1 / 2 ** 15,
         snip_edges=True,
         vad_energy_mean_scale=0.5,
         vad_energy_threshold=5,
@@ -97,7 +98,7 @@ def compute(self, x, return_loge=False):
         # add dither
         if self.dither > 0:
-            n = self.dither * np.random.RandomState(seed=len(x)).randn(
+            n = self.dither * np.random.default_rng(seed=len(x)).standard_normal(
                 len(x)
             ).astype(float_cpu(), copy=False)
             x = x + n
@@ -174,14 +175,12 @@ def add_class_args(parser, prefix=None):
           parser: Arguments parser
           prefix: Options prefix.
         """
-
-        if prefix is None:
-            p1 = "--"
-        else:
-            p1 = "--" + prefix + "."
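Editor's note on the pattern introduced by this hunk: the removed code built option names by string concatenation, while the new code (next) defines the options on a fresh inner `ArgumentParser` and mounts it on the outer parser under `--<prefix>` via jsonargparse's `ActionParser`. A minimal sketch of how the nested options resolve, assuming only that jsonargparse is installed (the `vad` prefix and option are illustrative, not from the diff):

```python
# Illustrative sketch of the nested-parser pattern used throughout this diff.
from jsonargparse import ActionParser, ArgumentParser

inner = ArgumentParser(prog="")
inner.add_argument("--sample-frequency", default=16000, type=int)

outer = ArgumentParser()
# Mount the inner parser's options under the "vad" prefix.
outer.add_argument("--vad", action=ActionParser(parser=inner))

args = outer.parse_args(["--vad.sample-frequency", "8000"])
print(args.vad.sample_frequency)  # -> 8000
```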
+ if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "sample-frequency", + "--sample-frequency", default=16000, type=int, help=( @@ -191,24 +190,21 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "frame-length", - type=int, - default=25, - help="Frame length in milliseconds", + "--frame-length", type=int, default=25, help="Frame length in milliseconds", ) parser.add_argument( - p1 + "frame-shift", type=int, default=10, help="Frame shift in milliseconds" + "--frame-shift", type=int, default=10, help="Frame shift in milliseconds" ) parser.add_argument( - p1 + "dither", + "--dither", type=float, - default=1, + default=1 / 2 ** 15, help="Dithering constant (0.0 means no dither)", ) parser.add_argument( - p1 + "snip-edges", + "--snip-edges", default=True, type=str2bool, help=( @@ -221,7 +217,7 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "vad-energy-mean-scale", + "--vad-energy-mean-scale", type=float, default=0.5, help=( @@ -231,13 +227,13 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "vad-energy-threshold", + "--vad-energy-threshold", type=float, default=5, help="Constant term in energy threshold for MFCC0 for VAD", ) parser.add_argument( - p1 + "vad-frames-context", + "--vad-frames-context", type=int, default=0, help=( @@ -246,7 +242,7 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "vad-proportion-threshold", + "--vad-proportion-threshold", type=float, default=0.6, help=( @@ -254,5 +250,7 @@ def add_class_args(parser, prefix=None): "the window that need to have more energy than the threshold" ), ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) add_argparse_args = add_class_args diff --git a/hyperion/feats/feature_normalization.py b/hyperion/np/feats/feature_normalization.py similarity index 98% rename from hyperion/feats/feature_normalization.py rename to hyperion/np/feats/feature_normalization.py index 5a2347e6..27683739 100644 --- a/hyperion/feats/feature_normalization.py +++ b/hyperion/np/feats/feature_normalization.py @@ -4,10 +4,10 @@ """ import numpy as np -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionParser, ArgumentParser from scipy.signal import convolve2d -from ..hyp_defs import float_cpu +from ...hyp_defs import float_cpu class MeanVarianceNorm(object): diff --git a/hyperion/feats/feature_windows.py b/hyperion/np/feats/feature_windows.py similarity index 95% rename from hyperion/feats/feature_windows.py rename to hyperion/np/feats/feature_windows.py index c4238054..ef8fe7b4 100644 --- a/hyperion/feats/feature_windows.py +++ b/hyperion/np/feats/feature_windows.py @@ -6,9 +6,9 @@ import logging import numpy as np -from scipy.signal import blackman, hamming, hann +from scipy.signal.windows import blackman, hamming, hann -from ..hyp_defs import float_cpu +from ...hyp_defs import float_cpu class FeatureWindowFactory(object): diff --git a/hyperion/feats/filter_banks.py b/hyperion/np/feats/filter_banks.py similarity index 98% rename from hyperion/feats/filter_banks.py rename to hyperion/np/feats/filter_banks.py index b92535da..0e0eaf84 100644 --- a/hyperion/feats/filter_banks.py +++ b/hyperion/np/feats/filter_banks.py @@ -3,14 +3,13 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser import logging import numpy as np +from jsonargparse import 
ActionParser, ArgumentParser
 from librosa.filters import mel as make_mel_librosa
-from ..hyp_defs import float_cpu
-from ..utils.misc import str2bool
+from ...hyp_defs import float_cpu
 class FilterBankFactory(object):
diff --git a/hyperion/feats/frame_selector.py b/hyperion/np/feats/frame_selector.py
similarity index 100%
rename from hyperion/feats/frame_selector.py
rename to hyperion/np/feats/frame_selector.py
diff --git a/hyperion/feats/mfcc.py b/hyperion/np/feats/mfcc.py
similarity index 92%
rename from hyperion/feats/mfcc.py
rename to hyperion/np/feats/mfcc.py
index 94af5c2e..b56728b8 100644
--- a/hyperion/feats/mfcc.py
+++ b/hyperion/np/feats/mfcc.py
@@ -6,14 +6,15 @@ from enum import Enum
 import numpy as np
+from jsonargparse import ActionParser, ArgumentParser
 from scipy.fftpack import dct
 from scipy.signal import lfilter
-from ..hyp_defs import float_cpu
-from ..utils.misc import str2bool
+from ...hyp_defs import float_cpu
+from ...utils.misc import str2bool
 from .feature_windows import FeatureWindowFactory as FWF
 from .filter_banks import FilterBankFactory as FBF
-from .stft import strft, st_logE
+from .stft import st_logE, strft
 class MFCCSteps(Enum):
@@ -64,7 +65,7 @@ class MFCC(object):
     """Compute MFCC features.
     Attributes:
-      sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (default = 16000)
+      sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (default = 16000)
       frame_length: Frame length in milliseconds (default = 25)
       frame_shift: Frame shift in milliseconds (default = 10)
       fft_length: Length of FFT (default = 512)
@@ -72,7 +73,7 @@ preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97)
       window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"blackmann") (default = 'povey')
       use_fft2: If true, it uses |X(f)|^2, if false, it uses |X(f)|, (default = True)
-      dither: Dithering constant (0.0 means no dither) (default = 1)
+      dither: Dithering constant (0.0 means no dither) (default = 1/2**15)
       fb_type: Filter-bank type: mel_kaldi, mel_etsi, mel_librosa, mel_librosa_htk, linear (default = 'mel_kaldi')
       low_freq: Low cutoff frequency for mel bins (default = 20)
       high_freq: High cutoff frequency for mel bins (if < 0, offset from Nyquist) (default = 0)
@@ -98,7 +99,7 @@ def __init__(
         sample_frequency=16000,
         frame_length=25,
         frame_shift=10,
         fft_length=512,
         remove_dc_offset=True,
         preemphasis_coeff=0.97,
         window_type="povey",
         use_fft2=True,
-        dither=1,
+        dither=1 / 2 ** 15,
         fb_type="mel_kaldi",
         low_freq=20,
         high_freq=0,
@@ -256,7 +257,7 @@ def compute(self, x, return_fft=False, return_spec=False, return_logfb=False):
         # add dither
         if self.dither > 0:
-            n = self.dither * np.random.RandomState(seed=len(x)).randn(
+            n = self.dither * np.random.default_rng(seed=len(x)).standard_normal(
                 len(x)
             ).astype(float_cpu(), copy=False)
             x = x + n
@@ -400,14 +401,12 @@ def add_class_args(parser, prefix=None):
           parser: Arguments parser
           prefix: Options prefix.
         """
-
-        if prefix is None:
-            p1 = "--"
-        else:
-            p1 = "--" + prefix + "."
+ if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "sample-frequency", + "--sample-frequency", default=16000, type=int, help="Waveform data sample frequency " @@ -415,27 +414,22 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "frame-length", - type=int, - default=25, - help="Frame length in milliseconds", - ) - parser.add_argument( - p1 + "frame-shift", type=int, default=10, help="Frame shift in milliseconds" + "--frame-length", type=int, default=25, help="Frame length in milliseconds", ) parser.add_argument( - p1 + "fft-length", type=int, default=512, help="Length of FFT" + "--frame-shift", type=int, default=10, help="Frame shift in milliseconds" ) + parser.add_argument("--fft-length", type=int, default=512, help="Length of FFT") parser.add_argument( - p1 + "remove-dc-offset", + "--remove-dc-offset", default=True, type=str2bool, help="Subtract mean from waveform on each frame", ) parser.add_argument( - p1 + "preemphasis-coeff", + "--preemphasis-coeff", type=float, default=0.97, help="Coefficient for use in signal preemphasis", @@ -444,30 +438,30 @@ def add_class_args(parser, prefix=None): FWF.add_class_args(parser, prefix) parser.add_argument( - p1 + "use-fft2", + "--use-fft2", default=True, type=str2bool, help="If true, it uses |X(f)|^2, if false, it uses |X(f)|", ) parser.add_argument( - p1 + "dither", + "--dither", type=float, - default=1, + default=1 / 2 ** 15, help="Dithering constant (0.0 means no dither)", ) FBF.add_class_args(parser, prefix) parser.add_argument( - p1 + "num-ceps", + "--num-ceps", type=int, default=13, help="Number of cepstra in MFCC computation (including C0)", ) parser.add_argument( - p1 + "snip-edges", + "--snip-edges", default=True, type=str2bool, help=( @@ -480,34 +474,34 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "energy-floor", + "--energy-floor", type=float, default=0, help="Floor on energy (absolute, not relative) in MFCC computation", ) parser.add_argument( - p1 + "raw-energy", + "--raw-energy", default=True, type=str2bool, help="If true, compute energy before preemphasis and windowing", ) parser.add_argument( - p1 + "use-energy", + "--use-energy", default=True, type=str2bool, help="Use energy (not C0) in MFCC computation", ) parser.add_argument( - p1 + "cepstral-lifter", + "--cepstral-lifter", type=float, default=22, help="Constant that controls scaling of MFCCs", ) parser.add_argument( - p1 + "input-step", + "--input-step", default="wave", choices=["wave", "fft", "spec", "log_spec", "logfb"], help=( @@ -516,7 +510,7 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "output-step", + "--output-step", default="mfcc", choices=["fft", "spec", "log_spec", "logfb", "mfcc"], help=( @@ -524,4 +518,7 @@ def add_class_args(parser, prefix=None): ), ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + add_argparse_args = add_class_args diff --git a/hyperion/feats/stft.py b/hyperion/np/feats/stft.py similarity index 99% rename from hyperion/feats/stft.py rename to hyperion/np/feats/stft.py index 34f22b16..1c87b2c2 100644 --- a/hyperion/feats/stft.py +++ b/hyperion/np/feats/stft.py @@ -7,7 +7,7 @@ import numpy as np -from ..hyp_defs import float_cpu +from ...hyp_defs import float_cpu def stft(x, frame_length, frame_shift, fft_length, window=None): diff --git a/hyperion/np/metrics/__init__.py b/hyperion/np/metrics/__init__.py new file mode 100644 index 00000000..d45daba5 --- 
/dev/null
+++ b/hyperion/np/metrics/__init__.py
@@ -0,0 +1,14 @@
+"""
+ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+from .acc import compute_accuracy
+from .confusion_matrix import *
+from .dcf import compute_act_dcf, compute_dcf, compute_min_dcf, fast_eval_dcf_eer
+from .eer import compute_eer, compute_prbep
+from .utils import effective_prior
+from .verification_evaluator import (
+    VerificationEvaluator,
+    VerificationAdvAttackEvaluator,
+)
diff --git a/hyperion/metrics/acc.py b/hyperion/np/metrics/acc.py
similarity index 88%
rename from hyperion/metrics/acc.py
rename to hyperion/np/metrics/acc.py
index daea183e..148981f5 100644
--- a/hyperion/metrics/acc.py
+++ b/hyperion/np/metrics/acc.py
@@ -22,4 +22,6 @@ def compute_accuracy(y_true, y_pred, normalize=True, sample_weight=None):
     Returns:
       Accuracy or number of correctly classified samples.
     """
-    return accuracy_score(y_true, y_pred, normalize, sample_weight)
+    return accuracy_score(
+        y_true, y_pred, normalize=normalize, sample_weight=sample_weight
+    )
diff --git a/hyperion/metrics/cllr.py b/hyperion/np/metrics/cllr.py
similarity index 95%
rename from hyperion/metrics/cllr.py
rename to hyperion/np/metrics/cllr.py
index ec816286..cd97a97c 100644
--- a/hyperion/metrics/cllr.py
+++ b/hyperion/np/metrics/cllr.py
@@ -5,7 +5,7 @@
 import numpy as np
-from ..utils.math import neglogsigmoid
+from ...utils.math_funcs import neglogsigmoid
 from .utils import opt_loglr
diff --git a/hyperion/metrics/confidence.py b/hyperion/np/metrics/confidence.py
similarity index 100%
rename from hyperion/metrics/confidence.py
rename to hyperion/np/metrics/confidence.py
diff --git a/hyperion/metrics/confusion_matrix.py b/hyperion/np/metrics/confusion_matrix.py
similarity index 99%
rename from hyperion/metrics/confusion_matrix.py
rename to hyperion/np/metrics/confusion_matrix.py
index 2efdd9e4..57f8f1ab 100644
--- a/hyperion/metrics/confusion_matrix.py
+++ b/hyperion/np/metrics/confusion_matrix.py
@@ -4,11 +4,12 @@
 import sys
-import numpy as np
+
 import matplotlib.pyplot as plt
+import numpy as np
 from sklearn.metrics import confusion_matrix
-from ..utils.list_utils import list2ndarray
+from ...utils.list_utils import list2ndarray
 def compute_confusion_matrix(
diff --git a/hyperion/metrics/dcf.py b/hyperion/np/metrics/dcf.py
similarity index 100%
rename from hyperion/metrics/dcf.py
rename to hyperion/np/metrics/dcf.py
diff --git a/hyperion/metrics/dcf_plot.py b/hyperion/np/metrics/dcf_plot.py
similarity index 100%
rename from hyperion/metrics/dcf_plot.py
rename to hyperion/np/metrics/dcf_plot.py
diff --git a/hyperion/metrics/det_plot.py b/hyperion/np/metrics/det_plot.py
similarity index 100%
rename from hyperion/metrics/det_plot.py
rename to hyperion/np/metrics/det_plot.py
diff --git a/hyperion/metrics/eer.py b/hyperion/np/metrics/eer.py
similarity index 100%
rename from hyperion/metrics/eer.py
rename to hyperion/np/metrics/eer.py
diff --git a/hyperion/metrics/roc.py b/hyperion/np/metrics/roc.py
similarity index 100%
rename from hyperion/metrics/roc.py
rename to hyperion/np/metrics/roc.py
index 38e4fa3c..f8df8d10 100644
--- a/hyperion/metrics/roc.py
+++ b/hyperion/np/metrics/roc.py
@@ -3,9 +3,9 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
+import matplotlib.pyplot as plt
 import numpy as np
 import scipy.linalg as sla
-import matplotlib.pyplot as plt
 from .utils import pavx
diff --git a/hyperion/np/metrics/utils.py b/hyperion/np/metrics/utils.py
new file mode
100644
index 00000000..e638fd1b
--- /dev/null
+++ b/hyperion/np/metrics/utils.py
@@ -0,0 +1,259 @@
+"""
+ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+ Utility functions to evaluate performance
+"""
+
+import numpy as np
+
+from ...hyp_defs import float_cpu
+from ...utils.math_funcs import logsumexp, softmax
+
+
+def effective_prior(p_tar, c_miss, c_fa):
+    """This function adjusts a given prior probability of target p_tar,
+    to incorporate the effects of a cost of miss, c_miss, and a cost of false-alarm, c_fa.
+
+    Args:
+      p_tar: target prior
+      c_miss: cost of miss
+      c_fa: cost of false alarm
+    Returns:
+      Effective prior
+
+    """
+    beta = p_tar * c_miss / (1 - p_tar) / c_fa
+    p_eff = beta / (1 + beta)
+    return p_eff
+
+
+def lre_priors(num_classes, p_tar, p_oos=0.0):
+    """Returns all prior distributions as needed for the LRE language detection task.
+
+    Args:
+      num_classes: number of target classes.
+      p_tar: target prior.
+      p_oos: prior of out-of-set hypothesis.
+
+    Returns:
+      Matrix of priors P with shape (num_classes, num_classes) or (num_classes, num_classes+1) if p_oos > 0, where P(i,:) are the priors for the case that class i is the target class.
+    """
+    I = np.eye(num_classes)
+    ones = np.ones((num_classes, num_classes))
+    priors = (1 - p_tar - p_oos) * (ones - I) / (num_classes - 1) + p_tar * I
+    if p_oos > 0:
+        priors_oos = p_oos * np.ones((num_classes, 1))
+        priors = np.concatenate((priors, priors_oos), axis=-1)
+
+    return priors
+
+
+def loglk2llr(loglk, priors, target_idx=None):
+    """Converts log-likelihoods to detection log-likelihood ratios.
+
+    Args:
+      loglk: log-likelihood matrix P(x_t | class_i) with shape = (num_samples, num_classes)
+      priors: vector of prior probabilities, positive, sum up to one.
+      target_idx: index of the target class, the other classes are assumed to be non-target classes,
+                  it can be also a list of indexes to consider multiple target classes.
+                  if None, it returns matrix with LLR w.r.t. all classes.
+
+    Returns:
+      Matrix of log-likelihood ratios LLR = log [ P(x_t | class_i) / P(x_t | non-class_i) ] with
+      shape (num_samples, num_target_classes); if target_idx is None, num_target_classes=num_classes
+
+    """
+
+    num_classes = loglk.shape[1]
+    assert num_classes == len(priors), "wrong prior length"
+    assert np.all(priors >= 0), "negative priors present"
+    assert np.abs(np.log(np.sum(priors))) < 1e-3, "priors do not sum up to one"
+    if target_idx is None:
+        target_idx = np.arange(num_classes)
+    elif isinstance(target_idx, int):
+        assert 0 <= target_idx < num_classes
+        target_idx = [target_idx]
+
+    num_target_classes = len(target_idx)
+    llr = np.zeros((loglk.shape[0], num_target_classes), dtype=loglk.dtype)
+    for i, target in enumerate(target_idx):
+        # renormalize the priors over the non-target classes
+        priors_i = np.copy(priors)
+        priors_i[target] = 0
+        priors_i /= np.sum(priors_i)
+        non_idx = np.concatenate(
+            (np.arange(target), np.arange(target + 1, num_classes))
+        )
+        llr[:, i] = loglk[:, target] - logsumexp(
+            loglk[:, non_idx] + np.log(priors_i[non_idx]), axis=-1
+        )
+
+    return llr
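A short usage sketch of the helpers above (values illustrative; `lre_priors` and `loglk2llr` are imported from the module directly, only `effective_prior` is re-exported in `hyperion.np.metrics` by this diff):

```python
# Illustrative: collapse costs into an effective prior and compute detection LLRs.
import numpy as np
from hyperion.np.metrics.utils import effective_prior, loglk2llr, lre_priors

p_eff = effective_prior(0.01, c_miss=10, c_fa=1)   # beta / (1 + beta)
priors = lre_priors(3, p_tar=p_eff)                # one row per target language

loglk = np.log(np.full((4, 3), 1 / 3))             # uniform toy log-likelihoods
llr0 = loglk2llr(loglk, priors[0], target_idx=0)   # ~0 for uniform inputs
```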
+
+
+def loglk2posterior(loglk, priors):
+    """Converts log-likelihoods to posteriors.
+
+    Args:
+      loglk: log-likelihood matrix P(x_t | class_i) with shape = (num_samples, num_classes)
+      priors: vector of prior probabilities, positive, sum up to one.
+
+    Returns:
+      Matrix of posteriors with shape = (num_samples, num_classes)
+
+    """
+
+    num_classes = loglk.shape[1]
+    assert num_classes == len(priors), "wrong prior length"
+    assert np.all(priors >= 0), "negative priors present"
+    assert np.abs(np.log(np.sum(priors))) < 1e-3, "priors do not sum up to one"
+
+    log_post = loglk + np.log(priors)
+    return softmax(log_post, axis=-1)
+
+
+def lre_loglk2llr(loglk, p_tar, p_oos=0):
+    """Converts log-likelihoods to detection log-likelihood ratios suitable for LRE.
+
+    Args:
+      loglk: log-likelihood matrix P(x_t | class_i) with shape = (num_samples, num_classes)
+      p_tar: prior prob that each language is the target language.
+      p_oos: prior prob that test language is out-of-set.
+
+    Returns:
+      Matrix of log-likelihood ratios LLR = log [ P(x_t | class_i) / P(x_t | non-class_i) ] with
+      shape (num_samples, num_target_classes).
+
+    """
+
+    num_tar_classes = loglk.shape[-1]
+    if p_oos > 0:
+        # the last column of loglk corresponds to the out-of-set hypothesis
+        num_tar_classes -= 1
+    priors = lre_priors(num_tar_classes, p_tar, p_oos)
+    llr = np.zeros((loglk.shape[0], num_tar_classes), dtype=loglk.dtype)
+    for i in range(num_tar_classes):
+        llr[:, i] = loglk2llr(loglk, priors[i], target_idx=i)[:, 0]
+
+    return llr
+
+
+def pavx(y):
+    """PAV: Pool Adjacent Violators algorithm. Non-parametric optimization subject to monotonicity.
+
+    ghat = pav(y) fits a vector ghat with nondecreasing components to the
+    data vector y such that sum((y - ghat)**2) is minimal.
+    (Pool-adjacent-violators algorithm).
+
+    Author: This code is an adaptation from the Bosaris Toolkit and
+    it is a simplified version of the 'IsoMeans.m' code made available
+    by Lutz Duembgen at:
+    http://www.imsv.unibe.ch/~duembgen/software
+
+    Args:
+      y: uncalibrated scores
+
+    Returns:
+      Calibrated scores
+      Width of pav bins, from left to right (the number of bins is data dependent)
+      Height: corresponding heights of bins (in increasing order)
+
+    """
+    assert isinstance(y, np.ndarray)
+
+    n = len(y)
+    assert n > 0
+    index = np.zeros(y.shape, dtype=int)
+    l = np.zeros(y.shape, dtype=int)
+    # An interval of indices is represented by its left endpoint
+    # ("index") and its length "len"
+    ghat = np.zeros_like(y)
+
+    ci = 0
+    index[ci] = 0
+    l[ci] = 1
+    ghat[ci] = y[0]
+    # ci is the number of the interval considered currently.
+    # ghat[ci] is the mean of y-values within this interval.
+    for j in range(1, n):
+        # a new index interval, {j}, is created:
+        ci = ci + 1
+        index[ci] = j
+        l[ci] = 1
+        ghat[ci] = y[j]
+        # while ci >= 1 and ghat[np.maximum(ci-1,0)] >= ghat[ci]:
+        while ci >= 1 and ghat[ci - 1] >= ghat[ci]:
+            # "pool adjacent violators":
+            nw = l[ci - 1] + l[ci]
+            ghat[ci - 1] = ghat[ci - 1] + (l[ci] / nw) * (ghat[ci] - ghat[ci - 1])
+            l[ci - 1] = nw
+            ci = ci - 1
+
+    height = np.copy(ghat[: ci + 1])
+    width = l[: ci + 1]
+
+    # Now define ghat for all indices:
+    while n >= 1:
+        for j in range(index[ci], n):
+            ghat[j] = ghat[ci]
+
+        n = index[ci]
+        ci = ci - 1
+
+    return ghat, width, height
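A quick sanity check of `pavx` on a toy vector (worked by hand, not from the source):

```python
# pavx pools adjacent violators of monotonicity into their means.
import numpy as np

y = np.array([3.0, 1.0, 2.0, 5.0, 4.0])
ghat, width, height = pavx(y)
# ghat   -> [2.0, 2.0, 2.0, 4.5, 4.5]   nondecreasing least-squares fit
# width  -> [3, 2]                      bin sizes, left to right
# height -> [2.0, 4.5]                  bin means, increasing
```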
+
+
+def opt_loglr(tar, non, method="laplace"):
+    """Non-parametric optimization of score to log-likelihood-ratio mapping.
+
+    Taken from the Bosaris toolkit.
+    Niko Brummer and Johan du Preez, Application-Independent Evaluation of Speaker Detection, Computer Speech and Language, 2005
+
+    Args:
+      tar: target scores.
+      non: non-target scores.
+      method: laplace (default, avoids inf log-LR) / raw
+
+    Returns:
+      Calibrated tar and non-tar log-LR
+    """
+    ntar = len(tar)
+    nnon = len(non)
+    n = ntar + nnon
+
+    scores = np.concatenate((tar, non))
+    p_ideal = np.zeros((n,), dtype=float_cpu())
+    p_ideal[:ntar] = 1
+
+    sort_idx = np.argsort(scores, kind="mergesort")
+    p_ideal = p_ideal[sort_idx]
+
+    if method == "laplace":
+        # The extra targets and non-targets at scores of -inf and +inf effectively
+        # implement Laplace's rule of succession to avoid log LRs of infinite magnitudes.
+        p_ideal = np.concatenate(([1, 0], p_ideal, [1, 0]))
+
+    p_opt, _, _ = pavx(p_ideal)
+
+    if method == "laplace":
+        p_opt = p_opt[2:-2]
+
+    # Posterior to loglr
+    # This LR is prior-independent in the sense that if we weight the data with a synthetic prior,
+    # it makes no difference to the optimizing LR mapping.
+    # (A synthetic prior DOES change Popt: The posterior log-odds changes by an additive term.
+    # But this cancels again when converting to log LR.)
+    post_log_odds = np.log(p_opt) - np.log(1 - p_opt)
+    prior_log_odds = np.log(ntar / nnon)
+    llr = post_log_odds - prior_log_odds
+    llr += 1e-6 * np.arange(n) / n
+
+    # undo the sorting; the copy avoids aliasing in the fancy-index assignment
+    llr[sort_idx] = llr.copy()
+    tar_llr = llr[:ntar]
+    non_llr = llr[ntar:]
+
+    return tar_llr, non_llr
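And a hedged sketch of how `opt_loglr` is exercised (toy scores, illustrative only):

```python
# Toy calibration with the PAV-based mapping defined above.
import numpy as np

tar = np.array([2.0, 3.0, 4.0])
non = np.array([0.0, 1.0, 2.5])
tar_llr, non_llr = opt_loglr(tar, non, method="laplace")
# The mapping is monotone in the input scores, and the "laplace" padding
# keeps all output log-LRs finite even at the extreme scores.
```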
diff --git a/hyperion/metrics/verification_evaluator.py b/hyperion/np/metrics/verification_evaluator.py
similarity index 92%
rename from hyperion/metrics/verification_evaluator.py
rename to hyperion/np/metrics/verification_evaluator.py
index d2b26ed6..e35e7cf7 100644
--- a/hyperion/metrics/verification_evaluator.py
+++ b/hyperion/np/metrics/verification_evaluator.py
@@ -2,29 +2,27 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-
-
+import copy
 import logging
 import re
-import numpy as np
-import pandas as pd
-import copy
 import matplotlib
+import numpy as np
+import pandas as pd
 matplotlib.use("Agg")
 matplotlib.rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"]})
 matplotlib.rc("text", usetex=True)
 import matplotlib.pyplot as plt
-from ..hyp_defs import float_cpu
-from ..utils import TrialKey, TrialScores
-from ..utils.trial_stats import TrialStats
-from .utils import effective_prior
+from ...hyp_defs import float_cpu
+from ...utils import TrialKey, TrialScores, SparseTrialKey, SparseTrialScores
+from ...utils.trial_stats import TrialStats
 from .dcf import fast_eval_dcf_eer
+from .utils import effective_prior
-class VerificationEvaluator(object):
+class VerificationEvaluator:
+    """Class computes performance metrics for verification problems.
     Same metrics can be obtained from fast_eval_dcf_eer functions
@@ -34,21 +32,40 @@ class VerificationEvaluator(object):
       p_tar: target prior float or list/nparray sorted in ascending order
       c_miss: cost of miss
       c_fa: cost of false alarm
-
+      key_name: name describing the key
+      score_name: name describing the score
+      sparse: use sparse versions of TrialScores and Keys
     """
-    def __init__(self, key, scores, p_tar, c_miss=None, c_fa=None):
-
+    def __init__(
+        self,
+        key,
+        scores,
+        p_tar,
+        c_miss=None,
+        c_fa=None,
+        key_name=None,
+        score_name=None,
+        sparse=False,
+    ):
         if isinstance(key, str):
-            logging.info("Load key: %s" % key)
-            key = TrialKey.load(key)
+            logging.info("Load key: %s", key)
+            if sparse:
+                key = SparseTrialKey.load(key)
+            else:
+                key = TrialKey.load(key)
         if isinstance(scores, str):
-            logging.info("Load scores: %s" % scores)
-            scores = TrialScores.load(scores)
+            logging.info("Load scores: %s", scores)
+            if sparse:
+                scores = SparseTrialScores.load(scores)
+            else:
+                scores = TrialScores.load(scores)
         self.key = key
         self.scores = scores.align_with_ndx(key)
+        self.key_name = key_name
+        self.score_name = score_name
         # compute effective prior is c_miss and c_fa are given
         if isinstance(p_tar, float):
@@ -56,13 +73,16 @@ def __init__(self, key, scores, p_tar, c_miss=None, c_fa=None):
             p_tar = np.asarray(p_tar)
         if c_miss is not None and c_fa is not None:
+            assert len(c_miss) == len(p_tar)
+            assert len(c_fa) == len(p_tar)
             c_miss = np.asarray(c_miss)
             c_fa = np.asarray(c_fa)
             p_tar = effective_prior(p_tar, c_miss, c_fa)
+        self._p_tar_sort = np.argsort(p_tar)
         self.p_tar = p_tar
-    def compute_dcf_eer(self, return_df=False):
+    def compute_dcf_eer(self, return_df=True):
         """
         Computes DCF/EER
@@ -74,24 +94,38 @@
         """
         logging.info("separating tar/non")
         tar, non = self.scores.get_tar_non(self.key)
+        ntar = len(tar)
+        nnon = len(non)
         logging.info("computing EER/DCF")
-        min_dcf, act_dcf, eer, _ = fast_eval_dcf_eer(tar, non, self.p_tar)
+        min_dcf, act_dcf, eer, _ = fast_eval_dcf_eer(
+            tar, non, self.p_tar[self._p_tar_sort]
+        )
+        min_dcf[self._p_tar_sort] = min_dcf.copy()
+        act_dcf[self._p_tar_sort] = act_dcf.copy()
         if not return_df:
-            return min_dcf, act_dcf, eer
+            return min_dcf, act_dcf, eer, ntar, nnon
         if len(self.p_tar) == 1:
            eer = [eer]
            min_dcf = [min_dcf]
            act_dcf = [act_dcf]
-        df = pd.DataFrame({"eer": eer})
-
+        df = pd.DataFrame(
+            {
+                "scores": [self.score_name],
+                "key": [self.key_name],
+                "eer": eer,
+                "eer(%)": np.asarray(eer) * 100,
+            }
+        )
         for i in range(len(min_dcf)):
            pi = self.p_tar[i]
            df["min-dcf-%.3f" % (pi)] = min_dcf[i]
            df["act-dcf-%.3f" % (pi)] = act_dcf[i]
+        df["num_targets"] = ntar
+        df["num_nontargets"] = nnon
         return df
@@ -116,9 +150,7 @@ class VerificationAdvAttackEvaluator(VerificationEvaluator):
     def __init__(
         self, key, scores, attack_scores, attack_stats, p_tar, c_miss=None, c_fa=None
     ):
-        super(VerificationAdvAttackEvaluator, self).__init__(
-            key, scores, p_tar, c_miss, c_fa
-        )
+        super().__init__(key, scores, p_tar, c_miss, c_fa)
         if not isinstance(attack_scores, list):
             attack_scores = [attack_scores]
         if not isinstance(attack_stats, list):
             attack_stats = [attack_stats]
@@ -133,7 +165,7 @@ def __init__(
         if isinstance(attack_scores[0], str):
             l = []
             for file_path in attack_scores:
-                logging.info("Load attack scores: %s" % file_path)
+                logging.info("Load attack scores: %s", file_path)
                 scores = TrialScores.load(file_path)
                 l.append(scores)
             attack_scores = l
@@ -151,7 +183,7 @@ def __init__(
         if isinstance(attack_stats[0], str):
             l = []
             for file_path in attack_stats:
-
logging.info("Load attack stats: %s" % file_path) + logging.info("Load attack stats: %s", file_path) scores = TrialStats.load(file_path) l.append(scores) attack_stats = l @@ -216,7 +248,7 @@ def compute_dcf_eer_vs_stats( stat_bins, attacked_trials="all", higher_better=False, - return_df=False, + return_df=True, ): """ Computes DCF/EER versus SNR/Linf/etc curves @@ -307,7 +339,7 @@ def find_best_attacks( threshold=None, prior_idx=0, higher_better=False, - return_df=False, + return_df=True, ): """ Find the best attacks from the point of view of some of the stats. E.g., diff --git a/hyperion/np/np_model.py b/hyperion/np/np_model.py new file mode 100644 index 00000000..7b3b2e1c --- /dev/null +++ b/hyperion/np/np_model.py @@ -0,0 +1,248 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import json +import os +from copy import deepcopy + +import h5py +import numpy as np + +from ..hyp_defs import float_cpu, float_save +from ..utils.misc import PathLike + + +class NPModel(object): + """Base class for machine learning models based on numpy. + + Attributes: + name: optional identifier for the model. + """ + + registry = {} + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + NPModel.registry[cls.__name__] = cls + + def __init__(self, name=None, **kwargs): + if name is None: + name = self.__class__.__name__ + self.name = name + self._is_init = False + + def copy(self): + """Returns a clone of the model.""" + return deepcopy(self) + + def clone(self): + """Returns a clone of the model.""" + return deepcopy(self) + + @property + def is_init(self): + """Returns True if the model has been initialized.""" + return self._is_init + + def init_to_false(self): + """Sets the model as non initialized.""" + self._is_init = False + + def initialize(self): + pass + + def fit(self, x, sample_weight=None, x_val=None, sample_weight_val=None): + """Trains the model. + + Args: + x: train data matrix with shape (num_samples, x_dim). + sample_weight: weight of each sample in the training loss shape (num_samples,). + x_val: validation data matrix with shape (num_val_samples, x_dim). + sample_weight_val: weight of each sample in the val. loss. + """ + raise NotImplementedError() + + def fit_generator(self, x, x_val=None): + """Trains the model from a data generator function. + + Args: + x: train data generation function. + x_val: validation data generation function. + """ + raise NotImplementedError() + + def save(self, file_path): + """Saves the model to file. + + Args: + file_path: filename path. + """ + file_dir = os.path.dirname(file_path) + if not (os.path.isdir(file_dir)): + os.makedirs(file_dir, exist_ok=True) + with h5py.File(file_path, "w") as f: + config = self.to_json() + f.create_dataset("config", data=np.array(config, dtype="S")) + self.save_params(f) + + def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ + raise NotImplementedError( + f"save_params method not defined for {self.__class__.__name__}" + ) + + def _save_params_from_dict(self, f, params, dtypes=None): + """Saves a dictionary of model parameters into the file. + + Args: + f: file handle. + params: dictionary of model parameters. + dtypes: dictionary indicating the dtypes of the model parameters. 
+ """ + if dtypes is None: + dtypes = dict((k, float_save()) for k in params) + elif isinstance(dtypes, type): + dtypes = dict((k, dtypes) for k in params) + + if self.name is None: + prefix = "" + else: + prefix = self.name + "/" + for k, v in params.items(): + if v is None: + continue + if not isinstance(v, np.ndarray): + v = np.asarray(v) + p_name = prefix + k + f.create_dataset(p_name, data=v.astype(dtypes[k], copy=False)) + + @classmethod + def load_config(cls, file_path): + """Loads the model configuration from file. + + Args: + file_path: path to the file where the model is stored. + + Returns: + Dictionary containing the model configuration. + """ + try: + with h5py.File(file_path, "r") as f: + json_str = str(np.asarray(f["config"]).astype("U")) + return cls.load_config_from_json(json_str) + except: + with open(file_path, "r") as f: + return cls.load_config_from_json(f.read()) + + @classmethod + def load(cls, file_path): + """Loads the model from file. + + Args: + file_path: path to the file where the model is stored. + + Returns: + Model object. + """ + with h5py.File(file_path, "r") as f: + json_str = str(np.asarray(f["config"]).astype("U")) + config = cls.load_config_from_json(json_str) + return cls.load_params(f, config) + + @classmethod + def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ + return cls(name=config["name"]) + + @staticmethod + def _load_params_to_dict(f, name, params, dtypes=None): + """Loads the model parameters from file to a dictionary. + + Args: + f: file handle. + name: model identifier or None. + params: parameter names. + dtypes: dictionary containing the dtypes of the parameters. + + Returns: + Dictionary with model parameters. 
+ """ + if dtypes is None: + dtypes = dict((k, float_cpu()) for k in params) + elif isinstance(dtypes, type): + dtypes = dict((k, dtypes) for k in params) + + if name is None: + prefix = "" + else: + prefix = name + "/" + + param_dict = {} + for k in params: + p_name = prefix + k + if p_name in f: + param_dict[k] = np.asarray(f[p_name]).astype( + dtype=dtypes[k], copy=False + ) + else: + param_dict[k] = None + return param_dict + + def get_config(self): + """Returns the model configuration dict.""" + config = {"class_name": self.__class__.__name__, "name": self.name} + return config + + def to_json(self, **kwargs): + """Returns model config as json string.""" + + def get_json_type(obj): + # if obj is a np list of strings + if isinstance(obj, np.ndarray) and obj.ndim == 1: + if isinstance(obj[0], str): + return list(obj) + + # Piece of code borrowed from keras + # if obj is any numpy type + if type(obj).__module__ == np.__name__: + return obj.item() + + # if obj is a python 'type' + if type(obj).__name__ == type.__name__: + return obj.__name__ + + raise TypeError("Not JSON Serializable:", obj) + + config = self.get_config() + return json.dumps(config, default=get_json_type, **kwargs) + + @staticmethod + def load_config_from_json(json_str): + """Converts json string into dict.""" + return json.loads(json_str) + + @staticmethod + def auto_load(file_path: PathLike, extra_objs: dict = {}): + class_name = NPModel.load_config(file_path)["class_name"] + if class_name in NPModel.registry: + class_obj = NPModel.registry[class_name] + elif class_name in extra_objs: + class_obj = extra_objs[class_name] + else: + raise Exception("unknown object with class_name=%s" % (class_name)) + + return class_obj.load(file_path) diff --git a/hyperion/np/np_model_loader.py b/hyperion/np/np_model_loader.py new file mode 100644 index 00000000..efdd27a9 --- /dev/null +++ b/hyperion/np/np_model_loader.py @@ -0,0 +1,37 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from .np_model import NPModel +from .pdfs import * +from .transforms import * + + +class NPModelLoader(object): + @staticmethod + def get_object(): + obj_dict = { + "DiagNormal": DiagNormal, + "Normal": Normal, + "DiagGMM": DiagGMM, + "GMM": GMM, + "FRPLDA": FRPLDA, + "SPLDA": SPLDA, + "PLDA": PLDA, + "CentWhiten": CentWhiten, + "LNorm": LNorm, + "PCA": PCA, + "LDA": LDA, + "NAP": NAP, + "SbSw": SbSw, + "MVN": MVN, + "TransformList": TransformList, + } + return obj_dict + + @staticmethod + def load(file_path): + class_name = NPModel.load_config(file_path)["class_name"] + class_obj = NPModelLoader.get_object()[class_name] + return class_obj.load(file_path) diff --git a/hyperion/np/pdfs/__init__.py b/hyperion/np/pdfs/__init__.py new file mode 100644 index 00000000..8a91e269 --- /dev/null +++ b/hyperion/np/pdfs/__init__.py @@ -0,0 +1,10 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from .core import * +from .hmm import * +from .jfa import * +from .mixtures import * +from .plda import * diff --git a/hyperion/np/pdfs/core/__init__.py b/hyperion/np/pdfs/core/__init__.py new file mode 100644 index 00000000..0f6287f2 --- /dev/null +++ b/hyperion/np/pdfs/core/__init__.py @@ -0,0 +1,10 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + + +from .exp_family import ExpFamily +from .normal import 
Normal
+from .normal_diag_cov import DiagNormal, NormalDiagCov
+from .pdf import PDF
diff --git a/hyperion/np/pdfs/core/exp_family.py b/hyperion/np/pdfs/core/exp_family.py
new file mode 100644
index 00000000..c91469e7
--- /dev/null
+++ b/hyperion/np/pdfs/core/exp_family.py
@@ -0,0 +1,272 @@
+"""
+ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import numpy as np
+
+from .pdf import PDF
+
+
+class ExpFamily(PDF):
+    """Base class for exponential family distribution.
+
+    p(x) = h(x) exp(\eta u(x) - A)
+
+    Attributes:
+      eta: natural parameters of the distribution.
+      x_dim: data dimension.
+    """
+
+    def __init__(self, eta=None, **kwargs):
+        super().__init__(**kwargs)
+        self.eta = eta
+        self.A = None
+
+    @property
+    def is_init(self):
+        """Returns True if the model has been initialized."""
+        if not self._is_init:
+            self._compute_nat_std()
+            if self.eta is not None and self.A is not None:
+                self.validate()
+                self._is_init = True
+        return self._is_init
+
+    def fit(
+        self, x, sample_weight=None, x_val=None, sample_weight_val=None, batch_size=None
+    ):
+        """Trains the model.
+
+        Args:
+          x: train data matrix with shape (num_samples, x_dim).
+          sample_weight: weight of each sample in the training loss shape (num_samples,).
+          x_val: validation data matrix with shape (num_val_samples, x_dim).
+          sample_weight_val: weight of each sample in the val. loss.
+          batch_size: accumulates sufficient statistics in batch_size blocks.
+
+        Returns:
+          log p(X) of the training data.
+          log p(x) per sample.
+          log p(X) of the val. data, if present.
+          log p(x) of the val. data per sample, if present.
+        """
+
+        N, u_x = self.Estep(x=x, sample_weight=sample_weight, batch_size=batch_size)
+        self.Mstep(N, u_x)
+        elbo = self.elbo(x, N=N, u_x=u_x)
+        elbo = [elbo, elbo / N]
+
+        if x_val is not None:
+            N, u_x = self.Estep(
+                x=x_val, sample_weight=sample_weight_val, batch_size=batch_size
+            )
+            elbo_val = self.elbo(x_val, N=N, u_x=u_x)
+            elbo += [elbo_val, elbo_val / N]
+        return elbo
+
+    def log_h(self, x):
+        """Computes log h(x) of the exp. family."""
+        return 0
+
+    def accum_log_h(self, x, sample_weight=None):
+        """Accumulates log h(x)"""
+        if sample_weight is None:
+            return np.sum(self.log_h(x))
+        return np.sum(sample_weight * self.log_h(x))
+
+    def compute_suff_stats(self, x):
+        """Computes sufficient stats for a data sample."""
+        return x
+
+    def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None):
+        """Accumulates sufficient statistics over several data samples.
+
+        Args:
+          x: data samples of shape (num_samples, x_dim).
+          u_x: sufficient stats for x with shape = (num_samples, u(x)_dim) (optional).
+          sample_weight: weight of each sample in the accumulation.
+          batch_size: accumulates sufficient statistics in batch_size blocks.
+
+        Returns:
+          N zero order sufficient statistics (number of samples).
+          Accumulated sufficient statistics \sum u(x)
+        """
+        if u_x is not None or batch_size is None:
+            return self._accum_suff_stats_1batch(x, u_x, sample_weight)
+        else:
+            return self._accum_suff_stats_nbatches(x, sample_weight, batch_size)
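Since `fit` above runs a single E-step/M-step pass and returns ELBO values, a usage sketch may help. It assumes a concrete subclass such as `NormalDiagCov` from this diff, and that the `PDF` base class accepts `x_dim` (as its `load_params` calls suggest); illustrative, not from the source:

```python
# One-iteration EM fit with a concrete ExpFamily subclass (sketch).
import numpy as np
from hyperion.np.pdfs import NormalDiagCov

x = np.random.default_rng(0).standard_normal((1000, 4))
model = NormalDiagCov(x_dim=4)
elbo, elbo_per_sample = model.fit(x)   # Estep -> Mstep -> elbo
logp = model.log_prob(x)               # per-sample log p(x), natural params
```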
+
+    def _accum_suff_stats_1batch(self, x, u_x=None, sample_weight=None):
+        """Accumulates sufficient statistics over several data samples for a single batch.
+
+        Args:
+          x: data samples of shape (num_samples, x_dim).
+          u_x: sufficient stats for x with shape = (num_samples, u(x)_dim) (optional).
+          sample_weight: weight of each sample in the accumulation.
+
+        Returns:
+          N zero order sufficient statistics (number of samples).
+          Accumulated sufficient statistics \sum u(x)
+        """
+        if u_x is None:
+            u_x = self.compute_suff_stats(x)
+        if sample_weight is None:
+            N = u_x.shape[0]
+        else:
+            u_x *= sample_weight[:, None]
+            N = np.sum(sample_weight)
+        acc_u_x = np.sum(u_x, axis=0)
+        return N, acc_u_x
+
+    def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size):
+        """Accumulates sufficient statistics over several data samples for multiple batches.
+
+        Args:
+          x: data samples of shape (num_samples, x_dim).
+          sample_weight: weight of each sample in the accumulation.
+          batch_size: accumulates sufficient statistics in batch_size blocks.
+
+        Returns:
+          N zero order sufficient statistics (number of samples).
+          Accumulated sufficient statistics \sum u(x)
+        """
+        sw_i = None
+        for i1 in range(0, x.shape[0], batch_size):
+            i2 = np.minimum(i1 + batch_size, x.shape[0])
+            x_i = x[i1:i2, :]
+            if sample_weight is not None:
+                sw_i = sample_weight[i1:i2]
+            N_i, u_x_i = self._accum_suff_stats_1batch(x_i, sample_weight=sw_i)
+            if i1 == 0:
+                N = N_i
+                u_x = u_x_i
+            else:
+                N += N_i
+                u_x += u_x_i
+        return N, u_x
+
+    def sum_suff_stats(self, N, u_x):
+        """Sums suff. stats from multiple sub-processes.
+
+        Args:
+          N: zero order stats with shape = (num_proc,)
+          u_x: higher order stats with shape = (num_proc, u(x)_dim).
+
+        Returns:
+          Accumulated N and u_x.
+        """
+        assert len(N) == len(u_x)
+        acc_N = N[0]
+        acc_u_x = u_x[0]
+        for i in range(1, len(N)):
+            acc_N += N[i]
+            acc_u_x += u_x[i]
+        return acc_N, acc_u_x
+
+    def Estep(self, x, u_x=None, sample_weight=None, batch_size=None):
+        """Expectation step, accumulates suff. stats.
+
+        Args:
+          x: data samples of shape (num_samples, x_dim).
+          u_x: sufficient stats for x with shape = (num_samples, u(x)_dim) (optional).
+          sample_weight: weight of each sample in the accumulation.
+          batch_size: accumulates sufficient statistics in batch_size blocks.
+
+        Returns:
+          N zero order sufficient statistics (number of samples).
+          Accumulated sufficient statistics \sum u(x)
+        """
+        return self.accum_suff_stats(x, u_x, sample_weight, batch_size)
+
+    def Mstep(self, N, u_x):
+        """Maximization step."""
+        pass
+
+    def elbo(self, x, u_x=None, N=1, log_h=None, sample_weight=None, batch_size=None):
+        """Evidence lower bound.
+
+        Args:
+          x: data samples with shape = (num_samples, x_dim).
+          u_x: accumulated u(x) (optional).
+          log_h: accumulated log h(x) (optional).
+          sample_weight: weight of each sample in the loss function.
+          batch_size: accumulates sufficient statistics in batch_size blocks.
+
+        Returns:
+          log p(X) of the data.
+        """
+        assert self.is_init
+        if u_x is None:
+            N, u_x = self.accum_suff_stats(
+                x, sample_weight=sample_weight, batch_size=batch_size
+            )
+        if log_h is None:
+            log_h = self.accum_log_h(x, sample_weight=sample_weight)
+        return log_h + np.inner(u_x, self.eta) - N * self.A
+
+    def log_prob(self, x, u_x=None, method="nat"):
+        """log p(x) of each data sample.
+
+        Args:
+          x: input data with shape (num_samples, x_dim).
+          u_x: sufficient stats u(x) with shape (num_samples, u_dim).
+          method: the probability is computed using standard ("std") or
+                  natural parameters ("nat").
+
+        Returns:
+          log p(x) with shape (num_samples,)
+        """
+        if method == "nat":
+            return self.log_prob_nat(x, u_x)
+        else:
+            return self.log_prob_std(x)
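For reference, `log_prob_nat` below evaluates the exponential-family identity stated in the class docstring (a restatement, not new behavior):

```latex
\log p(x) = \log h(x) + \eta^\top u(x) - A(\eta)
```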
+
+    def log_prob_nat(self, x, u_x=None):
+        """log p(x) of each data sample computed using the
+        natural parameters of the distribution.
+
+        Args:
+          x: input data with shape (num_samples, x_dim).
+          u_x: sufficient stats u(x) with shape (num_samples, u_dim).
+
+        Returns:
+          log p(x) with shape (num_samples,)
+        """
+        assert self.is_init
+        if u_x is None:
+            u_x = self.compute_suff_stats(x)
+        return self.log_h(x) + np.inner(u_x, self.eta) - self.A
+
+    @staticmethod
+    def compute_A_nat(eta):
+        """Computes A_theta from the natural param."""
+        raise NotImplementedError()
+
+    @staticmethod
+    def compute_A_std(params):
+        """Computes A_theta from the standard param."""
+        raise NotImplementedError()
+
+    @staticmethod
+    def compute_eta(param):
+        """Computes the natural param. from the standard param."""
+        raise NotImplementedError()
+
+    @staticmethod
+    def compute_std(eta):
+        """Computes the standard param. from the natural param."""
+        raise NotImplementedError()
+
+    def _compute_nat_params(self):
+        pass
+
+    def _compute_std_params(self):
+        pass
+
+    def _compute_nat_std(self):
+        pass
+
+    def validate(self):
+        pass
diff --git a/hyperion/np/pdfs/core/normal.py b/hyperion/np/pdfs/core/normal.py
new file mode 100644
index 00000000..67872315
--- /dev/null
+++ b/hyperion/np/pdfs/core/normal.py
@@ -0,0 +1,427 @@
+"""
+ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import numpy as np
+import scipy.linalg as la
+
+from ....hyp_defs import float_cpu
+from ....utils.math_funcs import (
+    fullcov_varfloor,
+    invert_pdmat,
+    invert_trimat,
+    logdet_pdmat,
+    symmat2vec,
+    vec2symmat,
+)
+from ....utils.plotting import (
+    plot_gaussian_1D,
+    plot_gaussian_3D,
+    plot_gaussian_ellipsoid_2D,
+    plot_gaussian_ellipsoid_3D,
+)
+from .exp_family import ExpFamily
+
+
+class Normal(ExpFamily):
+    """Class for Normal distribution with full covariance.
+
+    Attributes:
+      mu: mean with shape (x_dim,) or None.
+      Lambda: precision with shape (x_dim, x_dim) or None.
+      var_floor: variance floor.
+      update_mu: whether or not to update mu when optimizing.
+      update_Lambda: whether or not to update Lambda when optimizing.
+      x_dim: data dim (inferred from mu if present)
+    """
+
+    def __init__(
+        self,
+        mu=None,
+        Lambda=None,
+        var_floor=1e-5,
+        update_mu=True,
+        update_Lambda=True,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.mu = mu
+        self.Lambda = Lambda
+        self.var_floor = var_floor
+        self.update_mu = update_mu
+        self.update_Lambda = update_Lambda
+
+        self._compute_nat_std()
+
+        self._logLambda = None
+        self._cholLambda = None
+        self._Sigma = None
+
+    def _compute_nat_std(self):
+        """Computes natural and standard parameters of the distribution."""
+        if self.mu is not None and self.Lambda is not None:
+            self._validate_mu()
+            self._validate_Lambda()
+            self._compute_nat_params()
+        elif self.eta is not None:
+            self._validate_eta()
+            self.A = self.compute_A_nat(self.eta)
+            self._compute_std_params()
+
+    @property
+    def logLambda(self):
+        """log precision determinant."""
+        if self._logLambda is None:
+            assert self.is_init
+            f, L, logL = invert_pdmat(self.Lambda, return_logdet=True)
+            self._logLambda = logL
+            self._cholLambda = L.T
+        return self._logLambda
+
+    @property
+    def cholLambda(self):
+        """Cholesky decomp.
of the precision."""
+        if self._cholLambda is None:
+            assert self.is_init
+            f, L, logL = invert_pdmat(self.Lambda, return_logdet=True)
+            self._logLambda = logL
+            self._cholLambda = L.T
+        return self._cholLambda
+
+    @property
+    def Sigma(self):
+        """Covariance."""
+        if self._Sigma is None:
+            assert self.is_init
+            self._Sigma = invert_pdmat(self.Lambda, return_inv=True)[-1]
+        return self._Sigma
+
+    def initialize(self):
+        """Initializes the distribution."""
+        self.validate()
+        self._compute_nat_std()
+
+    def stack_suff_stats(self, F, S=None):
+        """Stacks F and S suff stats into single vector."""
+        if S is None:
+            return F
+        return np.hstack((F, S))
+
+    def unstack_suff_stats(self, stats):
+        """Decomposes suff. stats vector into F and S."""
+        F = stats[: self.x_dim]
+        S = stats[self.x_dim :]
+        return F, S
+
+    def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None):
+        """Accumulates sufficient statistics over several data samples.
+
+        Args:
+          x: data samples of shape (num_samples, x_dim).
+          u_x: sufficient stats for x with shape = (num_samples, u(x)_dim) (optional).
+          sample_weight: weight of each sample in the accumulation.
+          batch_size: unused
+
+        Returns:
+          N zero order sufficient statistics (number of samples).
+          Accumulated sufficient statistics \sum u(x)
+        """
+        if u_x is None:
+            if sample_weight is None:
+                N = x.shape[0]
+                F = np.sum(x, axis=0)
+                S = symmat2vec(np.dot(x.T, x))
+            else:
+                N = np.sum(sample_weight)
+                wx = sample_weight[:, None] * x
+                F = np.sum(wx, axis=0)
+                S = symmat2vec(np.dot(wx.T, x))
+            return N, self.stack_suff_stats(F, S)
+        else:
+            return self._accum_suff_stats_1batch(x, u_x, sample_weight)
+
+    def norm_suff_stats(self, N, u_x, return_order2=False):
+        """Normalizes accumulated sufficient statistics with the
+        mean and covariance of the distribution.
+
+        Args:
+          N: zeroth order sufficient stats.
+          u_x: 1st and 2nd order stats.
+          return_order2: whether or not to return normalized 2nd order stats.
+
+        Return:
+          Normalized N, F or N, [F, S].
+        """
+        assert self.is_init
+
+        F, S = self.unstack_suff_stats(u_x)
+        F_norm = np.dot(F - N * self.mu, self.cholLambda.T)
+        if return_order2:
+            SS = vec2symmat(S)
+            Fmu = np.outer(F, self.mu)
+            SS = SS - Fmu - Fmu.T + N * np.outer(self.mu, self.mu)
+            SS = np.dot(self.cholLambda, np.dot(SS, self.cholLambda.T))
+            S = symmat2vec(SS)
+            return N, self.stack_suff_stats(F_norm, S)
+        return N, F_norm
+
+    def Mstep(self, N, u_x):
+        """Maximization step.
+
+        Args:
+          N: zeroth order stats.
+          u_x: accumulated higher order stats.
+        """
+        F, S = self.unstack_suff_stats(u_x)
+
+        if self.update_mu:
+            self.mu = F / N
+
+        if self.update_Lambda:
+            S = vec2symmat(S / N)
+            S -= np.outer(self.mu, self.mu)
+            # S = fullcov_varfloor(S, self.var_floor)
+            self.Lambda = invert_pdmat(S, return_inv=True)[-1]
+            self._Sigma = None
+            self._logLambda = None
+            self._cholLambda = None
+
+        self._compute_nat_params()
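The M-step above is the standard maximum-likelihood update written in terms of the accumulated statistics N, F = sum_t x_t and S = sum_t x_t x_t^T (a restatement of the code, not new behavior):

```latex
\mu = \frac{F}{N}, \qquad
\Sigma = \frac{S}{N} - \mu\mu^\top, \qquad
\Lambda = \Sigma^{-1}
```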
+
+    def log_prob_std(self, x):
+        """log p(x) of each data sample computed using the
+        standard parameters of the distribution.
+
+        Args:
+          x: input data with shape (num_samples, x_dim).
+
+        Returns:
+          log p(x) with shape (num_samples,)
+        """
+        assert self.is_init
+        mah_dist2 = np.sum(np.dot(x - self.mu, self.cholLambda) ** 2, axis=1)
+        return (
+            0.5 * self.logLambda
+            - 0.5 * self.x_dim * np.log(2 * np.pi)
+            - 0.5 * mah_dist2
+        )
+
+    def sample(self, num_samples, rng=None, seed=1024):
+        """Draws samples from the data distribution.
+
+        Args:
+          num_samples: number of samples.
+          rng: random number generator.
+          seed: random seed used if rng is None.
+
+        Returns:
+          Generated samples with shape (num_samples, x_dim).
+        """
+        assert self.is_init
+
+        if rng is None:
+            rng = np.random.default_rng(seed)
+        return rng.multivariate_normal(self.mu, self.Sigma, size=(num_samples,)).astype(
+            float_cpu()
+        )
+
+    def get_config(self):
+        """Returns the model configuration dict."""
+        config = {
+            "var_floor": self.var_floor,
+            "update_mu": self.update_mu,
+            "update_lambda": self.update_Lambda,
+        }
+        base_config = super(Normal, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def save_params(self, f):
+        """Saves the model parameters into the file.
+
+        Args:
+          f: file handle.
+        """
+        assert self.is_init
+
+        params = {"mu": self.mu, "Lambda": self.Lambda}
+        self._save_params_from_dict(f, params)
+
+    @classmethod
+    def load_params(cls, f, config):
+        """Initializes the model from the configuration and loads the model
+        parameters from file.
+
+        Args:
+          f: file handle.
+          config: configuration dictionary.
+
+        Returns:
+          Model object.
+        """
+        param_list = ["mu", "Lambda"]
+        params = cls._load_params_to_dict(f, config["name"], param_list)
+        return cls(
+            x_dim=config["x_dim"],
+            mu=params["mu"],
+            Lambda=params["Lambda"],
+            var_floor=config["var_floor"],
+            update_mu=config["update_mu"],
+            update_Lambda=config["update_lambda"],
+            name=config["name"],
+        )
+
+    def _validate_mu(self):
+        assert self.mu.shape[0] == self.x_dim
+
+    def _validate_Lambda(self):
+        assert self.Lambda.shape == (self.x_dim, self.x_dim)
+
+    def _validate_eta(self):
+        assert self.eta.shape[0] == (self.x_dim ** 2 + 3 * self.x_dim) / 2
+
+    def validate(self):
+        """Validates the parameters of the distribution."""
+        if self.mu is not None and self.Lambda is not None:
+            self._validate_mu()
+            self._validate_Lambda()
+
+        if self.eta is not None:
+            self._validate_eta()
+
+    @staticmethod
+    def compute_eta(mu, Lambda):
+        """Computes nat param. from mean and precision."""
+        Lmu = np.dot(mu, Lambda)
+        eta = np.hstack((Lmu, -symmat2vec(Lambda, diag_factor=0.5)))
+        return eta
+
+    @staticmethod
+    def compute_x_dim_from_eta(eta):
+        """Computes data dim. from natural param."""
+        x_dim = 0.5 * (-3 + np.sqrt(9 + 8 * eta.shape[-1]))
+        assert int(x_dim) == x_dim
+        return int(x_dim)
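`compute_eta` above and `compute_A_std` below encode the usual natural parameterization of the full-covariance Gaussian; in the docstring's notation, with vec the symmetric vectorization implemented by `symmat2vec` (stated for clarity, not added to the source):

```latex
\eta = \begin{bmatrix} \Lambda\mu \\ -\tfrac{1}{2}\,\mathrm{vec}(\Lambda) \end{bmatrix},
\qquad
A = \tfrac{d}{2}\log 2\pi - \tfrac{1}{2}\log\lvert\Lambda\rvert + \tfrac{1}{2}\mu^\top\Lambda\mu
```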
+
+    @staticmethod
+    def compute_std(eta):
+        """Computes standard params. from the natural param."""
+        x_dim = Normal.compute_x_dim_from_eta(eta)
+        eta1 = eta[:x_dim]
+        eta2 = vec2symmat(eta[x_dim:], diag_factor=2) / 2
+        Lambda = -2 * eta2
+        f = invert_pdmat(-eta2, right_inv=True)[0]
+        mu = 0.5 * f(eta1)
+        return mu, Lambda
+
+    @staticmethod
+    def compute_A_nat(eta):
+        """Computes A from the natural param."""
+        x_dim = Normal.compute_x_dim_from_eta(eta)
+        eta1 = eta[:x_dim]
+        eta2 = vec2symmat(eta[x_dim:], diag_factor=2) / 2
+        f, _, log_minus_eta2 = invert_pdmat(-eta2, right_inv=True, return_logdet=True)
+        r1 = 0.5 * x_dim * np.log(2 * np.pi)
+        r2 = 0.25 * np.inner(f(eta1), eta1)
+        r3 = -0.5 * x_dim * np.log(2) - 0.5 * log_minus_eta2
+        return r1 + r2 + r3
+
+    @staticmethod
+    def compute_A_std(mu, Lambda):
+        """Computes A from the standard params."""
+        x_dim = mu.shape[0]
+        r1 = 0.5 * x_dim * np.log(2 * np.pi)
+        r2 = -0.5 * logdet_pdmat(Lambda)
+        r3 = 0.5 * np.inner(np.dot(mu, Lambda), mu)
+        return r1 + r2 + r3
+
+    def _compute_nat_params(self):
+        """Computes all natural params from mean and precision."""
+        self.eta = self.compute_eta(self.mu, self.Lambda)
+        self.A = self.compute_A_std(self.mu, self.Lambda)
+
+    def _compute_std_params(self):
+        self.mu, self.Lambda = self.compute_std(self.eta)
+        self._cholLambda = None
+        self._logLambda = None
+        self._Sigma = None
+
+    @staticmethod
+    def compute_suff_stats(x):
+        """Computes the sufficient stats. for each sample.
+
+        Args:
+          x: data samples with shape (num_samples, x_dim).
+
+        Returns:
+          Sufficient stats. for each data sample with shape (num_samples, u_dim).
+        """
+        d = x.shape[1]
+        u = np.zeros((x.shape[0], int(d + d * (d + 1) / 2)), dtype=float_cpu())
+        u[:, :d] = x
+        k = d
+        for i in range(d):
+            for j in range(i, d):
+                u[:, k] = x[:, i] * x[:, j]
+                k += 1
+        return u
+
+    def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs):
+        """Plots one slice of the Gaussian in 1d.
+
+        Args:
+          feat_idx: feature index.
+          num_sigmas: size of the plot in number of standard devs.
+          num_pts: number of points in the graph.
+          **kwargs: pyplot options.
+        """
+        assert self.is_init
+        mu = self.mu[feat_idx]
+        C = invert_pdmat(self.Lambda, return_inv=True)[-1][feat_idx, feat_idx]
+        plot_gaussian_1D(mu, C, num_sigmas, num_pts, **kwargs)
+
+    def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 2 dimensions of the Gaussian in 2d.
+
+        Args:
+          feat_idx: feature indices.
+          num_sigmas: size of the plot in number of standard devs.
+          num_pts: number of points in the graph.
+          **kwargs: pyplot options.
+        """
+        assert self.is_init
+        mu = self.mu[feat_idx]
+        j, i = np.meshgrid(feat_idx, feat_idx)
+        C = invert_pdmat(self.Lambda, return_inv=True)[-1][i, j]
+        plot_gaussian_ellipsoid_2D(mu, C, num_sigmas, num_pts, **kwargs)
+
+    def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 2 dimensions of the Gaussian in 3d.
+
+        Args:
+          feat_idx: feature indices.
+          num_sigmas: size of the plot in number of standard devs.
+          num_pts: number of points in the graph.
+          **kwargs: pyplot options.
+ """ + assert self.is_init + mu = self.mu[feat_idx] + j, i = np.meshgrid(feat_idx, feat_idx) + C = invert_pdmat(self.Lambda, return_inv=True)[-1][i, j] + plot_gaussian_ellipsoid_3D(mu, C, num_sigmas, num_pts, **kwargs) diff --git a/hyperion/np/pdfs/core/normal_diag_cov.py b/hyperion/np/pdfs/core/normal_diag_cov.py new file mode 100644 index 00000000..23535112 --- /dev/null +++ b/hyperion/np/pdfs/core/normal_diag_cov.py @@ -0,0 +1,376 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import numpy as np +from scipy.special import erf + +from ....hyp_defs import float_cpu +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) +from .exp_family import ExpFamily + + +class NormalDiagCov(ExpFamily): + """Class for Normal distribution with diagonal covariance. + + Attributes: + mu: mean with shape (x_dim,) or None. + Lambda: precision with shape (x_dim, x_dim) or None. + var_floor: variance floor. + update_mu: whether or not update mu when optimizing. + update_Lambda: wether or not update Lambda when optimizing. + x_dim: data dim (infered from mu if present) + """ + + def __init__( + self, + mu=None, + Lambda=None, + var_floor=1e-5, + update_mu=True, + update_Lambda=True, + **kwargs + ): + super().__init__(**kwargs) + self.mu = mu + self.Lambda = Lambda + self.var_floor = var_floor + self.update_mu = update_mu + self.update_Lambda = update_Lambda + + self._compute_nat_std() + + self._logLambda = None + self._cholLambda = None + self._Sigma = None + + def _compute_nat_std(self): + """Comptues natural and standard parameters of the distribution.""" + if self.mu is not None and self.Lambda is not None: + self._validate_mu() + self._validate_Lambda() + self._compute_nat_params() + elif self.eta is not None: + self._validate_eta() + self.A = self.compute_A_nat(self.eta) + self._compute_std_params() + + @property + def logLambda(self): + """log precision determinant.""" + if self._logLambda is None: + assert self.is_init + self._logLambda = np.sum(np.log(self.Lambda)) + return self._logLambda + + @property + def cholLambda(self): + """Square root of precision.""" + if self._cholLambda is None: + assert self.is_init + self._cholLambda = np.sqrt(self.Lambda) + return self._cholLambda + + @property + def Sigma(self): + "Variance of the distribution." + if self._Sigma is None: + assert self.is_init + self._Sigma = 1.0 / self.Lambda + return self._Sigma + + def initialize(self): + """Initializes the distribution.""" + self.validate() + self._compute_nat_std() + assert self.is_init + + def stack_suff_stats(self, F, S=None): + """Stacks F and S suff stats into single vector.""" + + if S is None: + return F + return np.hstack((F, S)) + + def unstack_suff_stats(self, stats): + """Decomposes suff. stats vector into F and S.""" + F = stats[: self.x_dim] + S = stats[self.x_dim :] + return F, S + + def norm_suff_stats(self, N, u_x=None, return_order2=False): + """Normalizes accumlated sufficient statistics with the + mean and covariance of the distribution. + + Args: + N: zeroth order sufficient stats. + u_x: 1st and 2nd order stats. + return_order2: whether or not return normalizes 2nd order stats. + + Return: + Normalized N, F or N, [F, S]. 
+ """ + assert self.is_init + F, S = self.unstack_suff_stats(u_x) + F_norm = self.cholLambda * (F - N * self.mu) + if return_order2: + S = S - 2 * self.mu * F + N * self.mu ** 2 + S *= self.Lambda + return N, self.stack_suff_stats(F_norm, S) + return N, F_norm + + def Mstep(self, N, u_x): + """Maximization step. + + Args: + N: zeroth order stats. + u_x: accumlated higher order stats. + + """ + F, S = self.unstack_suff_stats(u_x) + + if self.update_mu: + self.mu = F / N + + if self.update_Lambda: + S = S / N - self.mu ** 2 + S[S < self.var_floor] = self.var_floor + self.Lambda = 1 / S + self._Sigma = S + self._cholLambda = None + self._logLambda = None + + self._compute_nat_params() + + def log_prob_std(self, x): + """log p(x) of each data sample computed using the + standard parameters of the distribution. + + Args: + x: input data with shape (num_samples, x_dim). + + Returns: + log p(x) with shape (num_samples,) + """ + assert self.is_init + mah_dist2 = np.sum(((x - self.mu) * self.cholLambda) ** 2, axis=1) + return ( + 0.5 * self.logLambda + - 0.5 * self.x_dim * np.log(2 * np.pi) + - 0.5 * mah_dist2 + ) + + def log_cdf(self, x): + """Log cumulative distribution function.""" + assert self.is_init + delta = (x - self.mu) * self.cholLambda + lk = 0.5 * (1 + erf(delta / np.sqrt(2))) + return np.sum(np.log(lk + 1e-10), axis=-1) + + def sample(self, num_samples, rng=None, seed=1024): + """Draws samples from the data distribution. + + Args: + num_samples: number of samples. + rng: random number generator. + seed: random seed used if rng is None. + + Returns: + Generated samples with shape (num_samples, x_dim). + """ + assert self.is_init + if rng is None: + rng = np.random.default_rng(seed) + x = rng.normal(size=(num_samples, self.x_dim)).astype(float_cpu()) + return self.mu + 1.0 / self.cholLambda * x + + def get_config(self): + """Returns the model configuration dict.""" + config = { + "var_floor": self.var_floor, + "update_mu": self.update_mu, + "update_lambda": self.update_Lambda, + } + base_config = super(NormalDiagCov, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ + assert self.is_init + params = {"mu": self.mu, "Lambda": self.Lambda} + self._save_params_from_dict(f, params) + + @classmethod + def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ + param_list = ["mu", "Lambda"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls( + x_dim=config["x_dim"], + mu=params["mu"], + Lambda=params["Lambda"], + var_floor=config["var_floor"], + update_mu=config["update_mu"], + update_Lambda=config["update_lambda"], + name=config["name"], + ) + + def _validate_mu(self): + assert self.mu.shape[0] == self.x_dim + + def _validate_Lambda(self): + assert self.Lambda.shape[0] == self.x_dim + assert np.all(self.Lambda > 0) + + def _validate_eta(self): + assert self.eta.shape[0] == self.x_dim * 2 + + def validate(self): + """Validates the parameters of the distribution.""" + if self.mu is not None and self.Lambda is not None: + self._validate_mu() + self._validate_Lambda() + + if self.eta is not None: + self._validate_eta() + + @staticmethod + def compute_eta(mu, Lambda): + """Computes nat param. 
from mean and precision.""" + Lmu = Lambda * mu + eta = np.hstack((Lmu, -0.5 * Lambda)) + return eta + + @staticmethod + def compute_std(eta): + """Computes standard params. from the natural param.""" + x_dim = int(eta.shape[0] / 2) + eta1 = eta[:x_dim] + eta2 = eta[x_dim:] + mu = -0.5 * eta1 / eta2 + Lambda = -2 * eta2 + return mu, Lambda + + @staticmethod + def compute_A_nat(eta): + """Computes A from the natural param.""" + x_dim = int(eta.shape[0] / 2) + eta1 = eta[:x_dim] + eta2 = eta[x_dim:] + r1 = 0.5 * x_dim * np.log(2 * np.pi) + r2 = -1 / 4 * np.sum(eta1 * eta1 / eta2) + r3 = -1 / 2 * np.sum(np.log(-2 * eta2)) + return r1 + r2 + r3 + + @staticmethod + def compute_A_std(mu, Lambda): + """Computes A from the standard params.""" + x_dim = mu.shape[0] + r1 = 0.5 * x_dim * np.log(2 * np.pi) + r2 = -0.5 * np.sum(np.log(Lambda)) + r3 = 0.5 * np.sum(mu * mu * Lambda) + return r1 + r2 + r3 + + def _compute_nat_params(self): + self.eta = self.compute_eta(self.mu, self.Lambda) + self.A = self.compute_A_nat(self.eta) + # Lmu = self.Lambda*self.mu + # muLmu = np.sum(self.mu*Lmu) + # lnr = 0.5*self.lnLambda - 0.5*self.x_dim*np.log(2*np.pi)-0.5*muLmu + # self.eta=np.hstack((lnr, Lmu, -0.5*self.Lambda)).T + + def _compute_std_params(self): + self.mu, self.Lambda = self.compute_std(self.eta) + self._cholLambda = None + self._logLambda = None + self._Sigma = None + + @staticmethod + def compute_suff_stats(x): + """Computes the sufficient stats. for each sample. + + Args: + x: data samples with shape (num_samples, x_dim). + + Returns: + Sufficient stats. for each data sample with shape (num_samples, u_dim). + """ + d = x.shape[1] + u = np.zeros((x.shape[0], 2 * d), dtype=float_cpu()) + u[:, :d] = x + u[:, d:] = x * x + return u + + def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs): + """Plots one slice of the Gaussian in 1d. + + Args: + feat_idx: feature index. + num_sigmas: size of the plot in number of standard devs. + num_pts: number of points in the graph. + **kwargs: pyplot options. + """ + mu = self.mu[feat_idx] + C = 1 / self.Lambda[feat_idx] + plot_gaussian_1D(mu, C, num_sigmas, num_pts, **kwargs) + + def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): + """Plots 2 dimensions of the Gaussian in 2d. + + Args: + feat_idx: feature indeces. + num_sigmas: size of the plot in number of standard devs. + num_pts: number of points in the graph. + **kwargs: pyplot options. + """ + mu = self.mu[feat_idx] + C = np.diag(1.0 / self.Lambda[feat_idx]) + plot_gaussian_ellipsoid_2D(mu, C, num_sigmas, num_pts, **kwargs) + + def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): + """Plots 2 dimensions of the Gaussian in 3d. + + Args: + feat_idx: feature indeces. + num_sigmas: size of the plot in number of standard devs. + num_pts: number of points in the graph. + **kwargs: pyplot options. + """ + mu = self.mu[feat_idx] + C = np.diag(1.0 / self.Lambda[feat_idx]) + plot_gaussian_3D(mu, C, num_sigmas, num_pts, **kwargs) + + def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs): + """Plots 3 dimensions of the Gaussian in 3d. + + Args: + feat_idx: feature indeces. + num_sigmas: size of the plot in number of standard devs. + num_pts: number of points in the graph. + **kwargs: pyplot options. 
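+
+        Example (illustrative):
+            >>> model.plot3D_ellipsoid(feat_idx=[0, 1, 2], num_pts=50)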
+ """ + mu = self.mu[feat_idx] + C = np.diag(1.0 / self.Lambda[feat_idx]) + plot_gaussian_ellipsoid_3D(mu, C, num_sigmas, num_pts, **kwargs) + + +DiagNormal = NormalDiagCov diff --git a/hyperion/np/pdfs/core/pdf.py b/hyperion/np/pdfs/core/pdf.py new file mode 100644 index 00000000..82f4330d --- /dev/null +++ b/hyperion/np/pdfs/core/pdf.py @@ -0,0 +1,48 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import numpy as np + +from ...np_model import NPModel + + +class PDF(NPModel): + """Base class for probability density functions. + + Attributes: + x_dim: data dimension. + """ + + def __init__(self, x_dim=1, **kwargs): + super().__init__(**kwargs) + self.x_dim = x_dim + + def get_config(self): + """Returns the model configuration dict.""" + config = {"x_dim": self.x_dim} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def log_prob(self, x): + """Computes log probability of the data.""" + raise NotImplementedError() + + def eval_llk(self, x): + """Computes log likelihood of the data.""" + return self.log_prob(x) + + def sample(self, num_samples): + """Draws samples from the data distribution.""" + raise NotImplementedError() + + def generate(self, num_samples, **kwargs): + """Draws samples from the data distribution. + Args: + num_samples: number of samples to generate. + + Returns: + np.array of generated samples with shape=(num_samples, x_dim) + """ + return self.sample(num_samples, **kwargs) diff --git a/hyperion/pdfs/hmm/__init__.py b/hyperion/np/pdfs/hmm/__init__.py similarity index 100% rename from hyperion/pdfs/hmm/__init__.py rename to hyperion/np/pdfs/hmm/__init__.py diff --git a/hyperion/pdfs/hmm/hmm.py b/hyperion/np/pdfs/hmm/hmm.py similarity index 91% rename from hyperion/pdfs/hmm/hmm.py rename to hyperion/np/pdfs/hmm/hmm.py index aeec994a..92d9c371 100644 --- a/hyperion/pdfs/hmm/hmm.py +++ b/hyperion/np/pdfs/hmm/hmm.py @@ -5,24 +5,25 @@ import numpy as np -from ...hyp_defs import float_cpu -from ...utils.math import softmax, logsumexp +from ....hyp_defs import float_cpu +from ....utils.math_funcs import logsumexp, softmax from ..core import PDF class HMM(PDF): - def __init__(self, **kwargs): - super(HMM, self).__init__( - num_states=1, - pi=None, - trans=None, - trans_mask=None, - update_pi=True, - update_trans=True, - tied_trans=False, - left_to_right=False, - **kwargs - ) + def __init__( + self, + num_states=1, + pi=None, + trans=None, + trans_mask=None, + update_pi=True, + update_trans=True, + tied_trans=False, + left_to_right=False, + **kwargs + ): + super().__init__(**kwargs) if pi is not None: num_states = len(pi) @@ -36,13 +37,13 @@ def __init__(self, **kwargs): self.tied_trans = tied_trans self.left_to_right = left_to_right - if left_to_rigth and (trans_mask is None): + if left_to_right and (trans_mask is None): self.trans_mask = np.triu(np.ones_like(self.trans)) self._log_pi = None self._log_trans = None - def reset_aux(): + def reset_aux(self): self._log_pi = None self._log_trans = None @@ -132,11 +133,11 @@ def compute_pz(self, x, return_Nzz=False, return_log_px=False): pz = softmax(log_alpha + log_beta, axis=-1) - if not (return_Nzz or return_elbo or return_log_px): + if not (return_Nzz or return_log_px): return pz r = [pz] - if return_pzz_acc: + if return_Nzz: x_e = np.expand_dims(axis=1) log_alpha_e = np.expand_dims(axis=-1) log_beta_e = np.expand_dims(axis=1) @@ -169,7 +170,7 @@ def Estep(self, x, stats_0=None): pz, Nzz 
         Nz += pz[0]
-        Nzz += pzz
+        Nzz += Nzz_i
 
         stats = (Nz, Nzz)
         return pz, stats
 
@@ -231,14 +232,14 @@ def viterbi_decode(self, x, nbest=1):
 
     def sample(self, num_seqs, num_steps, rng=None, seed=1024):
         if rng is None:
-            rng = np.random.RandomState(seed)
+            rng = np.random.default_rng(seed)
 
         x = np.zeros((num_seqs, num_steps, self.num_states), dtype=float_cpu())
         x[:, 0, :] = rng.multinomial(1, self.pi, size=(num_seqs,))
         for t in range(1, num_steps):
             for k in range(self.num_states):
                 index = x[:, t - 1, k] == 1
-                n_k = num.sum(index)
+                n_k = np.sum(index)
                 if n_k == 0:
                     continue
                 x[index] = rng.multinomial(1, self.trans[k], size=(n_k,))
diff --git a/hyperion/pdfs/jfa/__init__.py b/hyperion/np/pdfs/jfa/__init__.py
similarity index 100%
rename from hyperion/pdfs/jfa/__init__.py
rename to hyperion/np/pdfs/jfa/__init__.py
diff --git a/hyperion/np/pdfs/jfa/jfa_total.py b/hyperion/np/pdfs/jfa/jfa_total.py
new file mode 100644
index 00000000..97450e0e
--- /dev/null
+++ b/hyperion/np/pdfs/jfa/jfa_total.py
@@ -0,0 +1,381 @@
+"""
+ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import numpy as np
+from scipy import linalg as la
+
+from ....hyp_defs import float_cpu
+from ....utils.math_funcs import (
+    invert_pdmat,
+    invert_trimat,
+    logdet_pdmat,
+    symmat2vec,
+    vec2symmat,
+)
+from ..core.pdf import PDF
+
+
+class JFATotal(PDF):
+    """Class for joint factor analysis with total variability matrix (i-vectors).
+
+    Args:
+      K: number of Gaussian components.
+      y_dim: dimension of the total variability sub-space.
+      T: total variability matrix with shape (y_dim, K * x_dim).
+      x_dim: data dimension.
+    """
+
+    def __init__(self, K, y_dim=None, T=None, **kwargs):
+        super().__init__(**kwargs)
+        if T is not None:
+            y_dim = T.shape[0]
+
+        self.K = K
+        self.y_dim = y_dim
+        self.T = T
+
+        # aux
+        self._TT = None
+        self.__upptr = None
+
+    def reset_aux(self):
+        """Resets auxiliary variables."""
+        self._TT = None
+
+    @property
+    def is_init(self):
+        """Returns True if the model has been initialized."""
+        if self._is_init:
+            return True
+        if self.T is not None:
+            self._is_init = True
+        return self._is_init
+
+    def initialize(self, N, F):
+        """Initializes the model.
+
+        Args:
+          N: zero order statistics (num_utterances, K).
+          F: first order statistics (num_utterances, K * x_dim).
+        """
+        assert N.shape[1] == self.K
+        self.T = np.random.randn(self.y_dim, F.shape[1]).astype(float_cpu(), copy=False)
+
+    def compute_py_g_x(
+        self, N, F, G=None, return_cov=False, return_elbo=False, return_acc=False
+    ):
+        """Computes the latent posterior P(Y|X).
+
+        Args:
+          N: zero order statistics (num_utterances, K).
+          F: first order statistics (num_utterances, K * x_dim).
+          G: logP(x| UBM, Z) to add to the elbo (optional).
+          return_cov: whether or not to return the covariance of the posterior.
+          return_elbo: whether or not to return the ELBO.
+          return_acc: whether or not to return accumulated stats for the EM algorithm.
+
+        Returns:
+          y: latent mean (i-vector).
+          Posterior covariances.
+ ELBO + Ry accumlator for ML step with shape (y_dim, y_dim) + Py accumlator for MD step with shape (y_dim, y_dim) + """ + assert self.is_init + M = F.shape[0] + y_dim = self.y_dim + + compute_inv = return_cov or return_acc + return_tuple = compute_inv or return_elbo + + TF = np.dot(F, self.T.T) + L = self.compute_L(self.TT, N, self._upptr) + y = np.zeros((M, y_dim), dtype=float_cpu()) + + if return_cov: + Sy = np.zeros((M, int(y_dim * (y_dim + 1) // 2)), dtype=float_cpu()) + else: + Sy = None + + if return_elbo: + elbo = np.zeros((M,), dtype=float_cpu()) + + if return_acc: + Py = np.zeros((y_dim, y_dim), dtype=float_cpu()) + Ry = np.zeros((self.K, int(y_dim * (y_dim + 1) // 2)), dtype=float_cpu()) + + Li = np.zeros((self.y_dim, self.y_dim), dtype=float_cpu()) + for i in range(N.shape[0]): + Li[self._upptr] = L[i] + r = invert_pdmat( + Li, right_inv=True, return_logdet=return_elbo, return_inv=compute_inv + ) + mult_iL = r[0] + if return_elbo: + elbo[i] = -r[2] / 2 + if compute_inv: + iL = r[-1] + + y[i] = mult_iL(TF[i]) + + if return_cov: + Sy[i] = iL[self.__upptr] + + if return_acc: + iL += np.outer(y[i], y[i]) + Py += iL + Ry += iL[self.__upptr] * N[i][:, None] + + if not return_tuple: + return y + + r = [y] + + if return_cov: + r += [Sy] + + if return_elbo: + if G is not None: + elbo += G + elbo += 0.5 * np.sum(TF * y, axis=-1) + r += [elbo] + + if return_acc: + r += [Ry, Py] + + return tuple(r) + + def Estep(self, N, F, G=None): + """Computes the latent posterior P(Y|X). + + Args: + N: zero order statistics (num_utterances, K). + F: first order statisticss (num_utterances, K * x_dim). + G: logP(x| UBM, Z) to add to elbo (optional). + + Results: + Tuple with stats needed by the maximization step: + ELBO, num_classes, accumulated y, Ry, Cy, Py + """ + y, elbo, Ry, Py = self.compute_py_g_x( + N, F, G, return_elbo=True, return_acc=True + ) + + M = y.shape[0] + y_acc = np.sum(y, axis=0) + Cy = np.dot(F.T, y) + + elbo = np.sum(elbo) + + stats = (elbo, M, y_acc, Ry, Cy, Py) + return stats + + def MstepML(self, stats): + """Maximum likelihood step. + + Args: + stats: tuple with statistics prouced by the estimation step. + """ + _, M, y_acc, Ry, Cy, _ = stats + T = np.zeros_like(self.T) + Ryk = np.zeros((self.y_dim, self.y_dim), dtype=float_cpu()) + x_dim = T.shape[1] // self.K + for k in range(self.K): + idx = k * x_dim + Ryk[self._upptr] = Ry[k] + iRyk_mult = invert_pdmat(Ryk, right_inv=False)[0] + T[:, idx : idx + x_dim] = iRyk_mult(Cy[idx : idx + x_dim].T) + + self.T = T + self.reset_aux() + + def MstepMD(self, stats): + """Minimum divergence step. + + Args: + stats: tuple with statistics prouced by the estimation step. + """ + _, M, y_acc, Ry, Cy, Py = stats + mu_y = y_acc / M + Cy = Py / M - np.outer(mu_y, mu_y) + chol_Cy = la.cholesky(Cy, lower=False, overwrite_a=True) + self.T = np.dot(chol_Cy, self.T) + + self.reset_aux() + + def fit( + self, + N, + F, + G=None, + N_val=None, + F_val=None, + G_val=None, + epochs=20, + ml_md="ml+md", + md_epochs=None, + ): + """Trains the model. + + Args: + N: zero order sufficient statistics for training data with shape (num_utterances, K). + F: first order sufficient statistics for training data with shape (num_utterances, K*x_dim). + G: logP(x| UBM, Z) for training data to add to elbo (optional). + N_val: zero order sufficient statistics for val data with shape (num_utterances, K). + F_val: first order sufficient statistics for val data with shape (num_utterances, K*x_dim). + G_val: logP(x| UBM, Z) for val data to add to elbo (optional). 
+          epochs: number of EM steps.
+          ml_md: whether to do maximum likelihood estimation ("ml"), minimum divergence ("md") or both ("ml+md").
+          md_epochs: in which epochs to do MD estimation; if None, MD is done in all epochs.
+
+        Returns:
+          log p(X) of the training data.
+          log p(x) per sample.
+          log p(X) of the val. data, if present.
+          log p(x) of the val. data per sample, if present.
+        """
+
+        use_ml = False if ml_md == "md" else True
+        use_md = False if ml_md == "ml" else True
+
+        if not self.is_init:
+            self.initialize(N, F)
+
+        elbo = np.zeros((epochs,), dtype=float_cpu())
+        elbo_val = np.zeros((epochs,), dtype=float_cpu())
+        for epoch in range(epochs):
+
+            stats = self.Estep(N, F, G)
+            elbo[epoch] = stats[0]
+            if N_val is not None and F_val is not None:
+                _, elbo_val_e = self.compute_py_g_x(
+                    N_val, F_val, G_val, return_elbo=True
+                )
+                elbo_val[epoch] = np.sum(elbo_val_e)
+
+            if use_ml:
+                self.MstepML(stats)
+            if use_md and (md_epochs is None or epoch in md_epochs):
+                self.MstepMD(stats)
+
+        elbo_norm = elbo / np.sum(N)
+        if N_val is None:
+            return elbo, elbo_norm
+        else:
+            elbo_val_norm = elbo_val / np.sum(N_val)
+            return elbo, elbo_norm, elbo_val, elbo_val_norm
+
+    @property
+    def TT(self):
+        """
+        Returns:
+          Matrices T_k T_k.T for each Gaussian component k.
+          Matrices are vectorized keeping only the upper triangular part,
+          with shape = (K, y_dim (y_dim+1)/2).
+        """
+        if self._TT is None:
+            self._TT = self.compute_TT(self.T, self.K, self._upptr)
+        return self._TT
+
+    @property
+    def _upptr(self):
+        """Upper triangular mask."""
+        if self.__upptr is None:
+            self.__upptr = np.triu(np.ones(self.y_dim, dtype=bool))
+        return self.__upptr
+
+    @staticmethod
+    def compute_TT(T, K, upptr):
+        """Computes the T_k T_k.T matrices.
+
+        Args:
+          T: total variability factor loading matrix.
+          K: number of Gaussian components.
+          upptr: upper triangular mask.
+
+        Returns:
+          Matrices T_k T_k.T for each Gaussian component k.
+          Matrices are vectorized keeping only the upper triangular part,
+          with shape = (K, y_dim (y_dim+1)/2).
+        """
+        x_dim = int(T.shape[1] / K)
+        y_dim = T.shape[0]
+        TT = np.zeros((K, int(y_dim * (y_dim + 1) / 2)), dtype=float_cpu())
+        for k in range(K):
+            idx = k * x_dim
+            T_k = T[:, idx : idx + x_dim]
+            TT_k = np.dot(T_k, T_k.T)
+            TT[k] = TT_k[upptr]
+
+        return TT
+
+    @staticmethod
+    def compute_L(TT, N, upptr):
+        """Computes the P(y|x) precision.
+
+        Args:
+          TT: T_k T_k.T matrices.
+          N: zero order statistics.
+          upptr: upper triangular mask.
+
+        Returns:
+          Posterior precision vectorized to keep just the upper triangular matrix.
+        """
+        y_dim = upptr.shape[0]
+        I = np.eye(y_dim, dtype=float_cpu())[upptr]
+        return I + np.dot(N, TT)
+
+    @staticmethod
+    def normalize_T(T, chol_prec):
+        """Normalizes T by the covariances of the GMM.
+
+        Args:
+          T: original total variability matrix.
+          chol_prec: Cholesky decomp. of the precisions of the GMM components.
+        """
+        Tnorm = np.zeros_like(T)
+        K = chol_prec.shape[0]
+        x_dim = int(T.shape[1] / K)
+        for k in range(K):
+            idx = k * x_dim
+            Tnorm[:, idx : idx + x_dim] = np.dot(
+                T[:, idx : idx + x_dim], chol_prec[k].T
+            )
+
+        return Tnorm
+
+    def get_config(self):
+        """Returns the model configuration dict."""
+        config = {"K": self.K}
+        base_config = super(JFATotal, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def save_params(self, f):
+        """Saves the model parameters into the file.
+
+        Args:
+          f: file handle.
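+
+        Example (illustrative sketch; an open h5py file handle is assumed,
+        as in other hyperion models):
+            >>> with h5py.File("jfa_total.h5", "w") as f:
+            ...     model.save_params(f)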
+ """ + params = {"T": self.T} + self._save_params_from_dict(f, params) + + @classmethod + def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ + param_list = ["T"] + params = cls._load_params_to_dict(f, config["name"], param_list) + kwargs = dict(list(config.items()) + list(params.items())) + return cls(**kwargs) + + def sample(self, num_samples): + """Draws samples from the i-vector model.""" + raise NotImplementedError() diff --git a/hyperion/np/pdfs/mixtures/__init__.py b/hyperion/np/pdfs/mixtures/__init__.py new file mode 100644 index 00000000..dccad8d1 --- /dev/null +++ b/hyperion/np/pdfs/mixtures/__init__.py @@ -0,0 +1,10 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + + +from .exp_family_mixture import ExpFamilyMixture +from .gmm import GMM +from .gmm_diag_cov import DiagGMM, GMMDiagCov +from .gmm_tied_diag_cov import DiagGMMTiedCov, GMMTiedDiagCov diff --git a/hyperion/np/pdfs/mixtures/exp_family_mixture.py b/hyperion/np/pdfs/mixtures/exp_family_mixture.py new file mode 100644 index 00000000..e1355dc5 --- /dev/null +++ b/hyperion/np/pdfs/mixtures/exp_family_mixture.py @@ -0,0 +1,638 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging + +import numpy as np + +from ....hyp_defs import float_cpu +from ....utils.math_funcs import logsumexp, softmax +from ..core import PDF + + +class ExpFamilyMixture(PDF): + """Base class for a mixture of exponential family distributions. + + p(x) = \sum_k h(x) exp(\eta_k u(x) - A_k) + + Attributes: + num_comp: number of components of the mixture. + pi: weights of the components. + eta: natural parameters of the distribution. + min_N: minimum number of samples for keeping the component. + update_pi: whether or Not to update the weights when optimizing. + x_dim: data dimension. + """ + + def __init__( + self, num_comp=1, pi=None, eta=None, min_N=0, update_pi=True, **kwargs + ): + super().__init__(**kwargs) + if pi is not None: + num_comp = len(pi) + self.num_comp = num_comp + self.pi = pi + self.eta = eta + self.min_N = min_N + self.A = None + self._log_pi = None + self.update_pi = update_pi + + @property + def is_init(self): + """Returns True if the model has been initialized.""" + if not self._is_init: + if self.eta is not None and self.A is not None and self.pi is not None: + self.validate() + self._is_init = True + return self._is_init + + @property + def log_pi(self): + """Log weights""" + if self._log_pi is None: + self._log_pi = np.log(self.pi + 1e-15) + return self._log_pi + + def _validate_pi(self): + assert len(self.pi) == self.num_comp + + def fit( + self, + x, + sample_weight=None, + x_val=None, + sample_weight_val=None, + epochs=10, + batch_size=None, + ): + """Trains the model. + + Args: + x: train data matrix with shape (num_samples, x_dim). + sample_weight: weight of each sample in the training loss shape (num_samples,). + x_val: validation data matrix with shape (num_val_samples, x_dim). + sample_weight_val: weight of each sample in the val. loss. + epochs: number of EM steps. + batch_size: accumlates sufficient statistics in batch_size blocks. + + Returns: + log p(X) of the training data. + log p(x) per sample. + log p(X) of the val. data, if present. + log p(x) of the val. 
data per sample, if present. + """ + + if not self.is_init: + self.initialize(x) + + log_h = self.accum_log_h(x, sample_weight) + if x_val is not None: + log_h_val = self.accum_log_h(x_val, sample_weight_val) + + elbo = np.zeros((epochs,), dtype=float_cpu()) + elbo_val = np.zeros((epochs,), dtype=float_cpu()) + for epoch in range(epochs): + N, u_x = self.Estep(x=x, sample_weight=sample_weight, batch_size=batch_size) + elbo[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h) + self.Mstep(N, u_x) + + if x_val is not None: + N, u_x = self.Estep( + x=x_val, sample_weight=sample_weight_val, batch_size=batch_size + ) + elbo_val[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h_val) + + if x_val is None: + return elbo, elbo / x.shape[0] + else: + return elbo, elbo / x.shape[0], elbo_val, elbo_val / x.shape[0] + + def log_h(self, x): + """Computes log h(x) of the exp. family.""" + return 0 + + def accum_log_h(self, x, sample_weight=None): + """Accumlates log h(x)""" + if sample_weight is None: + return np.sum(self.log_h(x)) + return np.sum(sample_weight * self.log_h(x)) + + def compute_pz(self, x, u_x=None, mode="nat"): + """Computes p(z|x) + + Args: + x: input data with shape (num_samples, x_dim). + u_x: precomputed sufficient stats with shape (num_samples, u_dim). + mode: whether to use natural (nat) or standard (std) parameters. + + Returns: + p(z|x) with shape (num_samples, num_comp) + """ + if mode == "nat": + return self.compute_pz_nat(x, u_x) + else: + return self.compute_pz_std(x) + + def compute_pz_nat(self, x, u_x=None): + """Computes p(z|x) using the natural parameters of the distribution. + + Args: + x: input data with shape (num_samples, x_dim). + u_x: precomputed sufficient stats with shape (num_samples, u_dim). + + Returns: + p(z|x) with shape (num_samples, num_comp) + """ + if u_x is None: + u_x = self.compute_suff_stats(x) + logr = np.dot(u_x, self.eta.T) - self.A + self.log_pi + return softmax(logr) + + def compute_pz_std(self, x): + """Computes p(z|x) using the standard parameters of the distribution. + + Args: + x: input data with shape (num_samples, x_dim). + + Returns: + p(z|x) with shape (num_samples, num_comp) + """ + return self.compute_pz_nat(x) + + def compute_suff_stats(self, x): + """Computes sufficient stats for a data sample.""" + return x + + def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None): + """Accumlates sufficient statistis over several data samples. + + Args: + x: data samples of shape (num_samples, x_dim). + u_x: sufficient stats for x with shape = (num_samples, u(x)_dim) (optional). + sample_weight: weight of each sample in the accumalation. + batch_size: accumlates sufficient statistics in batch_size blocks. + + Returns: + N zero order sufficient statistics (number of samples). + Accumlated sufficient statistics \sum u(x) + """ + if u_x is not None or batch_size is None: + return self._accum_suff_stats_1batch(x, u_x, sample_weight) + else: + return self._accum_suff_stats_nbatches(x, sample_weight, batch_size) + + def _accum_suff_stats_1batch(self, x, u_x=None, sample_weight=None): + """Accumlates sufficient statistis over several data samples for a single batch. + + Args: + x: data samples of shape (num_samples, x_dim). + u_x: sufficient stats for x with shape = (num_samples, u(x)_dim) (optional). + sample_weight: weight of each sample in the accumalation. + + Returns: + N zero order sufficient statistics (number of samples). 
+ Accumlated sufficient statistics \sum u(x) + """ + if u_x is None: + u_x = self.compute_suff_stats(x) + z = self.compute_pz_nat(x, u_x) + if sample_weight is not None: + z *= sample_weight[:, None] + + N = np.sum(z, axis=0) + acc_u_x = np.dot(z.T, u_x) + return N, acc_u_x + + def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size): + """Accumlates sufficient statistis over several data samples for multiple batches. + + Args: + x: data samples of shape (num_samples, x_dim). + u_x: sufficient stats for x with shape = (num_samples, u(x)_dim) (optional). + sample_weight: weight of each sample in the accumalation. + batch_size: accumlates sufficient statistics in batch_size blocks. + + Returns: + N zero order sufficient statistics (number of samples). + Accumlated sufficient statistics \sum u(x) + """ + sw_i = None + for i1 in range(0, x.shape[0], batch_size): + i2 = np.minimum(i1 + batch_size, x.shape[0]) + x_i = x[i1:i2, :] + if sample_weight is not None: + sw_i = sample_weight[i1:i2] + N_i, u_x_i = self._accum_suff_stats_1batch(x_i, sample_weight=sw_i) + if i1 == 0: + N = N_i + u_x = u_x_i + else: + N += N_i + u_x += u_x_i + return N, u_x + + def accum_suff_stats_segments( + self, x, segments, u_x=None, sample_weight=None, batch_size=None + ): + """Accumlates sufficient statistis per each segment in an utterance. + + Args: + x: data samples of shape (num_samples, x_dim). + segments: segments t_start and t_end with shape (num_segments, 2). + u_x: sufficient stats for x with shape = (num_samples, u(x)_dim) (optional). + sample_weight: weight of each sample in the accumalation. + batch_size: accumlates sufficient statistics in batch_size blocks. + + Returns: + N zero order sufficient statistics (number of samples). + Accumlated sufficient statistics \sum u(x) + """ + K = self.num_comp + num_segments = len(segments) + N = np.zeros((num_segments, K), dtype=float_cpu()) + acc_u_x = np.zeros((num_segments, K, self.eta.shape[1]), dtype=float_cpu()) + u_x_i = None + sw_i = None + for i in range(num_segments): + start = int(segments[i][0]) + end = int(segments[i][1]) + 1 + x_i = x[start:end] + if u_x is not None: + u_x_i = u_x[start:end] + if sample_weight is not None: + sw_i = sample_weight[start:end] + N_i, acc_u_x_i = self.accum_suff_stats( + x_i, u_x=u_x_i, sample_weight=sw_i, batch_size=batch_size + ) + N[i] = N_i + acc_u_x[i] = acc_u_x_i + + return N, acc_u_x + + def accum_suff_stats_segments_prob( + self, x, prob, u_x=None, sample_weight=None, batch_size=None + ): + """Accumlates sufficient statistis per each segment in an utterance, + Segments are defined by the probability for a frame to belong to the + segment + + Args: + x: data samples of shape (num_samples, x_dim). + prob: probability of belonging to a segments with shape (num_samples, num_segments). + u_x: sufficient stats for x with shape = (num_samples, u(x)_dim) (optional). + sample_weight: weight of each sample in the accumalation. + batch_size: accumlates sufficient statistics in batch_size blocks. + + Returns: + N zero order sufficient statistics (number of samples). 
+ Accumlated sufficient statistics \sum u(x) + """ + if u_x is not None or batch_size is None: + return self._accum_suff_stats_segments_prob_1batch( + x, prob, u_x, sample_weight + ) + else: + return self._accum_suff_stats_segments_prob_nbatches( + x, prob, sample_weight, batch_size + ) + + def _accum_suff_stats_segments_prob_1batch( + self, x, prob, u_x=None, sample_weight=None + ): + if u_x is None: + u_x = self.compute_suff_stats(x) + z = self.compute_pz_nat(x, u_x) + if sample_weight is not None: + z *= sample_weight[:, None] + + K = len(self.pi) + num_segments = prob.shape[1] + N = np.zeros((num_segments, K), float_cpu()) + acc_u_x = np.zeros((num_segments, K, self.eta.shape[1]), float_cpu()) + + for i in range(num_segments): + z_i = z * prob[:, i][:, None] + N[i] = np.sum(z_i, axis=0) + acc_u_x[i] = np.dot(z_i.T, u_x) + + return N, acc_u_x + + def _accum_suff_stats_segments_prob_nbatches( + self, x, prob, sample_weight, batch_size + ): + sw_i = None + for i1 in range(0, x.shape[0], batch_size): + i2 = np.minimum(i1 + batch_size, x.shape[0]) + x_i = x[i1:i2, :] + prob_i = prob[i1:i2, :] + if sample_weight is not None: + sw_i = sample_weight[i1:i2] + N_i, u_x_i = self._accum_suff_stats_segments_prob_1batch( + x_i, prob_i, sample_weight=sw_i + ) + if i1 == 0: + N = N_i + u_x = u_x_i + else: + N += N_i + u_x += u_x_i + return N, u_x + + def accum_suff_stats_sorttime( + self, + x, + frame_length, + frame_shift, + u_x=None, + sample_weight=None, + batch_size=None, + ): + """Accumlates sufficient statistis over a sliding window. + + Args: + x: data samples of shape (num_samples, x_dim). + frame_length: frame length. + frame_shift: frame shift. + u_x: sufficient stats for x with shape = (num_samples, u(x)_dim) (optional). + sample_weight: weight of each sample in the accumalation. + batch_size: accumlates sufficient statistics in batch_size blocks. + + Returns: + N zero order sufficient statistics (number of samples). 
+          Accumulated sufficient statistics \sum u(x)
+        """
+        if u_x is not None or batch_size is None:
+            return self._accum_suff_stats_sorttime_1batch(
+                x, frame_length, frame_shift, u_x, sample_weight
+            )
+        else:
+            return self._accum_suff_stats_sorttime_nbatches(
+                x, frame_length, frame_shift, sample_weight, batch_size
+            )
+
+    def _accum_suff_stats_sorttime_1batch(
+        self, x, frame_length, frame_shift, u_x=None, sample_weight=None
+    ):
+        K = len(self.pi)
+        num_frames = x.shape[0]
+        num_segments = int(np.floor((num_frames - frame_length) / frame_shift + 1))
+        if num_segments == 1:
+            return self._accum_suff_stats_1batch(x, u_x, sample_weight)
+
+        if u_x is None:
+            u_x = self.compute_suff_stats(x)
+        z = self.compute_pz_nat(x, u_x)
+        if sample_weight is not None:
+            z *= sample_weight[:, None]
+
+        N = np.zeros((num_segments, K), float_cpu())
+        acc_u_x = np.zeros((num_segments, K, self.eta.shape[1]), float_cpu())
+
+        start1 = int(frame_shift - 1)
+        end1 = int((num_segments - 1) * frame_shift)
+        start2 = int(start1 + frame_length)
+        end2 = int(end1 + frame_length)
+        cum_N = np.cumsum(z, axis=0)
+        N[0] = cum_N[frame_length - 1]
+        N[1:] = cum_N[start2:end2:frame_shift] - cum_N[start1:end1:frame_shift]
+
+        for k in range(K):
+            cum_u_x_k = np.cumsum(z[:, k][:, None] * u_x, axis=0)
+            acc_u_x[0, k] = cum_u_x_k[frame_length - 1]
+            acc_u_x[1:, k] = (
+                cum_u_x_k[start2:end2:frame_shift] - cum_u_x_k[start1:end1:frame_shift]
+            )
+
+        return N, acc_u_x
+
+    def _accum_suff_stats_sorttime_nbatches(
+        self, x, frame_length, frame_shift, sample_weight, batch_size
+    ):
+        K = len(self.pi)
+        num_frames = x.shape[0]
+        num_segments = int(np.floor((num_frames - frame_length) / frame_shift + 1))
+        if num_segments == 1:
+            return self._accum_suff_stats_1batch(x, None, sample_weight)
+
+        # number of whole segments that fit in a batch of about batch_size frames;
+        # the batch size is then rounded so batches align with segment boundaries
+        num_segments_per_batch = np.floor((batch_size - frame_length) / frame_shift + 1)
+        batch_size = int((num_segments_per_batch - 1) * frame_shift + frame_length)
+        batch_shift = int(num_segments_per_batch * frame_shift)
+
+        N = np.zeros((num_segments, K), float_cpu())
+        acc_u_x = np.zeros((num_segments, K, self.eta.shape[1]), float_cpu())
+
+        sw_i = None
+        cur_segment = 0
+        for i1 in range(0, x.shape[0], batch_shift):
+            i2 = np.minimum(i1 + batch_size, x.shape[0])
+            x_i = x[i1:i2, :]
+            if sample_weight is not None:
+                sw_i = sample_weight[i1:i2]
+            N_i, u_x_i = self._accum_suff_stats_sorttime_1batch(
+                x_i, frame_length, frame_shift, sample_weight=sw_i
+            )
+            num_segments_i = N_i.shape[0]
+            N[cur_segment : cur_segment + num_segments_i] = N_i
+            acc_u_x[cur_segment : cur_segment + num_segments_i] = u_x_i
+            cur_segment += num_segments_i
+        return N, acc_u_x
+
+    def Estep(self, x, u_x=None, sample_weight=None, batch_size=None):
+        """Expectation step, accumulates suff. stats.
+
+        Args:
+          x: data samples of shape (num_samples, x_dim).
+          u_x: sufficient stats for x with shape = (num_samples, u(x)_dim) (optional).
+          sample_weight: weight of each sample in the accumulation.
+          batch_size: accumulates sufficient statistics in batch_size blocks.
+
+        Returns:
+          N zero order sufficient statistics (number of samples).
+          Accumulated sufficient statistics \sum u(x)
+        """
+        return self.accum_suff_stats(x, u_x, sample_weight, batch_size)
+
+    def sum_suff_stats(self, N, u_x):
+        """Sums suff. stats from multiple sub-processes.
+
+        Args:
+          N: zero order stats with shape = (num_proc,).
+          u_x: higher order stats with shape = (num_proc, u(x)_dim).
+
+        Returns:
+          Accumulated N and u_x.
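+
+        Example (illustrative sketch; stats lists from two worker processes
+        are assumed):
+            >>> N, u_x = model.sum_suff_stats([N_1, N_2], [u_x_1, u_x_2])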
+ """ + assert len(N) == len(u_x) + acc_N = N[0] + acc_u_x = u_x[0] + for i in range(1, len(N)): + acc_N += N[i] + acc_u_x += u_x[i] + return acc_N, acc_u_x + + def Mstep(self, stats): + """Maximization step.""" + pass + + def elbo(self, x, u_x=None, N=1, log_h=None, sample_weight=None, batch_size=None): + """Evidence lower bound. + + Args: + x: data samples with shape = (num_samples, x_dim). + u_x: accumlated u(x) (optional). + N: zero-th orders statistics (optional) + log_h: accumlated log h(x) (optional). + sample_weight: weigth of each sample in the loss function. + batch_size: accumlates sufficient statistics in batch_size blocks. + + Returns: + log p(X) of the data. + """ + if u_x is None: + N, u_x = self.accum_suff_stats( + x, sample_weight=sample_weight, batch_size=batch_size + ) + if log_h is None: + log_h = self.accum_log_h(x, sample_weight=sample_weight) + return log_h + np.sum(u_x * self.eta) + np.inner(N, self.log_pi - self.A) + + def log_prob(self, x, u_x=None, mode="nat"): + """log p(x) of each data sample. + + Args: + x: input data with shape (num_samples, x_dim). + u_x: sufficient stats u(x) with shape (num_samples, u_dim). + method: the probability is computed using standard ("std") or + natural parameters ("nat"). + + Returns: + log p(x) with shape (num_samples,) + """ + if mode == "nat": + return self.log_prob_nat(x, u_x) + else: + return self.log_prob_std(x) + + def log_prob_nat(self, x, u_x=None): + """log p(x) of each data sample computed using the + natural parameters of the distribution. + + Args: + x: input data with shape (num_samples, x_dim). + u_x: sufficient stats u(x) with shape (num_samples, u_dim). + + Returns: + log p(x) with shape (num_samples,) + """ + if u_x is None: + u_x = self.compute_suff_stats(x) + llk_k = np.dot(u_x, self.eta.T) - self.A + self.log_pi + llk = logsumexp(llk_k) + return self.log_h(x) + llk + + def log_prob_std(self, x): + """log p(x) of each data sample computed using the + standard parameters of the distribution. + + Args: + x: input data with shape (num_samples, x_dim). + u_x: sufficient stats u(x) with shape (num_samples, u_dim). + + Returns: + log p(x) with shape (num_samples,) + """ + raise NotImplementedError() + + def log_prob_nbest(self, x, u_x=None, mode="nat", nbest_mode="ubm", nbest=1): + """log p(x) of each data sample computed using the N best components. + + Args: + x: input data with shape (num_samples, x_dim). + u_x: sufficient stats u(x) with shape (num_samples, u_dim). + method: the probability is computed using standard ("std") or + natural parameters ("nat"). + nbest_mode: if "ubm", it selects the best components. + nbest: number of best components, or selected components. + + Returns: + log p(x) with shape (num_samples,) + """ + if mode == "nat": + return self.log_prob_nbest_nat(x, u_x, nbest_mode=nbest_mode, nbest=nbest) + else: + return self.log_prob_nbest_std(x, nbest_mode=nbest_mode, nbest=nbest) + + def log_prob_nbest_nat(self, x, u_x=None, nbest_mode="master", nbest=1): + """log p(x) of each data sample computed using the N best components + and natural parameters. + + Args: + x: input data with shape (num_samples, x_dim). + u_x: sufficient stats u(x) with shape (num_samples, u_dim). + nbest_mode: if "ubm", it selects the best components. + nbest: number of best components, or selected components. 
+ + Returns: + log p(x) with shape (num_samples,) + """ + if u_x is None: + u_x = self.compute_suff_stats(x) + if nbest_mode == "master": + assert isinstance(nbest, int) + llk_k = np.dot(u_x, self.eta.T) - self.A + self.log_pi + nbest = np.argsort(llk_k)[: -(nbest + 1) : -1] + llk_k = llk_k[nbest] + else: + llk_k = np.dot(u_x, self.eta[nbest, :].T) - self.A + self.log_pi + llk = logsumexp(llk_k) + return self.log_h(x) + llk + + def log_prob_nbest_std(self, x, nbest_mode="master", nbest=1): + """log p(x) of each data sample computed using the N best components + and standard parameters. + + Args: + x: input data with shape (num_samples, x_dim). + u_x: sufficient stats u(x) with shape (num_samples, u_dim). + nbest_mode: if "ubm", it selects the best components. + nbest: number of best components, or selected components. + + Returns: + log p(x) with shape (num_samples,) + """ + raise NotImplementedError() + + def get_config(self): + """Returns the model configuration dict.""" + config = {"min_n": self.min_N, "update_pi": self.update_pi} + base_config = super(ExpFamilyMixture, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def compute_A_nat(eta): + """Computes A_theta from the natural param.""" + raise NotImplementedError() + + @staticmethod + def compute_A_std(params): + """Computes A_theta from the standard param.""" + raise NotImplementedError() + + @staticmethod + def compute_eta(param): + """Computes the natural param. from the standard param.""" + raise NotImplementedError() + + @staticmethod + def compute_std(eta): + """Computes the standard param. from the natural param.""" + raise NotImplementedError() + + def _compute_nat_params(self): + pass + + def _compute_std_params(self): + pass diff --git a/hyperion/np/pdfs/mixtures/gmm.py b/hyperion/np/pdfs/mixtures/gmm.py new file mode 100644 index 00000000..934c6749 --- /dev/null +++ b/hyperion/np/pdfs/mixtures/gmm.py @@ -0,0 +1,581 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import h5py +import numpy as np +import scipy.linalg as la +from scipy.special import erf + +from ....hyp_defs import float_cpu +from ....utils.math_funcs import ( + fullcov_varfloor, + invert_pdmat, + invert_trimat, + logdet_pdmat, + logsumexp, + softmax, + symmat2vec, + vec2symmat, +) +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) +from ...clustering import KMeans +from ..core import Normal +from .exp_family_mixture import ExpFamilyMixture + + +class GMM(ExpFamilyMixture): + """Class for GMM with full covariance. + + Attributes: + num_comp: number of components of the mixture (intered from pi). + pi: weights of the components. + mu: mean with shape (num_comp, x_dim,) or None. + Lambda: precision with shape (num_comp, x_dim, x_dim) or None. + var_floor: variance floor. + update_mu: whether or not update mu when optimizing. + update_Lambda: wether or not update Lambda when optimizing. 
+ x_dim: data dim (infered from mu if present) + """ + + def __init__( + self, + num_comp=1, + pi=None, + mu=None, + Lambda=None, + var_floor=1e-3, + update_mu=True, + update_Lambda=True, + **kwargs + ): + if mu is not None: + assert mu.ndim == 2 + kwargs["x_dim"] = mu.shape[1] + super().__init__(num_comp=num_comp, pi=pi, **kwargs) + self.mu = mu + self.Lambda = Lambda + self.var_floor = var_floor + self.update_mu = update_mu + self.update_Lambda = update_Lambda + + self._compute_gmm_nat_std() + + self._logLambda = None + self._cholLambda = None + self._Sigma = None + + def _compute_gmm_nat_std(self): + """Comptues natural and standard parameters of the distribution.""" + if self.mu is not None and self.Lambda is not None: + self._validate_mu() + self._validate_Lambda() + self._compute_nat_params() + elif self.eta is not None: + self._validate_eta() + self.A = self.compute_A_nat(self.eta) + self._compute_std_params() + + def compute_Lambda_aux(self): + """Comptues auxiliary variables derived from the precision.""" + self._logLambda = np.zeros((self.num_comp,), dtype=float_cpu()) + self._cholLambda = np.zeros( + (self.num_comp, self.x_dim, self.x_dim), dtype=float_cpu() + ) + for i, L in enumerate(self.Lambda): + f, L, logL = invert_pdmat(L, return_logdet=True) + self._logLambda[i] = logL + self._cholLambda[i] = L.T + + @property + def logLambda(self): + """log precision determinants.""" + if self._logLambda is None: + self.compute_Lambda_aux() + return self._logLambda + + @property + def cholLambda(self): + """Cholesqy decomp. of the precisions.""" + if self._cholLambda is None: + self.compute_Lambda_aux() + return self._cholLambda + + @property + def Sigma(self): + """Covariances.""" + if self._Sigma is None: + self._Sigma = np.zeros( + (self.num_comp, self.x_dim, self.x_dim), dtype=float_cpu() + ) + for k in range(self.num_comp): + self._Sigma[k] = invert_pdmat(self.Lambda[k], return_inv=True)[-1] + return self._Sigma + + def initialize(self, x=None): + """Initializes the distribution.""" + if x is None and self.mu is None and self.eta is None: + assert self.num_comp == 1 + self._initialize_stdnormal() + if x is not None: + self._initialize_kmeans(self.num_comp, x) + self.validate() + self._compute_gmm_nat_std() + + def _initialize_stdnormal(self): + """Initializes a single component GMM with std. Normal.""" + self.pi = np.array([1], dtype=float_cpu()) + self.mu = np.zeros((1, self.x_dim), dtype=float_cpu()) + self.Lambda = np.zeros((1, self.x_dim, self.x_dim), dtype=float_cpu()) + self.Lambda[0] = np.eye(self.x_dim, dtype=float_cpu()) + + def _initialize_kmeans(self, num_comp, x): + """Initializes the GMM with K-Means. + + Args: + num_comp: number of components. + x: initialization data with shape (num_samples, x_dim). 
+ """ + if num_comp == 1: + self.pi = np.array([1], dtype=float_cpu()) + self.mu = np.mean(x, axis=0, keepdims=True) + self.Lambda = np.zeros((1, self.x_dim, self.x_dim), dtype=float_cpu()) + delta = x - self.mu + S = np.dot(delta.T, delta) / x.shape[0] + self.Lambda[0] = invert_pdmat(S, return_inv=True)[-1] + return + + kmeans = KMeans(num_clusters=num_comp, epochs=100) + loss, cluster_index = kmeans.fit(x) + + self.mu = kmeans.mu + self.pi = np.zeros((self.num_comp,), dtype=float_cpu()) + self.Lambda = np.zeros( + (self.num_comp, self.x_dim, self.x_dim), dtype=float_cpu() + ) + + for k in range(num_comp): + r = cluster_index == k + self.pi[k] = np.sum(r) / x.shape[0] + delta = x[r] - self.mu[k] + S = np.dot(delta.T, delta) / np.sum(r) + self.Lambda[k] = invert_pdmat(S, return_inv=True)[-1] + + def stack_suff_stats(self, F, S=None): + """Stacks F and S suff stats into single vector.""" + if S is None: + return F + return np.hstack((F, S)) + + def unstack_suff_stats(self, stats): + """Decomposes suff. stats vector into F and S.""" + F = stats[:, : self.x_dim] + S = stats[:, self.x_dim :] + return F, S + + def norm_suff_stats(self, N, u_x, return_order2=False): + """Normalizes accumlated sufficient statistics with the + mean and covariance of the distribution. + + Args: + N: zeroth order sufficient stats. + u_x: 1st and 2nd order stats. + return_order2: whether or not return normalized 2nd order stats. + + Return: + Normalized N, F or N, [F, S]. + """ + F, S = self.unstack_suff_stats(u_x) + F_norm = F - N[:, None] * self.mu + for k in range(self.num_comp): + F_norm[k] = np.dot(F_norm[k], self.cholLambda[k].T) + if return_order2: + SS = vec2symmat(S[k]) + Fmu = np.outer(self.F[k], self.mu[k]) + SS = SS - Fmu - Fmu.T + N * np.outer(self.mu[k], self.mu[k]) + SS = np.dot(self.cholLambda[k], np.dot(SS, self.cholLambda[k].T)) + S[k] = symmat2vec(SS) + if return_order2: + return N, self.stack_suff_stats(F_norm, S) + return N, F_norm + + def Mstep(self, N, u_x): + """Maximization step. + + Args: + N: zeroth order stats. + u_x: accumlated higher order stats. + + """ + F, S = self.unstack_suff_stats(u_x) + + if self.update_mu: + self.mu = F / N[:, None] + + if self.update_Lambda: + C = np.zeros((self.num_comp, self.x_dim, self.x_dim), dtype=float_cpu()) + for k in range(self.num_comp): + C[k] = vec2symmat(S[k] / N[k]) + C[k] -= np.outer(self.mu[k], self.mu[k]) + Sfloor = self.var_floor * np.mean(C, axis=0) + cholfloor = la.cholesky(Sfloor, overwrite_a=True) + for k in range(self.num_comp): + C[k] = fullcov_varfloor(C[k], cholfloor, F_is_chol=True) + self.Lambda[k] = invert_pdmat(C[k], return_inv=True)[-1] + self._Sigma = None + self._logLambda = None + self._cholLambda = None + + if self.update_pi: + N0 = N < self.min_N + if np.any(N0): + N[N0] = 0 + self.mu[N0] = 0 + self.Lambda[N0] = np.eye(self.x_dim) + self.pi = N / np.sum(N) + self._log_pi = None + + self._compute_nat_params() + + def split_comp(self, K=2): + """Creates a new GMM with K x num_componentes. + + Args: + K: multiplier for the number of components + + Returns: + GMM object. 
+ """ + num_comp = self.num_comp * K + pi = np.repeat(self.pi, K) / K + Lambda = np.repeat(self.Lambda, K, axis=0) * (K**2) + mu = np.repeat(self.mu, K, axis=0) + + for g in range(self.num_comp): + w, v = la.eigh(self.Sigma[g]) + v *= np.sqrt(v) + if K == 2: + std_dev = np.sum(v, axis=1) + mu[2 * g] += std_dev + mu[2 * g + 1] -= std_dev + else: + for k in range(K): + factor = 2 * (np.random.uniform(size=(v.shape[1],)) > 0.5) - 1 + std_dev = np.sum(v * factor, axis=1) + mu[K * g + k] += std_dev + + config = self.get_config() + return GMM(pi=pi, mu=mu, Lambda=Lambda, **config) + + def log_prob_std(self, x): + """log p(x) of each data sample computed using the + standard parameters of the distribution. + + Args: + x: input data with shape (num_samples, x_dim). + + Returns: + log p(x) with shape (num_samples,) + """ + r0 = self.log_pi + 0.5 * self.logLambda - 0.5 * self.x_dim * np.log(2 * np.pi) + llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu()) + for k in range(self.num_comp): + mah_dist2 = np.sum(np.dot(x - self.mu[k], self.cholLambda[k]) ** 2, axis=1) + llk_k[:, k] = r0[k] - 0.5 * mah_dist2 + + return logsumexp(llk_k, axis=-1) + + def sample(self, num_samples, rng=None, seed=1024, r=None): + """Draws samples from the data distribution. + + Args: + num_samples: number of samples. + rng: random number generator. + seed: random seed used if rng is None. + + Returns: + Generated samples with shape (num_samples, x_dim). + """ + if rng is None: + rng = np.random.default_rng(seed) + + if r is None: + r = rng.multinomial(1, self.pi, size=(num_samples,)) + else: + num_samples = len(r) + + x = np.zeros((num_samples, self.x_dim), dtype=float_cpu()) + for k in range(self.num_comp): + index = r[:, k] == 1 + n_k = np.sum(index) + if n_k == 0: + continue + x[index] = rng.multivariate_normal( + self.mu[k], self.Sigma[k], size=(n_k,) + ).astype(float_cpu()) + + return x + + def get_config(self): + """Returns the model configuration dict.""" + config = { + "var_floor": self.var_floor, + "update_mu": self.update_mu, + "update_lambda": self.update_Lambda, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ + params = {"pi": self.pi, "mu": self.mu, "Lambda": self.Lambda} + self._save_params_from_dict(f, params) + + @classmethod + def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ + param_list = ["pi", "mu", "Lambda"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls( + x_dim=config["x_dim"], + pi=params["pi"], + mu=params["mu"], + Lambda=params["Lambda"], + var_floor=config["var_floor"], + min_N=config["min_n"], + update_pi=config["update_pi"], + update_mu=config["update_mu"], + update_Lambda=config["update_lambda"], + name=config["name"], + ) + + @classmethod + def load_from_kaldi(cls, file_path): + """Loads GMM from Kaldi file. + + Args: + file_path: kaldi file path. + + Returns: + Model object. 
+ """ + pi = None + eta1 = None + eta2 = None + num_comp = 0 + x_dim = 0 + success = False + with open(file_path, "r") as f: + while True: + line = f.readline() + if not line: + break + fields = line.rstrip().split() + if fields[0] == "": + pi = np.array([float(v) for v in fields[2:-1]], dtype=float_cpu()) + num_comp = len(pi) + elif fields[0] == "": + for k in range(num_comp): + line = f.readline() + fields = line.split() + if x_dim == 0: + x_dim = len(fields) + eta1 = np.zeros((num_comp, x_dim), dtype=float_cpu()) + eta2 = np.zeros( + (num_comp, int((x_dim**2 + 3 * x_dim) / 2)), + dtype=float_cpu(), + ) + + assert len(fields) == x_dim or len(fields) == x_dim + 1 + eta1[k] = [float(v) for v in fields[:x_dim]] + elif fields[0] == "": + L = np.zeros((x_dim, x_dim), dtype=float_cpu()) + for k in range(num_comp): + L[:, :] = 0 + for j in range(x_dim): + line = f.readline() + fields = line.split() + if j < x_dim - 1: + assert len(fields) == j + 1 + else: + assert len(fields) == x_dim + 1 + L[j, : j + 1] = [float(v) for v in fields[: j + 1]] + eta2[k] = -symmat2vec(L.T, diag_factor=0.5) + if k == num_comp - 1: + success = True + assert success + eta = np.hstack((eta1, eta2)) + return cls(x_dim=x_dim, pi=pi, eta=eta) + + def _validate_mu(self): + assert self.mu.shape[0] == self.num_comp + assert self.mu.shape[1] == self.x_dim + + def _validate_Lambda(self): + assert self.Lambda.shape[0] == self.num_comp + assert self.Lambda.shape[1] == self.x_dim + assert self.Lambda.shape[2] == self.x_dim + + def _validate_eta(self): + assert self.eta.shape[0] == self.num_comp + assert self.eta.shape[1] == (self.x_dim**2 + 3 * self.x_dim) / 2 + + def validate(self): + """Validates the parameters of the distribution.""" + if self.pi is not None: + self._validate_pi() + + if self.mu is not None and self.Lambda is not None: + self._validate_mu() + self._validate_Lambda() + + if self.eta is not None: + self._validate_eta() + + @staticmethod + def compute_eta(mu, Lambda): + """Computes nat param. from mean and precision.""" + x_dim = mu.shape[-1] + eta_dim = int((x_dim**2 + 3 * x_dim) / 2) + eta = np.zeros((mu.shape[0], eta_dim), dtype=float_cpu()) + for k in range(mu.shape[0]): + eta[k] = Normal.compute_eta(mu[k], Lambda[k]) + + return eta + + @staticmethod + def compute_std(eta): + """Computes standard params. from the natural param.""" + x_dim = Normal.compute_x_dim_from_eta(eta) + mu = np.zeros((eta.shape[0], x_dim), dtype=float_cpu()) + Lambda = np.zeros((eta.shape[0], x_dim, x_dim), dtype="float32") + for k in range(eta.shape[0]): + mu[k], Lambda[k] = Normal.compute_std(eta[k]) + + return mu, Lambda + + @staticmethod + def compute_A_nat(eta): + """Computes A from the natural param.""" + A = np.zeros((eta.shape[0],), dtype=float_cpu()) + for k in range(eta.shape[0]): + A[k] = Normal.compute_A_nat(eta[k]) + + return A + + @staticmethod + def compute_A_std(mu, Lambda): + """Computes A from the standard params.""" + A = np.zeros((mu.shape[0],), dtype=float_cpu()) + for k in range(mu.shape[0]): + A[k] = Normal.compute_A_std(mu[k], Lambda[k]) + + return A + + def _compute_nat_params(self): + self.eta = self.compute_eta(self.mu, self.Lambda) + self.A = self.compute_A_nat(self.eta) + + def _compute_std_params(self): + self.mu, self.Lambda = self.compute_std(self.eta) + self._cholLambda = None + self._logLambda = None + self._Sigma = None + + @staticmethod + def compute_suff_stats(x): + """Computes the sufficient stats. for each sample. + + Args: + x: data samples with shape (num_samples, x_dim). 
+
+        Returns:
+          Sufficient stats. for each data sample with shape (num_samples, u_dim).
+        """
+        d = x.shape[1]
+        u = np.zeros((x.shape[0], int(d + d * (d + 1) / 2)), dtype=float_cpu())
+        u[:, :d] = x
+        k = d
+        for i in range(d):
+            for j in range(i, d):
+                u[:, k] = x[:, i] * x[:, j]
+                k += 1
+        return u
+
+    def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs):
+        """Plots one slice of each GMM component in 1d.
+
+        Args:
+          feat_idx: feature index.
+          num_sigmas: size of the plot in number of standard devs.
+          num_pts: number of points in the graph.
+          **kwargs: pyplot options.
+        """
+        mu = self.mu[:, feat_idx]
+        for k in range(mu.shape[0]):
+            C = invert_pdmat(self.Lambda[k], return_inv=True)[-1][feat_idx, feat_idx]
+            plot_gaussian_1D(mu[k], C, num_sigmas, num_pts, **kwargs)
+
+    def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 2 dimensions of each GMM component in 2d.
+
+        Args:
+          feat_idx: feature indices.
+          num_sigmas: size of the plot in number of standard devs.
+          num_pts: number of points in the graph.
+          **kwargs: pyplot options.
+        """
+        mu = self.mu[:, feat_idx]
+        j, i = np.meshgrid(feat_idx, feat_idx)
+        for k in range(mu.shape[0]):
+            C_k = invert_pdmat(self.Lambda[k], return_inv=True)[-1][i, j]
+            plot_gaussian_ellipsoid_2D(mu[k], C_k, num_sigmas, num_pts, **kwargs)
+
+    def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 2 dimensions of each GMM component in 3d.
+
+        Args:
+          feat_idx: feature indices.
+          num_sigmas: size of the plot in number of standard devs.
+          num_pts: number of points in the graph.
+          **kwargs: pyplot options.
+        """
+        mu = self.mu[:, feat_idx]
+        j, i = np.meshgrid(feat_idx, feat_idx)
+        for k in range(mu.shape[0]):
+            C_k = invert_pdmat(self.Lambda[k], return_inv=True)[-1][i, j]
+            plot_gaussian_3D(mu[k], C_k, num_sigmas, num_pts, **kwargs)
+
+    def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 3 dimensions of each GMM component in 3d.
+
+        Args:
+          feat_idx: feature indices.
+          num_sigmas: size of the plot in number of standard devs.
+          num_pts: number of points in the graph.
+          **kwargs: pyplot options.
+        """
+        mu = self.mu[:, feat_idx]
+        j, i = np.meshgrid(feat_idx, feat_idx)
+        for k in range(mu.shape[0]):
+            C_k = invert_pdmat(self.Lambda[k], return_inv=True)[-1][i, j]
+            plot_gaussian_ellipsoid_3D(mu[k], C_k, num_sigmas, num_pts, **kwargs)
diff --git a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py
new file mode 100644
index 00000000..c3985aef
--- /dev/null
+++ b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py
@@ -0,0 +1,525 @@
+"""
+ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import h5py
+import numpy as np
+from scipy.special import erf
+
+from ....hyp_defs import float_cpu
+from ....utils.math_funcs import logsumexp, softmax
+from ....utils.plotting import (
+    plot_gaussian_1D,
+    plot_gaussian_3D,
+    plot_gaussian_ellipsoid_2D,
+    plot_gaussian_ellipsoid_3D,
+)
+from ...clustering import KMeans
+from .exp_family_mixture import ExpFamilyMixture
+
+
+class GMMDiagCov(ExpFamilyMixture):
+    """Class for GMM with diagonal covariance.
+
+    Attributes:
+      num_comp: number of components of the mixture (inferred from pi).
+      pi: weights of the components.
+      mu: mean with shape (num_comp, x_dim) or None.
+      Lambda: precision with shape (num_comp, x_dim) or None.
+      var_floor: variance floor.
+      update_mu: whether or not to update mu when optimizing.
+ update_Lambda: wether or not update Lambda when optimizing. + x_dim: data dim (infered from mu if present) + """ + + def __init__( + self, + num_comp=1, + pi=None, + mu=None, + Lambda=None, + var_floor=1e-3, + update_mu=True, + update_Lambda=True, + **kwargs + ): + if mu is not None: + assert mu.ndim == 2 + kwargs["x_dim"] = mu.shape[1] + + super().__init__(num_comp=num_comp, pi=pi, **kwargs) + self.mu = mu + self.Lambda = Lambda + self.var_floor = var_floor + self.update_mu = update_mu + self.update_Lambda = update_Lambda + + self._compute_gmm_nat_std() + + self._logLambda = None + self._cholLambda = None + self._Sigma = None + + def _compute_gmm_nat_std(self): + if self.mu is not None and self.Lambda is not None: + self._validate_mu() + self._validate_Lambda() + self._compute_nat_params() + elif self.eta is not None: + self._validate_eta() + self.A = self.compute_A_nat(self.eta) + self._compute_std_params() + + @property + def logLambda(self): + """log precision determinants.""" + if self._logLambda is None: + self._logLambda = np.sum(np.log(self.Lambda), axis=-1) + return self._logLambda + + @property + def cholLambda(self): + """Cholesqy decomp. of the precisions.""" + if self._cholLambda is None: + self._cholLambda = np.sqrt(self.Lambda) + return self._cholLambda + + @property + def Sigma(self): + """Covariances.""" + if self._Sigma is None: + self._Sigma = 1.0 / self.Lambda + return self._Sigma + + def initialize(self, x=None): + """Initializes the distribution.""" + if x is None and self.mu is None and self.eta is None: + assert self.num_comp == 1 + self._initialize_stdnormal() + if x is not None: + self._initialize_kmeans(self.num_comp, x) + self.validate() + self._compute_gmm_nat_std() + + def _initialize_stdnormal(self): + """Initializes a single component GMM with std. Normal.""" + self.pi = np.array([1], dtype=float_cpu()) + self.mu = np.zeros((1, self.x_dim), dtype=float_cpu()) + self.Lambda = np.ones((1, self.x_dim), dtype=float_cpu()) + + def _initialize_kmeans(self, num_comp, x): + """Initializes the GMM with K-Means. + + Args: + num_comp: number of components. + x: initialization data with shape (num_samples, x_dim). + """ + if num_comp == 1: + self.pi = np.array([1], dtype=float_cpu()) + self.mu = np.mean(x, axis=0, keepdims=True) + self.Lambda = 1 / np.std(x, axis=0, keepdims=True) ** 2 + return + + kmeans = KMeans(num_clusters=num_comp, epochs=100) + loss, cluster_index = kmeans.fit(x) + + self.mu = kmeans.mu + self.pi = np.zeros((self.num_comp,), dtype=float_cpu()) + self.Lambda = np.zeros((self.num_comp, x.shape[-1]), dtype=float_cpu()) + for k in range(num_comp): + r = cluster_index == k + self.pi[k] = np.sum(r) / x.shape[0] + self.Lambda[k] = 1 / np.std(x[r], axis=0) ** 2 + + def stack_suff_stats(self, F, S=None): + """Stacks F and S suff stats into single vector.""" + if S is None: + return F + return np.hstack((F, S)) + + def unstack_suff_stats(self, stats): + """Decomposes suff. stats vector into F and S.""" + F = stats[:, : self.x_dim] + S = stats[:, self.x_dim :] + return F, S + + def norm_suff_stats(self, N, u_x, return_order2=False): + """Normalizes accumlated sufficient statistics with the + mean and covariance of the distribution. + + Args: + N: zeroth order sufficient stats. + u_x: 1st and 2nd order stats. + return_order2: whether or not return normalized 2nd order stats. + + Return: + Normalized N, F or N, [F, S]. 
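The `norm_suff_stats` method that follows centers the accumulated first-order stats by the model mean and whitens them with the diagonal precision. A standalone numpy sketch of the same algebra for a single component (toy values, not hyperion code):

```python
import numpy as np

# F_norm = sqrt(Lambda) * (F - N * mu), as in norm_suff_stats below.
N = 10.0
mu = np.array([1.0, -1.0])
Lambda = np.array([4.0, 0.25])           # diagonal precisions
F = N * mu + np.array([0.2, -0.1])       # accumulated sum of the samples
F_norm = np.sqrt(Lambda) * (F - N * mu)  # -> [0.4, -0.05]
```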
+ """ + F, S = self.unstack_suff_stats(u_x) + F_norm = self.cholLambda * (F - N[:, None] * self.mu) + if return_order2: + S = S - 2 * self.mu * F + N * self.mu**2 + S *= self.Lambda + return N, self.stack_suff_stats(F_norm, S) + + return N, F_norm + + def Mstep(self, N, u_x): + """Maximization step. + + Args: + N: zeroth order stats. + u_x: accumlated higher order stats. + + """ + F, S = self.unstack_suff_stats(u_x) + + if self.update_mu: + self.mu = F / N[:, None] + + if self.update_Lambda: + S = S / N[:, None] - self.mu**2 + S_floor = self.var_floor * np.mean(S[N > self.min_N], axis=0) + S_floor = np.maximum(S_floor, 1e-10) + S = np.maximum(S, S_floor) + self.Lambda = 1 / S + self._Sigma = S + self._cholLambda = None + self._logLambda = None + + if self.update_pi: + N0 = N < self.min_N + if np.any(N0): + N[N0] = 0 + self.mu[N0] = 0 + self._Sigma[N0] = 1 + self.Lambda[N0] = 1 + self.pi = N / np.sum(N) + self._log_pi = None + + self._compute_nat_params() + + def split_comp(self, K=2): + """Creates a new GMM with K x num_componentes. + + Args: + K: multiplier for the number of components + + Returns: + GMMDiagConv object. + """ + std_dev = 1 / self.cholLambda + + num_comp = self.num_comp * K + pi = np.repeat(self.pi, K) / K + Lambda = np.repeat(self.Lambda, K, axis=0) * (K**2) + mu = np.repeat(self.mu, K, axis=0) + + if K == 2: + mu[::2] += std_dev + mu[1::2] -= std_dev + else: + for k in range(K): + factor = 2 * (np.random.uniform(size=std_dev.shape) > 0.5) - 1 + mu[k::K] += factor * std_dev + + config = self.get_config() + return GMMDiagCov(pi=pi, mu=mu, Lambda=Lambda, **config) + + def log_prob_std(self, x): + """log p(x) of each data sample computed using the + standard parameters of the distribution. + + Args: + x: input data with shape (num_samples, x_dim). + + Returns: + log p(x) with shape (num_samples,) + """ + r0 = self.log_pi + 0.5 * self.logLambda - 0.5 * self.x_dim * np.log(2 * np.pi) + llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu()) + for k in range(self.num_comp): + mah_dist2 = np.sum(((x - self.mu[k]) * self.cholLambda[k]) ** 2, axis=-1) + llk_k[:, k] = r0[k] - 0.5 * mah_dist2 + return logsumexp(llk_k, axis=-1) + + def log_cdf(self, x): + """Log cumulative distribution function.""" + llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu()) + for k in range(self.num_comp): + delta = (x - self.mu[k]) * self.cholLambda[k] + lk = 0.5 * (1 + erf(delta / np.sqrt(2))) + llk_k[:, k] = self.log_pi[k] + np.sum(np.log(lk + 1e-20), axis=-1) + + return logsumexp(llk_k) + + def sample(self, num_samples=1, rng=None, seed=1024, r=None): + """Draws samples from the data distribution. + + Args: + num_samples: number of samples. + rng: random number generator. + seed: random seed used if rng is None. + + Returns: + Generated samples with shape (num_samples, x_dim). 
+ """ + if rng is None: + rng = np.random.default_rng(seed) + + if r is None: + r = rng.multinomial(1, self.pi, size=(num_samples,)) + else: + num_samples = len(r) + x = rng.normal(size=(num_samples, self.x_dim)).astype(float_cpu()) + + for k in range(self.num_comp): + index = r[:, k] == 1 + x[index] = 1.0 / self.cholLambda[k] * x[index] + self.mu[k] + + return x + + def get_config(self): + """Returns the model configuration dict.""" + config = { + "var_floor": self.var_floor, + "update_mu": self.update_mu, + "update_lambda": self.update_Lambda, + } + base_config = super(GMMDiagCov, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ + params = {"pi": self.pi, "mu": self.mu, "Lambda": self.Lambda} + self._save_params_from_dict(f, params) + + @classmethod + def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ + param_list = ["pi", "mu", "Lambda"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls( + x_dim=config["x_dim"], + pi=params["pi"], + mu=params["mu"], + Lambda=params["Lambda"], + var_floor=config["var_floor"], + min_N=config["min_n"], + update_pi=config["update_pi"], + update_mu=config["update_mu"], + update_Lambda=config["update_lambda"], + name=config["name"], + ) + + @classmethod + def load_from_kaldi(cls, file_path): + """Loads GMM from Kaldi file. + + Args: + file_path: kaldi file path. + + Returns: + Model object. + """ + pi = None + eta1 = None + eta2 = None + num_comp = 0 + x_dim = 0 + success = False + with open(file_path, "r") as f: + while True: + line = f.readline() + if not line: + break + fields = line.rstrip().split() + if fields[0] == "": + pi = np.array([float(v) for v in fields[2:-1]], dtype=float_cpu()) + num_comp = len(pi) + elif fields[0] == "": + for k in range(num_comp): + line = f.readline() + fields = line.split() + if x_dim == 0: + x_dim = len(fields) + eta1 = np.zeros((num_comp, x_dim), dtype=float_cpu()) + eta2 = np.zeros((num_comp, x_dim), dtype=float_cpu()) + + assert len(fields) == x_dim or len(fields) == x_dim + 1 + eta1[k] = [float(v) for v in fields[:x_dim]] + elif fields[0] == "": + for k in range(num_comp): + line = f.readline() + fields = line.split() + assert len(fields) == x_dim or len(fields) == x_dim + 1 + eta2[k] = [-0.5 * float(v) for v in fields[:x_dim]] + if k == num_comp - 1: + success = True + assert success + eta = np.hstack((eta1, eta2)) + return cls(x_dim=x_dim, pi=pi, eta=eta) + + def _validate_mu(self): + assert self.mu.shape[0] == self.num_comp + assert self.mu.shape[1] == self.x_dim + + def _validate_Lambda(self): + assert self.Lambda.shape[0] == self.num_comp + assert self.Lambda.shape[1] == self.x_dim + assert np.all(self.Lambda > 0) + + def _validate_eta(self): + assert self.eta.shape[0] == self.num_comp + assert self.eta.shape[1] == self.x_dim * 2 + + def validate(self): + """Validates the parameters of the distribution.""" + if self.pi is not None: + self._validate_pi() + + if self.mu is not None and self.Lambda is not None: + self._validate_mu() + self._validate_Lambda() + + if self.eta is not None: + self._validate_eta() + + @staticmethod + def compute_eta(mu, Lambda): + """Computes nat param. 
from mean and precision.""" + Lmu = Lambda * mu + eta = np.hstack((Lmu, -0.5 * Lambda)) + return eta + + @staticmethod + def compute_std(eta): + """Computes standard params. from the natural param.""" + x_dim = int(eta.shape[-1] / 2) + eta1 = eta[:, :x_dim] + eta2 = eta[:, x_dim:] + mu = -0.5 * eta1 / eta2 + Lambda = -2 * eta2 + return mu, Lambda + + @staticmethod + def compute_A_nat(eta): + """Computes A from the natural param.""" + x_dim = int(eta.shape[-1] / 2) + eta1 = eta[:, :x_dim] + eta2 = eta[:, x_dim:] + r1 = 0.5 * x_dim * np.log(2 * np.pi) + r2 = -1 / 4 * np.sum(eta1 * eta1 / eta2, axis=-1) + r3 = -1 / 2 * np.sum(np.log(-2 * eta2), axis=-1) + return r1 + r2 + r3 + + @staticmethod + def compute_A_std(mu, Lambda): + """Computes A from the standard params.""" + x_dim = mu.shape[1] + r1 = 0.5 * x_dim * np.log(2 * np.pi) + r2 = -0.5 * np.sum(np.log(Lambda), axis=-1) + r3 = 0.5 * np.sum(mu * mu * Lambda, axis=-1) + return r1 + r2 + r3 + + def _compute_nat_params(self): + self.eta = self.compute_eta(self.mu, self.Lambda) + self.A = self.compute_A_nat(self.eta) + + def _compute_std_params(self): + self.mu, self.Lambda = self.compute_std(self.eta) + self._cholLambda = None + self._logLambda = None + self._Sigma = None + + @staticmethod + def compute_suff_stats(x): + """Computes the sufficient stats. for each sample. + + Args: + x: data samples with shape (num_samples, x_dim). + + Returns: + Sufficient stats. for each data sample with shape (num_samples, u_dim). + """ + d = x.shape[-1] + u = np.zeros((x.shape[0], 2 * d), dtype=float_cpu()) + u[:, :d] = x + u[:, d:] = x * x + return u + + def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs): + """Plots one slice of each GMM component in 1d. + + Args: + feat_idx: feature index. + num_sigmas: size of the plot in number of standard devs. + num_pts: number of points in the graph. + **kwargs: pyplot options. + """ + mu = self.mu[:, feat_idx] + C = 1 / self.Lambda[:, feat_idx] + for k in range(mu.shape[0]): + plot_gaussian_1D(mu[k], C[k], num_sigmas, num_pts, **kwargs) + + def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): + """Plots 2 dimensions of each GMM component in 2d. + + Args: + feat_idx: feature indeces. + num_sigmas: size of the plot in number of standard devs. + num_pts: number of points in the graph. + **kwargs: pyplot options. + """ + mu = self.mu[:, feat_idx] + C = 1 / self.Lambda[:, feat_idx] + for k in range(mu.shape[0]): + C_k = np.diag(C[k]) + plot_gaussian_ellipsoid_2D(mu[k], C_k, num_sigmas, num_pts, **kwargs) + + def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): + """Plots 2 dimensions of each GMM component in 3d. + + Args: + feat_idx: feature indeces. + num_sigmas: size of the plot in number of standard devs. + num_pts: number of points in the graph. + **kwargs: pyplot options. + """ + mu = self.mu[:, feat_idx] + C = 1 / self.Lambda[:, feat_idx] + for k in range(mu.shape[0]): + C_k = np.diag(C[k]) + plot_gaussian_3D(mu[k], C_k, num_sigmas, num_pts, **kwargs) + + def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs): + """Plots 3 dimensions of each GMM component in 3d. + + Args: + feat_idx: feature indeces. + num_sigmas: size of the plot in number of standard devs. + num_pts: number of points in the graph. + **kwargs: pyplot options. 
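With the class essentially complete, a small usage sketch (assumes the hyperion package is installed and, per the class docstring, that `num_comp` is consistent with `pi`; numbers are arbitrary):

```python
import numpy as np
from hyperion.np.pdfs.mixtures.gmm_diag_cov import GMMDiagCov

# toy 2-component, 2-dimensional diagonal GMM
pi = np.array([0.4, 0.6])
mu = np.array([[-2.0, 0.0], [2.0, 0.0]])
Lambda = np.ones((2, 2))                        # unit diagonal precisions
gmm = GMMDiagCov(num_comp=2, pi=pi, mu=mu, Lambda=Lambda)

x = gmm.sample(num_samples=5, seed=0)           # draw from the mixture
print(gmm.log_prob_std(x))                      # per-sample log-likelihoods
```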
+ """ + mu = self.mu[:, feat_idx] + C = 1 / self.Lambda[:, feat_idx] + for k in range(mu.shape[0]): + C_k = np.diag(C[k]) + plot_gaussian_ellipsoid_3D(mu[k], C_k, num_sigmas, num_pts, **kwargs) + + +DiagGMM = GMMDiagCov diff --git a/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py new file mode 100644 index 00000000..d696bbac --- /dev/null +++ b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py @@ -0,0 +1,292 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import h5py +import numpy as np +from scipy.special import erf + +from ....hyp_defs import float_cpu +from ....utils.math_funcs import logsumexp, softmax +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) +from ...clustering import KMeans +from .gmm_diag_cov import GMMDiagCov + + +class GMMTiedDiagCov(GMMDiagCov): + """Class for GMM with diagonal covariance tied across components. + + Attributes: + num_comp: number of components of the mixture (intered from pi). + pi: weights of the components. + mu: mean with shape (num_comp, x_dim,) or None. + Lambda: precision with shape (num_comp, x_dim, x_dim) or None. + var_floor: variance floor. + update_mu: whether or not update mu when optimizing. + update_Lambda: wether or not update Lambda when optimizing. + x_dim: data dim (infered from mu if present) + """ + + def __init__( + self, + num_comp=1, + pi=None, + mu=None, + Lambda=None, + var_floor=1e-3, + update_mu=True, + update_Lambda=True, + **kwargs + ): + super().__init__( + num_comp=num_comp, + pi=pi, + mu=mu, + Lambda=Lambda, + var_floor=var_floor, + update_mu=update_mu, + update_Lambda=update_Lambda, + **kwargs + ) + + def _compute_gmm_nat_std(self): + if self.mu is not None and self.Lambda is not None: + self._validate_mu() + self._validate_Lambda() + self._compute_nat_params() + elif self.eta is not None: + self._validate_eta() + self.A = self.compute_A_nat(self.eta) + self._compute_std_params() + + def _initialize_stdnormal(self): + """Initializes a single component GMM with std. Normal.""" + self.pi = np.array([1], dtype=float_cpu()) + self.mu = np.zeros((1, self.x_dim), dtype=float_cpu()) + self.Lambda = np.ones((self.x_dim,), dtype=float_cpu()) + + def _initialize_kmeans(self, num_comp, x): + """Initializes the GMM with K-Means. + + Args: + num_comp: number of components. + x: initialization data with shape (num_samples, x_dim). + """ + if num_comp == 1: + self.pi = np.array([1], dtype=float_cpu()) + self.mu = np.mean(x, axis=0, keepdims=True) + self.Lambda = 1 / np.std(x, axis=0, keepdims=True) ** 2 + return + + kmeans = KMeans(num_clusters=num_comp, epochs=100) + loss, cluster_index = kmeans.fit(x) + + self.mu = kmeans.mu + self.pi = np.zeros((self.num_comp,), dtype=float_cpu()) + C = np.zeros((x.shape[-1],), dtype=float_cpu()) + for k in range(num_comp): + r = cluster_index == k + self.pi[k] = np.sum(r) / x.shape[0] + delta = x[r] - self.mu[k] + C += np.sum(delta**2, axis=0) + + self.Lambda = x.shape[0] / C + + def Mstep(self, N, u_x): + """Maximization step. + + Args: + N: zeroth order stats. + u_x: accumlated higher order stats. 
+ + """ + F, S = self.unstack_suff_stats(u_x) + + if self.update_mu: + self.mu = F / N[:, None] + + if self.update_Lambda: + S = S / N[:, None] - self.mu**2 + S_floor = self.var_floor * np.mean(S[N > self.min_N], axis=0) + S = np.maximum(S, S_floor) + Spool = np.sum(N[:, None] * S, axis=0) / np.sum(N) + self.Lambda = 1 / Spool + self._Sigma = Spool + self._cholLambda = None + self._logLambda = None + + if self.update_pi: + N0 = N < self.min_N + if np.any(N0): + N[N0] = 0 + self.mu[N0] = 0 + + self.pi = N / np.sum(N) + self._log_pi = None + + self._compute_nat_params() + + def split_comp(self, K=2): + """Creates a new GMM with K x num_componentes. + + Args: + K: multiplier for the number of components + + Returns: + GMMTiedDiagConv object. + """ + std_dev = 1 / self.cholLambda + + num_comp = self.num_comp * K + pi = np.repeat(self.pi, K) / K + mu = np.repeat(self.mu, K, axis=0) + + if K == 2: + mu[::2] += std_dev + mu[1::2] -= std_dev + else: + for k in range(K): + factor = 2 * (np.random.uniform(size=std_dev.shape) > 0.5) - 1 + mu[k::K] += factor * std_dev + + config = self.get_config() + return DiagGMMTiedCov(pi=pi, mu=mu, Lambda=self.Lambda, **config) + + def log_prob_std(self, x): + """log p(x) of each data sample computed using the + standard parameters of the distribution. + + Args: + x: input data with shape (num_samples, x_dim). + + Returns: + log p(x) with shape (num_samples,) + """ + r0 = self.log_pi + 0.5 * self.logLambda - 0.5 * self.x_dim * np.log(2 * np.pi) + llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu()) + for k in range(self.num_comp): + mah_dist2 = np.sum(((x - self.mu[k]) * self.cholLambda) ** 2, axis=-1) + llk_k[:, k] = r0[k] - 0.5 * mah_dist2 + return logsumexp(llk_k, axis=-1) + + def log_cdf(self, x): + """Log cumulative distribution function.""" + llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu()) + for k in range(self.num_comp): + delta = (x - self.mu[k]) * self.cholLambda + lk = 0.5 * (1 + erf(delta / np.sqrt(2))) + llk_k[:, k] = self.log_pi[k] + np.sum(np.log(lk + 1e-20), axis=-1) + + return logsumexp(llk_k) + + def sample(self, num_samples=1, rng=None, seed=1024, r=None): + """Draws samples from the data distribution. + + Args: + num_samples: number of samples. + rng: random number generator. + seed: random seed used if rng is None. + + Returns: + Generated samples with shape (num_samples, x_dim). + """ + if rng is None: + rng = np.random.default_rng(seed) + + if r is None: + r = rng.multinomial(1, self.pi, size=(num_samples,)) + else: + num_samples = len(r) + x = rng.normal(size=(num_samples, self.x_dim)).astype(float_cpu()) + + for k in range(self.num_comp): + index = r[:, k] == 1 + x[index] = 1.0 / self.cholLambda * x[index] + self.mu[k] + + return x + + def _validate_Lambda(self): + assert self.Lambda.shape[0] == self.x_dim + assert np.all(self.Lambda > 0) + + @staticmethod + def compute_eta(mu, Lambda): + """Computes nat param. from mean and precision.""" + Lmu = Lambda * mu + eta = np.hstack((Lmu, -0.5 * np.tile(Lambda, (mu.shape[0], 1)))) + return eta + + @staticmethod + def compute_std(eta): + """Computes standard params. from the natural param.""" + x_dim = int(eta.shape[-1] / 2) + eta1 = eta[:, :x_dim] + eta2 = eta[:, x_dim:] + mu = -0.5 * eta1 / eta2 + Lambda = -2 * eta2[0] + return mu, Lambda + + def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs): + """Plots one slice of each GMM component in 1d. + + Args: + feat_idx: feature index. + num_sigmas: size of the plot in number of standard devs. 
+ num_pts: number of points in the graph. + **kwargs: pyplot options. + """ + mu = self.mu[:, feat_idx] + C = 1 / self.Lambda[feat_idx] + for k in range(mu.shape[0]): + plot_gaussian_1D(mu[k], C, num_sigmas, num_pts, **kwargs) + + def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): + """Plots 2 dimensions of each GMM component in 2d. + + Args: + feat_idx: feature indeces. + num_sigmas: size of the plot in number of standard devs. + num_pts: number of points in the graph. + **kwargs: pyplot options. + """ + mu = self.mu[:, feat_idx] + C = np.diag(1 / self.Lambda[feat_idx]) + for k in range(mu.shape[0]): + plot_gaussian_ellipsoid_2D(mu[k], C, num_sigmas, num_pts, **kwargs) + + def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): + """Plots 2 dimensions of each GMM component in 3d. + + Args: + feat_idx: feature indeces. + num_sigmas: size of the plot in number of standard devs. + num_pts: number of points in the graph. + **kwargs: pyplot options. + """ + mu = self.mu[:, feat_idx] + C = np.diag(1 / self.Lambda[feat_idx]) + for k in range(mu.shape[0]): + plot_gaussian_3D(mu[k], C, num_sigmas, num_pts, **kwargs) + + def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs): + """Plots 3 dimensions of each GMM component in 3d. + + Args: + feat_idx: feature indeces. + num_sigmas: size of the plot in number of standard devs. + num_pts: number of points in the graph. + **kwargs: pyplot options. + """ + mu = self.mu[:, feat_idx] + C = np.diag(1 / self.Lambda[feat_idx]) + for k in range(mu.shape[0]): + plot_gaussian_ellipsoid_3D(mu[k], C, num_sigmas, num_pts, **kwargs) + + +DiagGMMTiedCov = GMMTiedDiagCov diff --git a/hyperion/np/pdfs/plda/__init__.py b/hyperion/np/pdfs/plda/__init__.py new file mode 100644 index 00000000..5961b71f --- /dev/null +++ b/hyperion/np/pdfs/plda/__init__.py @@ -0,0 +1,11 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + + +from .factory import PLDAFactory, PLDAType +from .frplda import FRPLDA +from .plda import PLDA +from .plda_base import PLDABase, PLDALLRNvsMMethod +from .splda import SPLDA diff --git a/hyperion/np/pdfs/plda/factory.py b/hyperion/np/pdfs/plda/factory.py new file mode 100644 index 00000000..dd19ab9f --- /dev/null +++ b/hyperion/np/pdfs/plda/factory.py @@ -0,0 +1,204 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from enum import Enum + +import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ....utils.misc import filter_func_args +from .frplda import FRPLDA +from .plda import PLDA +from .plda_base import PLDALLRNvsMMethod +from .splda import SPLDA + + +class PLDAType(str, Enum): + frplda = "frplda" + splda = "splda" + plda = "plda" + + @staticmethod + def choices(): + return [PLDAType.frplda, PLDAType.splda, PLDAType.plda] + + +class PLDAFactory(object): + """Class to create PLDA objects.""" + + @staticmethod + def create( + plda_type, + y_dim=None, + z_dim=None, + fullcov_W=True, + update_mu=True, + update_V=True, + update_U=True, + update_B=True, + update_W=True, + update_D=True, + floor_iD=1e-5, + name="plda", + **kwargs + ): + if plda_type == PLDAType.frplda: + return FRPLDA( + fullcov_W=fullcov_W, + update_mu=update_mu, + update_B=update_B, + update_W=update_W, + name=name, + **kwargs + ) + if plda_type == PLDAType.splda: + return SPLDA( + y_dim=y_dim, + 
fullcov_W=fullcov_W, + update_mu=update_mu, + update_V=update_V, + update_W=update_W, + name=name, + **kwargs + ) + + if plda_type == PLDAType.plda: + return PLDA( + y_dim=y_dim, + z_dim=z_dim, + floor_iD=floor_iD, + update_mu=update_mu, + update_V=update_V, + update_U=update_U, + update_D=update_D, + name=name, + **kwargs + ) + + @staticmethod + def load_plda(plda_type, model_file): + if plda_type == "frplda": + return FRPLDA.load(model_file) + elif plda_type == "splda": + return SPLDA.load(model_file) + elif plda_type == "plda": + return PLDA.load(model_file) + + @staticmethod + def filter_args(**kwargs): + return filter_func_args(PLDAFactory.create, kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--plda-type", + default=PLDAType.splda, + choices=PLDAType.choices(), + help="PLDA type", + ) + + parser.add_argument( + "--y-dim", type=int, default=150, help="num. of eigenvoices" + ) + parser.add_argument( + "--z-dim", type=int, default=400, help="num. of eigenchannels" + ) + + parser.add_argument( + "--fullcov-W", + default=True, + action=ActionYesNo, + help="use full covariance W", + ) + parser.add_argument( + "--update-mu", + default=True, + action=ActionYesNo, + help="not update mu", + ) + parser.add_argument( + "--update-V", default=True, action=ActionYesNo, help="update V" + ) + parser.add_argument( + "--update-U", default=True, action=ActionYesNo, help="update U" + ) + + parser.add_argument( + "--update-B", default=True, action=ActionYesNo, help="update B" + ) + parser.add_argument( + "--update-W", default=True, action=ActionYesNo, help="update W" + ) + parser.add_argument( + "--update-D", default=True, action=ActionYesNo, help="update D" + ) + parser.add_argument( + "--floor-iD", + type=float, + default=1e-5, + help="floor for inverse of D matrix", + ) + + parser.add_argument("--epochs", type=int, default=40, help="num. of epochs") + parser.add_argument( + "--ml-md", + default="ml+md", + choices=["ml+md", "ml", "md"], + help=("optimization type"), + ) + + parser.add_argument( + "--md-epochs", + default=None, + type=int, + nargs="+", + help=("epochs in which we do MD, if None we do it in all the epochs"), + ) + + parser.add_argument("--name", default="plda", help="model name") + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) + + @staticmethod + def filter_eval_args(**kwargs): + valid_args = "eval_method" + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_llr_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--llr-method", default="vavg", choices=PLDALLRNvsMMethod.choices() + ) + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) + + # @staticmethod + # def add_eval_args(parser, prefix=None): + # if prefix is None: + # p1 = "--" + # else: + # p1 = "--" + prefix + "." 
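A hedged sketch of how the factory's CLI integration might be used; exact namespace handling varies across jsonargparse versions (e.g., you may need `args.as_dict()` instead of `vars(args)`), and the argument list is illustrative:

```python
from jsonargparse import ArgumentParser

from hyperion.np.pdfs.plda import PLDAFactory

parser = ArgumentParser()
PLDAFactory.add_class_args(parser)
args = parser.parse_args(["--plda-type", "splda", "--y-dim", "100"])

# keep only the kwargs that PLDAFactory.create understands
kwargs = PLDAFactory.filter_args(**vars(args))
plda = PLDAFactory.create(**kwargs)
```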
+ + # parser.add_argument( + # p1 + "plda-type", + # default="splda", + # choices=["frplda", "splda", "plda"], + # help=("PLDA type"), + # ) + # parser.add_argument(p1 + "model-file", required=True, help=("model file")) diff --git a/hyperion/np/pdfs/plda/frplda.py b/hyperion/np/pdfs/plda/frplda.py new file mode 100644 index 00000000..84cf0ace --- /dev/null +++ b/hyperion/np/pdfs/plda/frplda.py @@ -0,0 +1,526 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import numpy as np +from scipy import linalg as sla + +from ....hyp_defs import float_cpu +from ....utils.math_funcs import invert_pdmat, invert_trimat, logdet_pdmat +from .plda_base import PLDABase + + +class FRPLDA(PLDABase): + """Class for Full-rank PLDA (a.k.a. Two-Covariance Model) where + .. math:: + \mathbf{x}_{ij} = \mathbf{y}_i + \varepsilon_{ij} + + + Attributes: + mu: class-independent mean. + B: between-class precision. + W: within-class precision. + update_mu: whether to update mu or not when training the model. + update_B: whether to update B or not when training the model. + update_W: whether to update W or not when training the model. + x_dim: data dimension. + """ + + def __init__( + self, + mu=None, + B=None, + W=None, + fullcov_W=True, + update_mu=True, + update_B=True, + update_W=True, + epochs=20, + ml_md="ml+md", + md_epochs=None, + **kwargs + ): + super().__init__(mu=mu, update_mu=update_mu, epochs=epochs, **kwargs) + if mu is not None: + self.y_dim = mu.shape[0] + self.B = B + self.W = W + self.fullcov_W = fullcov_W + self.update_B = update_B + self.update_W = update_W + + def validate(self): + """Validates the model parameters.""" + assert self.mu.shape[0] == self.B.shape[0] + assert self.mu.shape[0] == self.B.shape[1] + assert self.mu.shape[0] == self.W.shape[0] + assert self.mu.shape[0] == self.W.shape[1] + + @property + def is_init(self): + """Returns True if the model has been initialized.""" + if self._is_init: + return True + if self.mu is not None and self.B is not None and self.W is not None: + self.validate() + self._is_init = True + return self._is_init + + def initialize(self, D): + """initializes the model. + + Args: + D: tuple of sufficient statistics (N, F, S) + """ + N, F, S = D + self.x_dim = F.shape[1] + self.y_dim = F.shape[1] + M = F.shape[0] + N_tot = np.sum(N) + + y = F / N[:, None] + Fy = np.dot(F.T, y) + C = S - Fy - Fy.T + for i in range(M): + yy = np.outer(y[i, :], y[i, :]) + C += N[i] * yy + + C = (C + C.T) / 2 + mu = np.mean(y, axis=0) + iB = np.dot(y.T, y) / M - np.outer(mu, mu) + iW = C / N_tot + + B = invert_pdmat(iB, return_inv=True)[-1] + W = invert_pdmat(iW, return_inv=True)[-1] + + self.mu = mu + self.B = B + self.W = W + self._is_init = True + + def compute_py_g_x( + self, D, return_cov=False, return_logpy_0=False, return_acc=False + ): + """Computes the posterior P(y|x) + + Args: + D: tuple of sufficient statistics (N, F, S) + return_cov: whether or not to return the posterior covariances. + return_logpy_0: whether or not to return log P(y=0|x). + return_acc: whether or not to return Ry and Py accumulators. 
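The posterior computed by `compute_py_g_x` has a closed form; a standalone numpy sketch of the single-speaker case under the two-covariance model (row-vector convention as in the code above; toy values):

```python
import numpy as np

# With N segments pooled into F = sum_j x_j:
#   L = B + N * W,   y_hat = (F @ W + mu @ B) @ inv(L)
rng = np.random.default_rng(0)
d = 4
B = np.eye(d)                  # between-class precision
W = 2.0 * np.eye(d)            # within-class precision
mu = np.zeros(d)
X = rng.normal(size=(3, d))    # 3 segments of one speaker
N, F = X.shape[0], X.sum(axis=0)
L = B + N * W
y_hat = np.linalg.solve(L, F @ W + mu @ B)   # L is symmetric, so solve works
```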
+ + Returns: + Speaker factor posterior means with shape (num_speakers, y_dim) + Speaker factor posterior convariances with shape (num_speakers, y_dim, y_dim) + log P(y=0|x) with shape (num_spakers,) + Ry accumlator for ML step with shape (y_dim, y_dim) + Py accumlator for MD step with shape (y_dim, y_dim) + """ + + assert self.is_init + + if isinstance(D, tuple): + N, F, S = D + else: + F = D + N = np.ones((F.shape[0],), dtype=F.dtype) + S = None + + M = F.shape[0] + y_dim = self.y_dim + assert y_dim == F.shape[1] + + compute_inv = return_cov or return_acc + return_tuple = compute_inv or return_logpy_0 + + N_is_int = False + if np.all(np.ceil(N) == N): + N_is_int = True + + gamma = np.dot(F, self.W) + np.dot(self.mu, self.B) + if N_is_int: + iterator = np.unique(N) + else: + iterator = range(M) + + y = np.zeros_like(F) + if return_cov: + Sigma_y = np.zeros((M, y_dim, y_dim), dtype=float_cpu()) + else: + Sigma_y = None + + if return_logpy_0: + logpy = -0.5 * y_dim * np.log(2 * np.pi) * np.ones((M,), dtype=float_cpu()) + + if return_acc: + Py = np.zeros((y_dim, y_dim), dtype=float_cpu()) + Ry = np.zeros((y_dim, y_dim), dtype=float_cpu()) + + for k in iterator: + if N_is_int: + i = (N == k).nonzero()[0] + N_i = k + M_i = len(i) + else: + i = k + N_i = N[k] + M_i = 1 + + L_i = self.B + N_i * self.W + + r = invert_pdmat( + L_i, + right_inv=True, + return_logdet=return_logpy_0, + return_inv=compute_inv, + ) + mult_iL = r[0] + if return_logpy_0: + logL = r[2] + if compute_inv: + iL = r[-1] + + y[i, :] = mult_iL(gamma[i, :]) + + if return_cov: + Sigma_y[i, :, :] = iL + + if return_logpy_0: + logpy[i] += 0.5 * (logL - np.sum(y[i, :] * gamma[i, :], axis=-1)) + + if return_acc: + Py += M_i * iL + + if not return_tuple: + return y + + r = [y] + if return_cov: + r += [Sigma_y] + if return_logpy_0: + r += [logpy] + if return_acc: + r += [Ry, Py] + return r + + def Estep(self, D): + """Expectation step. + + Args: + D: tuple with sufficient statistics (N, F, S) + + Returns: + Tuple of statistics with accumlated expectations. + """ + N, F, S = D + y, logpy, Ry, Py = self.compute_py_g_x(D, return_logpy_0=True, return_acc=True) + + M = F.shape[0] + N_tot = np.sum(N) + + y_acc = np.sum(y, axis=0) + Cy = np.dot(F.T, y) + + Niy = y * N[:, None] + Ry += np.dot(Niy.T, y) + Py += np.dot(y.T, y) + + logpy_acc = np.sum(logpy) + + stats = (N_tot, M, S, logpy_acc, y_acc, Ry, Cy, Py) + return stats + + def elbo(self, stats): + """Computes the objective function. + + Args: + stats: tuple of expectations computed at the Estep. + + Returns: + log P(X) + """ + N, M, S, logpy_x = stats[:4] + + logW = logdet_pdmat(self.W) + logB = logdet_pdmat(self.B) + + logpx_y = 0.5 * ( + -N * self.x_dim * np.log(2 * np.pi) + + N * logW + - np.inner(self.W.ravel(), S.ravel()) + ) + logpy = ( + 0.5 + * M + * ( + -self.y_dim * np.log(2 * np.pi) + + logB + - np.inner(np.dot(self.mu, self.B), self.mu) + ) + ) + + elbo = logpx_y + logpy - logpy_x + return elbo + + def MstepML(self, stats): + """Maximum likelihood estimation step. + + Args: + stats: tuple of expectations computed at the Estep. 
+ + """ + N, M, S, _, y_acc, Ry, Cy, Py = stats + ybar = y_acc / M + if self.update_mu: + self.mu = ybar + if self.update_B: + if self.update_mu: + iB = Py / M - np.outer(self.mu, self.mu) + else: + muybar = np.outer(self.mu, ybar) + iB = Py / M - muybar - muybar + np.outer(self.mu, self.mu) + self.B = invert_pdmat(iB, return_inv=True)[-1] + if self.update_W: + iW = (S - Cy - Cy.T + Ry) / N + if self.fullcov_W: + self.W = invert_pdmat(iW, return_inv=True)[-1] + else: + self.W = np.diag(1 / np.diag(iW)) + + def MstepMD(self, stats): + pass + + def get_config(self): + """Returns the model configuration dict.""" + config = { + "update_W": self.update_W, + "update_B": self.update_B, + "fullcov_W": self.fullcov_W, + } + base_config = super(FRPLDA, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ + params = {"mu": self.mu, "B": self.B, "W": self.W} + self._save_params_from_dict(f, params) + + @classmethod + def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ + param_list = ["mu", "B", "W"] + params = cls._load_params_to_dict(f, config["name"], param_list) + kwargs = dict(list(config.items()) + list(params.items())) + return cls(**kwargs) + + def llr_1vs1(self, x1, x2): + """log-likelihood ratio between target and non-target hypothesis for + the case of one enrollment and one test segments. + + Args: + x1: enrollment vectors with shape (num_enroll_segmens, x_dim). + x2: test vectors with shape (num_enroll_segmens, x_dim). + + Returns: + Score matrix with shape (num_enrollment_segments, num_test_segments). + """ + assert self.is_init + + Lnon = self.B + self.W + mult_icholLnon, logcholLnon = invert_trimat( + sla.cholesky(Lnon, lower=False, overwrite_a=True), + right_inv=True, + return_logdet=True, + )[:2] + logLnon = 2 * logcholLnon + + Ltar = self.B + 2 * self.W + mult_icholLtar, logcholLtar = invert_trimat( + sla.cholesky(Ltar, lower=False, overwrite_a=True), + right_inv=True, + return_logdet=True, + )[:2] + logLtar = 2 * logcholLtar + + WF1 = np.dot(x1, self.W) + WF2 = np.dot(x2, self.W) + Bmu = np.dot(self.mu, self.B) + + gamma_non_1 = mult_icholLnon(WF1 + Bmu) + gamma_non_2 = mult_icholLnon(WF2 + Bmu) + + Qnon_1 = np.sum(gamma_non_1 * gamma_non_1, axis=1)[:, None] + Qnon_2 = np.sum(gamma_non_2 * gamma_non_2, axis=1) + + gamma_tar_1 = mult_icholLtar(WF1 + 0.5 * Bmu) + gamma_tar_2 = mult_icholLtar(WF2 + 0.5 * Bmu) + + Qtar_1 = np.sum(gamma_tar_1 * gamma_tar_1, axis=1)[:, None] + Qtar_2 = np.sum(gamma_tar_2 * gamma_tar_2, axis=1) + + scores = 2 * np.dot(gamma_tar_1, gamma_tar_2.T) + scores += Qtar_1 - Qnon_1 + Qtar_2 - Qnon_2 + scores += ( + 2 * logLnon + - logLtar + - logdet_pdmat(self.B) + + np.inner(np.dot(self.mu, self.B), self.mu) + ) + scores *= 0.5 + return scores + + def llr_NvsM_book(self, D1, D2): + """log-likelihood ratio between target and non-target hypothesis for + the case of N segments/enrollment-side and M segments/test-side + evaluated with the exact formula (by the book). + + Args: + D1: tuple of sufficient statistics for the enrollment sides (N1, F1, S1). + D2: tuple of sufficient statistics for the test sides (N2, F2, S2). + + Returns: + Score matrix with shape (num_enrollment_sides, num_test_sides). 
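As an independent reference for these closed-form scores, the two-covariance LLR can be brute-forced in one dimension with scipy (a check sketch under assumed toy parameters, not hyperion code):

```python
import numpy as np
from scipy.stats import multivariate_normal as mvn

# Two-covariance model in 1-D with Sb = inv(B), Sw = inv(W).
# target:     x1, x2 share the same y -> cov = [[Sb+Sw, Sb], [Sb, Sb+Sw]]
# non-target: independent y's         -> cov = diag(Sb+Sw, Sb+Sw)
mu, Sb, Sw = 0.5, 2.0, 1.0
x = np.array([0.3, 0.7])                       # (enroll, test) pair
C_tar = np.array([[Sb + Sw, Sb], [Sb, Sb + Sw]])
C_non = np.array([[Sb + Sw, 0.0], [0.0, Sb + Sw]])
llr = mvn.logpdf(x, mean=[mu, mu], cov=C_tar) - mvn.logpdf(x, mean=[mu, mu], cov=C_non)
```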
+ """ + assert self.is_init + + N1, F1, _ = D1 + N2, F2, _ = D2 + + Bmu = np.dot(self.mu, self.B) + + scores = np.zeros((len(N1), len(N2)), dtype=float_cpu()) + for N1_i in np.unique(N1): + for N2_j in np.unique(N2): + i = np.where(N1 == N1_i)[0] + j = np.where(N2 == N2_j)[0] + + L1 = self.B + N1_i * self.W + mult_icholL1, logcholL1 = invert_trimat( + sla.cholesky(L1, lower=False, overwrite_a=True), + right_inv=True, + return_logdet=True, + )[:2] + logL1 = 2 * logcholL1 + + L2 = self.B + N2_j * self.W + mult_icholL2, logcholL2 = invert_trimat( + sla.cholesky(L2, lower=False, overwrite_a=True), + right_inv=True, + return_logdet=True, + )[:2] + logL2 = 2 * logcholL2 + + Ltar = self.B + (N1_i + N2_j) * self.W + mult_icholLtar, logcholLtar = invert_trimat( + sla.cholesky(Ltar, lower=False, overwrite_a=True), + right_inv=True, + return_logdet=True, + )[:2] + logLtar = 2 * logcholLtar + + WF1 = np.dot(F1[i, :], self.W) + WF2 = np.dot(F2[j, :], self.W) + + gamma_non_1 = mult_icholL1(WF1 + Bmu) + gamma_non_2 = mult_icholL2(WF2 + Bmu) + + Qnon_1 = np.sum(gamma_non_1 * gamma_non_1, axis=1)[:, None] + Qnon_2 = np.sum(gamma_non_2 * gamma_non_2, axis=1) + + gamma_tar_1 = mult_icholLtar(WF1 + 0.5 * Bmu) + gamma_tar_2 = mult_icholLtar(WF2 + 0.5 * Bmu) + + Qtar_1 = np.sum(gamma_tar_1 * gamma_tar_1, axis=1)[:, None] + Qtar_2 = np.sum(gamma_tar_2 * gamma_tar_2, axis=1) + + scores_ij = 2 * np.dot(gamma_tar_1, gamma_tar_2.T) + scores_ij += Qtar_1 - Qnon_1 + Qtar_2 - Qnon_2 + scores_ij += logL1 + logL2 - logLtar + scores[np.ix_(i, j)] = scores_ij + + scores += -logdet_pdmat(self.B) + np.inner(np.dot(self.mu, self.B), self.mu) + scores *= 0.5 + return scores + + def sample( + self, num_classes, num_samples_per_class, rng=None, seed=1024, return_y=False + ): + """Draws samples from the PLDA model. + + Args: + num_classes: number of classes to sample. + num_samples_per_class: number of samples to sample per each class. + rng: random number generator. + seed: random seed used if rng is None. + + Returns: + Generated samples with shape (num_samples, x_dim). + """ + assert self.is_init + + if rng is None: + rng = np.random.default_rng(seed=seed) + + Sb = invert_pdmat(self.B, return_inv=True)[-1] + chol_Sb = sla.cholesky(Sb, lower=False) + Sw = invert_pdmat(self.W, return_inv=True)[-1] + chol_Sw = sla.cholesky(Sw, lower=False) + + x_dim = self.mu.shape[0] + z = rng.normal(size=(num_classes * num_samples_per_class, x_dim)).astype( + dtype=float_cpu(), copy=False + ) + z = np.dot(z, chol_Sw) + y = rng.normal(size=(num_classes, x_dim)).astype(dtype=float_cpu(), copy=False) + y = np.dot(y, chol_Sb) + self.mu + y = np.repeat(y, num_samples_per_class, axis=0) + + if return_y: + return y + z, y + + return y + z + + def weighted_avg_params(self, mu, B, W, w_mu, w_B, w_W): + """Performs weighted average of the model parameters + and some given parameters. + + Args: + mu: other mean vector + w_mu: weight of the given mean vector. + + """ + super().weigthed_avg_params(mu, w_mu) + if w_B > 0: + Sb0 = invert_pdmat(self.B, return_inv=True)[-1] + Sb = invert_pdmat(B, return_inv=True)[-1] + Sb = w_B * Sb + (1 - w_B) * Sb0 + self.B = invert_pdmat(Sb, return_inv=True)[-1] + if w_W > 0: + Sw0 = invert_pdmat(self.W, return_inv=True)[-1] + Sw = invert_pdmat(W, return_inv=True)[-1] + Sw = w_W * Sw + (1 - w_W) * Sw0 + self.W = invert_pdmat(Sw, return_inv=True)[-1] + + def weighted_avg_model(self, plda, w_mu, w_B, w_W): + """Performs weighted average of the model parameters + and those of another model given as input. 
+ + Args: + plda: other PLDA model. + + """ + self.weighted_avg_params(plda.mu, plda.B, plda.W, w_mu, w_B, w_W) diff --git a/hyperion/pdfs/plda/plda.py b/hyperion/np/pdfs/plda/plda.py similarity index 75% rename from hyperion/pdfs/plda/plda.py rename to hyperion/np/pdfs/plda/plda.py index 16dee5ea..92f77090 100644 --- a/hyperion/pdfs/plda/plda.py +++ b/hyperion/np/pdfs/plda/plda.py @@ -6,12 +6,31 @@ import numpy as np from scipy import linalg as sla -from ...hyp_defs import float_cpu -from ...utils.math import invert_pdmat, invert_trimat, logdet_pdmat +from ....hyp_defs import float_cpu +from ....utils.math_funcs import invert_pdmat, invert_trimat, logdet_pdmat from .plda_base import PLDABase class PLDA(PLDABase): + """Class for Probabilistic Discriminant Analysis (PLDA) model. + .. math:: + \mathbf{x}_{ij} = \mu + \mathbf{V} \mathbf{y}_i + \mathbf{U} \mathbf{z}_{ij} + \varepsilon_{ij} + + Attributes: + y_dim: speaker factor dimension. + z_dim: channel factor dimension. + mu: class-independent mean. + V: speaker factor loading matrix. + U: channel factor loading matrix. + D: Precision of the additional channel noise. + fullcov_iD: floor for the inverse of D. + update_mu: whether to update mu or not when training the model. + update_V: whether to update V or not when training the model. + update_U: whether to update U or not when training the model. + update_D: whether to update D or not when training the model. + x_dim: data dimension. + """ + def __init__( self, y_dim=None, @@ -25,9 +44,20 @@ def __init__( update_V=True, update_U=True, update_D=True, + epochs=20, + ml_md="ml+md", + md_epochs=None, **kwargs ): - super(PLDA, self).__init__(y_dim=y_dim, mu=mu, update_mu=update_mu, **kwargs) + super().__init__( + y_dim=y_dim, + mu=mu, + update_mu=update_mu, + epochs=epochs, + ml_md=ml_md, + md_epochs=md_epochs, + **kwargs + ) self.z_dim = z_dim if V is not None: self.y_dim = V.shape[0] @@ -52,6 +82,7 @@ def __init__( self._VWV = None def validate(self): + """Validates the model parameters.""" assert self.mu.shape[0] >= self.V.shape[0] assert self.mu.shape[0] == self.V.shape[1] assert self.mu.shape[0] >= self.U.shape[0] @@ -60,6 +91,7 @@ def validate(self): @property def is_init(self): + """Returns True if the model has been initialized.""" if self._is_init: return True if ( @@ -75,6 +107,7 @@ def is_init(self): return self._is_init def compute_aux(self): + """Computes auxiliary variables.""" DV = self.V * self.D DU = self.U * self.D self._DU = DU @@ -89,6 +122,11 @@ def compute_aux(self): self._VWV = np.dot(self.V, self._VW) def initialize(self, D): + """initializes the model. + + Args: + D: tuple of sufficient statistics (N, F, S) + """ N, F, S = D self.x_dim = F.shape[1] M = F.shape[0] @@ -118,10 +156,29 @@ def initialize(self, D): def compute_py_g_x( self, D, return_cov=False, return_logpy_0=False, return_acc=False ): - + """Computes the posterior P(y|x) + + Args: + D: tuple of sufficient statistics (N, F, S) + return_cov: whether or not to return the posterior covariances. + return_logpy_0: whether or not to return log P(y=0|x). + return_acc: whether or not to return Ry and Py accumulators. 
+ + Returns: + Speaker factor posterior means with shape (num_speakers, y_dim) + Speaker factor posterior convariances with shape (num_speakers, y_dim, y_dim) + log P(y=0|x) with shape (num_spakers,) + Ry accumlator for ML step with shape (y_dim, y_dim) + Py accumlator for MD step with shape (y_dim, y_dim) + """ assert self.is_init + if isinstance(D, tuple): + N, F, S = D + else: + F = D + N = np.ones((F.shape[0],), dtype=F.dtype) + S = None - N, F, S = D Fc = F - self.mu M = F.shape[0] @@ -203,6 +260,14 @@ def compute_py_g_x( return tuple(r) def Estep(self, D): + """Expectation step. + + Args: + D: tuple with sufficient statistics (N, F, S) + + Returns: + Tuple of statistics with accumlated expectations. + """ N, F, S = D y, logpy, Ry, Py = self.compute_py_g_x(D, return_logpy_0=True, return_acc=True) @@ -264,6 +329,14 @@ def Estep(self, D): return stats def elbo(self, stats): + """Computes the objective function. + + Args: + stats: tuple of expectations computed at the Estep. + + Returns: + log P(X) + """ N, M, F, S, logpy_x = stats[:5] logD = np.sum(np.log(self.D)) @@ -281,6 +354,12 @@ def elbo(self, stats): return elbo def MstepML(self, stats): + """Maximum likelihood estimation step. + + Args: + stats: tuple of expectations computed at the Estep. + + """ N, M, F, S, _, y_acc, Ry1, Ry, Cy, Py, Rz1, Rz, Ryz, Cz = stats if self.update_mu and not self.update_V and not self.update_U: @@ -357,6 +436,12 @@ def MstepML(self, stats): self.compute_aux() def MstepMD(self, stats): + """Minimum divergence estimation step. + + Args: + stats: tuple of expectations computed at the Estep. + + """ N, M, F, S, _, y_acc, Ry1, Ry, Cy, Py, Rz1, Rz, Ryz, Cz = stats mu_y = y_acc / M Cov_y = Py / M - np.outer(mu_y, mu_y) @@ -384,6 +469,7 @@ def MstepMD(self, stats): self.compute_aux() def get_config(self): + """Returns the model configuration dict.""" config = { "update_D": self.update_D, "update_U": self.update_U, @@ -394,17 +480,41 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ params = {"mu": self.mu, "V": self.V, "U": self.U, "D": self.D} self._save_params_from_dict(f, params) @classmethod def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ param_list = ["mu", "V", "U", "D"] params = cls._load_params_to_dict(f, config["name"], param_list) kwargs = dict(list(config.items()) + list(params.items())) return cls(**kwargs) def log_probx_g_y(self, x, y): + """Computes logP(X|Y) + + Args: + x: data samples with shape (num_samples, x_dim). + y: speaker factors for each sample with shape (num_samples, y_dim). + + Returns: + log P(X|Y) array with shape (num_samples,) + """ iW = np.diag(1 / self.D) + np.dot(self.U.T, self.U) mult_W, _, logiW = invert_pdmat(iW, return_logdet=True) delta = x - self.mu - np.dot(y, self.V) @@ -417,18 +527,35 @@ def log_probx_g_y(self, x, y): return logp def log_probx_g_yz(self, x, y, z): + """Computes logP(X|Y,Z) + + Args: + x: data samples with shape (num_samples, x_dim). + y: speaker factors for each sample with shape (num_samples, y_dim). + z: channel factors for each sample with shape (num_samples, z_dim). 
+ + Returns: + log P(X|Y,Z) array with shape (num_samples,) + """ logD = np.sum(np.log(self.D)) delta = x - self.mu - np.dot(y, self.V) - np.dot(z, self.U) logp = ( - -x.shape[-1] * np.log(2 * np.pi) - + logD - - np.sum(self.D * delta ** 2, axis=-1) + -x.shape[-1] * np.log(2 * np.pi) + logD - np.sum(self.D * delta**2, axis=-1) ) logp /= 2 return logp def llr_1vs1(self, x1, x2): + """log-likelihood ratio between target and non-target hypothesis for + the case of one enrollment and one test segments. + + Args: + x1: enrollment vectors with shape (num_enroll_segmens, x_dim). + x2: test vectors with shape (num_enroll_segmens, x_dim). + Returns: + Score matrix with shape (num_enrollment_segments, num_test_segments). + """ assert self.is_init WV = self._VW VV = self._VWV @@ -472,7 +599,17 @@ def llr_1vs1(self, x1, x2): return scores def llr_NvsM_book(self, D1, D2): + """log-likelihood ratio between target and non-target hypothesis for + the case of N segments/enrollment-side and M segments/test-side + evaluated with the exact formula (by the book). + Args: + D1: tuple of sufficient statistics for the enrollment sides (N1, F1, S1). + D2: tuple of sufficient statistics for the test sides (N2, F2, S2). + + Returns: + Score matrix with shape (num_enrollment_sides, num_test_sides). + """ assert self.is_init N1, F1, _ = D1 @@ -539,8 +676,19 @@ def llr_NvsM_book(self, D1, D2): return scores def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): + """Draws samples from the PLDA model. + + Args: + num_classes: number of classes to sample. + num_samples_per_class: number of samples to sample per each class. + rng: random number generator. + seed: random seed used if rng is None. + + Returns: + Generated samples with shape (num_samples, x_dim). + """ if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) x_dim = self.mu.shape[0] @@ -562,8 +710,15 @@ def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): return y + z1 + z2 def weighted_avg_params(self, mu, V, U, D, w_mu, w_B, w_W): + """Performs weighted average of the model parameters + and some given parameters. + + Args: + mu: other mean vector + w_mu: weight of the given mean vector. - super(PLDA, self).weigthed_avg_params(mu, w_mu) + """ + super().weigthed_avg_params(mu, w_mu) if w_B > 0: Sb0 = np.dot(self.V.T, self.V) Sb = np.dot(V.T, V) @@ -582,26 +737,15 @@ def weighted_avg_params(self, mu, V, U, D, w_mu, w_B, w_W): U = U[:, -self.z_dim :] self.U = U.T iD = np.diag(Sw - np.dot(self.U.T, self.U)).copy() - # print(Sw[:10,:10]) - # print(np.dot(self.U.T, self.U)) - # print(iD[:10]) iD[iD < self.floor_iD] = self.floor_iD self.D = 1 / iD - # if w_W > 0: - # Sw0 = np.dot(self.U.T, self.U) - # Sw = np.dot(U.T, U) - # Sw = w_W*Sw + (1-w_W)*Sw0 - # w, U = sla.eigh(Sw, overwrite_a=True) - # U = np.sqrt(w)*U - # U = U[:,-self.z_dim:] - # self.U = U.T - - # if w_D > 0: - # Sd0 = 1/self.D - # Sd = 1/D - # Sd = w_D*Sd + (1-w_D)*Sd0 - # self.D = 1/Sd - def weighted_avg_model(self, plda, w_mu, w_B, w_W): + """Performs weighted average of the model parameters + and those of another model given as input. + + Args: + plda: other PLDA model. 
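The diagonal-noise likelihood in `log_probx_g_yz` can be sanity-checked against scipy (standalone sketch, taking y = z = 0 so the mean reduces to mu):

```python
import numpy as np
from scipy.stats import multivariate_normal as mvn

# log N(x; mu, diag(1/D)) with D holding per-dimension precisions.
rng = np.random.default_rng(0)
d = 3
D = np.array([1.0, 2.0, 4.0])
mu = rng.normal(size=d)
x = rng.normal(size=d)
delta = x - mu
logp = 0.5 * (-d * np.log(2 * np.pi) + np.log(D).sum() - (D * delta**2).sum())
assert np.isclose(logp, mvn.logpdf(x, mean=mu, cov=np.diag(1 / D)))
```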
+ + """ self.weighted_avg_params(plda.mu, plda.V, plda.U, plda.D, w_mu, w_B, w_W) diff --git a/hyperion/np/pdfs/plda/plda_base.py b/hyperion/np/pdfs/plda/plda_base.py new file mode 100644 index 00000000..a4a308e0 --- /dev/null +++ b/hyperion/np/pdfs/plda/plda_base.py @@ -0,0 +1,571 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from enum import Enum + +import numpy as np + +from ....hyp_defs import float_cpu +from ...transforms import LNorm +from ..core.pdf import PDF + + +class PLDALLRNvsMMethod(str, Enum): + vavg = "vavg" + lnorm_vavg = "lnorm-vavg" + savg = "savg" + book = "book" + + @staticmethod + def choices(): + return [ + PLDALLRNvsMMethod.vavg, + PLDALLRNvsMMethod.lnorm_vavg, + PLDALLRNvsMMethod.savg, + PLDALLRNvsMMethod.book, + ] + + +class PLDABase(PDF): + """Abstract Base class for different versions of + Probabilistic Linear Discriminant Analysis (PLDA) models. + + Attributes: + y_dim: speaker factor dimension. + mu: class-independent mean. + update_mu: whether to update mu or not when training the model. + x_dim: data dimension. + """ + + def __init__( + self, + y_dim=None, + mu=None, + update_mu=True, + epochs=20, + ml_md="ml+md", + md_epochs=None, + **kwargs, + ): + super().__init__(**kwargs) + self.mu = mu + self.y_dim = y_dim + self.update_mu = update_mu + if mu is not None: + self.x_dim = mu.shape[0] + + self.epochs = epochs + self.ml_md = ml_md + self.md_epochs = md_epochs + + def initialize(self, D): + """initializes the model. + + Args: + D: tuple of sufficient statistics (N, F, S) + """ + pass + + def compute_py_g_x(self, D): + """Computes the posterior P(y|x) + + Args: + D: tuple of sufficient statistics (N, F, S) + """ + pass + + def fit( + self, + x, + class_ids=None, + ptheta=None, + sample_weight=None, + x_val=None, + class_ids_val=None, + ptheta_val=None, + sample_weight_val=None, + epochs=None, + ml_md=None, + md_epochs=None, + ): + """Trains the model. + + Args: + x: train data matrix with shape (num_samples, x_dim). + class_ids: class identifiers [0, num_clases-1] for training data. + ptheta: probability of belonging to a class with shape (num_samples, num_classes) for training data. + sample_weight: weight of each sample in the training loss shape (num_samples,). + x_val: validation data matrix with shape (num_val_samples, x_dim). + class_ids_val: class identifiers [0, num_clases-1] for val data. + ptheta_val: probability of belonging to a class with shape (num_samples, num_classes) for val. data. + sample_weight_val: weight of each sample in the val. loss. + epochs: number of EM steps. + ml_md: whether to do maximum likelihood estimation ("ml"), minimum divergence ("md") or both ("ml+md"). + md_epochs: in which epochs to do MD estimation, if None, MD is done in all epochs. + + Returns: + log p(X) of the training data. + log p(x) per sample. + log p(X) of the val. data, if present. + log p(x) of the val. data per sample, if present. 
+ """ + if epochs is None: + epochs = self.epochs + if ml_md is None: + ml_md = self.ml_md + if md_epochs is None: + md_epochs = self.md_epochs + + use_ml = False if ml_md == "md" else True + use_md = False if ml_md == "ml" else True + + assert not (class_ids is None and ptheta is None) + if class_ids is None: + D = self.compute_stats_soft(x, ptheta, sample_weight=sample_weight) + else: + D = self.compute_stats_hard(x, class_ids, sample_weight=sample_weight) + + if x_val is not None: + assert not (class_ids_val is None and ptheta_val is None) + if class_ids_val is None: + D_val = self.compute_stats_soft( + x_val, ptheta_val, sample_weight=sample_weight_val + ) + else: + D_val = self.compute_stats_hard( + x_val, class_ids_val, sample_weight=sample_weight_val + ) + + if not self.is_init: + self.initialize(D) + + elbo = np.zeros((epochs,), dtype=float_cpu()) + elbo_val = np.zeros((epochs,), dtype=float_cpu()) + for epoch in range(epochs): + stats = self.Estep(D) + elbo[epoch] = self.elbo(stats) + if x_val is not None: + stats_val = self.Estep(D_val) + elbo_val[epoch] = self.elbo(stats_val) + + if use_ml: + self.MstepML(stats) + if use_md and (md_epochs is None or epoch in md_epochs): + self.MstepMD(stats) + + elbo_norm = elbo / np.sum(D[0]) + if x_val is None: + return elbo, elbo_norm + else: + elbo_val_norm = elbo_val / np.sum(D_val[0]) + return elbo, elbo_norm, elbo_val, elbo_val_norm + + def Estep(self, x): + """Expectation step.""" + pass + + def MstepML(self, x): + """Maximum likelihood step.""" + pass + + def MstepMD(self, x): + """Minimum Divergence step.""" + pass + + def fit_adapt_weighted_avg_model( + self, + x, + class_ids=None, + ptheta=None, + sample_weight=None, + x_val=None, + class_ids_val=None, + ptheta_val=None, + sample_weight_val=None, + epochs=20, + ml_md="ml+md", + md_epochs=None, + plda0=None, + w_mu=1, + w_B=0.5, + w_W=0.5, + ): + """Adapts a PLDA model to new data. The adapted model is weighted averaged with the prior after each epoch. + + Args: + x: train data matrix with shape (num_samples, x_dim). + class_ids: class identifiers [0, num_clases-1] for training data. + ptheta: probability of belonging to a class with shape (num_samples, num_classes) for training data. + sample_weight: weight of each sample in the training loss shape (num_samples,). + x_val: validation data matrix with shape (num_val_samples, x_dim). + class_ids_val: class identifiers [0, num_clases-1] for val data. + ptheta_val: probability of belonging to a class with shape (num_samples, num_classes) for val. data. + sample_weight_val: weight of each sample in the val. loss. + epochs: number of EM steps. + ml_md: whether to do maximum likelihood estimation ("ml"), minimum divergence ("md") or both ("ml+md"). + md_epochs: in which epochs to do MD estimation, if None, MD is done in all epochs. + plda0: prior model. + w_mu: weigth of the prior on the mean. + w_B: weight of the prior on the between-class precision. + w_W: weight of the prior on the within-class precision. + + Returns: + log p(X) of the training data. + log p(x) per sample. + log p(X) of the val. data, if present. + log p(x) of the val. data per sample, if present. 
+ """ + + assert self.is_init + use_ml = False if ml_md == "md" else True + use_md = False if ml_md == "ml" else True + + assert not (class_ids is None and ptheta is None) + if class_ids is None: + D = self.compute_stats_soft(x, ptheta, sample_weight=sample_weight) + else: + D = self.compute_stats_hard(x, class_ids, sample_weight=sample_weight) + + if x_val is not None: + assert not (class_ids_val is None and ptheta_val is None) + if class_ids_val is None: + D_val = self.compute_stats_soft( + x_val, ptheta_val, sample_weight=sample_weight_val + ) + else: + D_val = self.compute_stats_hard( + x_val, class_ids_val, sample_weight=sample_weight_val + ) + + elbo = np.zeros((epochs,), dtype=float_cpu()) + elbo_val = np.zeros((epochs,), dtype=float_cpu()) + for epoch in range(epochs): + stats = self.Estep(D) + elbo[epoch] = self.elbo(stats) + if x_val is not None: + stats_val = self.Estep(D_val) + elbo_val[epoch] = self.elbo(stats_val) + + if use_ml: + self.MstepML(stats) + if use_md and (md_epochs is None or epoch in md_epochs): + self.MstepMD(stats) + + self.weighted_avg_model(plda0, w_mu, w_B, w_W) + + elbo_norm = elbo / np.sum(D[0]) + if x_val is None: + return elbo, elbo_norm + else: + elbo_val_norm = elbo_val / np.sum(D_val[0]) + return elbo, elbo_norm, elbo_val, elbo_val_norm + + @staticmethod + def compute_stats_soft(x, p_theta, sample_weight=None, scal_factor=None): + """Computes sufficient statistics need by PLDA model using soft class assigments. + + Args: + x: input data with shape (num_samples, x_dim) + p_theta: soft class assigments with shape (num_samples, num_classes) + sample_weight: weight of each sample for training with shape (num_samples, ) + scal_factor: scaling factor for sufficient statistics (Themos factor) + + Returns: + N: zero order stats with shape (num_classes,) + F: first order stats with shape (num_classes, x_dim) + S: Accumulated second order stats with sahpe (x_dim, x_dim) + """ + if sample_weight is not None: + p_theta = sample_weight[:, None] * p_theta + if scal_factor is not None: + p_theta *= scal_factor + N = np.sum(p_theta, axis=0) + F = np.dot(p_theta.T, x) + wx = np.sum(p_theta, axis=1, keepdims=True) * x + S = np.dot(x.T, wx) + return N, F, S + + @staticmethod + def compute_stats_hard(x, class_ids, sample_weight=None, scale_factor=None): + """Computes sufficient statistics need by PLDA model using soft class assigments. + + Args: + x: input data with shape (num_samples, x_dim) + class_ids: integer [0, num_classes-1] vector indicating the class of each sample. 
+
+    @staticmethod
+    def compute_stats_hard(x, class_ids, sample_weight=None, scale_factor=None):
+        """Computes the sufficient statistics needed by the PLDA model using
+        hard class assignments.
+
+        Args:
+          x: input data with shape (num_samples, x_dim)
+          class_ids: integer [0, num_classes-1] vector indicating the class of each sample.
+          sample_weight: weight of each sample for training with shape (num_samples,)
+          scale_factor: scaling factor for sufficient statistics (Themos factor)
+
+        Returns:
+          N: zero order stats with shape (num_classes,)
+          F: first order stats with shape (num_classes, x_dim)
+          S: accumulated second order stats with shape (x_dim, x_dim)
+        """
+        x_dim = x.shape[1]
+        num_classes = np.max(class_ids) + 1
+        N = np.zeros((num_classes,), dtype=float_cpu())
+        F = np.zeros((num_classes, x_dim), dtype=float_cpu())
+        if sample_weight is not None:
+            wx = sample_weight[:, None] * x
+        else:
+            wx = x
+
+        for i in range(num_classes):
+            idx = class_ids == i
+            if sample_weight is None:
+                N[i] = np.sum(idx).astype(float_cpu())
+                F[i] = np.sum(x[idx], axis=0)
+            else:
+                N[i] = np.sum(sample_weight[idx])
+                F[i] = np.sum(wx[idx], axis=0)
+
+        S = np.dot(x.T, wx)
+        if scale_factor is not None:
+            N *= scale_factor
+            F *= scale_factor
+            S *= scale_factor
+
+        return N, F, S
+
+    @staticmethod
+    def compute_stats_hard_v0(x, class_ids, sample_weight=None, scal_factor=None):
+        """Reference implementation of compute_stats_hard that builds one-hot
+        soft assignments and reuses compute_stats_soft."""
+        x_dim = x.shape[1]
+        num_classes = np.max(class_ids) + 1
+        p_theta = np.zeros((x.shape[0], num_classes), dtype=float_cpu())
+        p_theta[np.arange(x.shape[0]), class_ids] = 1
+        return PLDABase.compute_stats_soft(x, p_theta, sample_weight, scal_factor)
+
+    @staticmethod
+    def center_stats(D, mu):
+        """Centers the sufficient statistics by the PLDA mean.
+
+        Args:
+          D: tuple with sufficient stats (N, F, S).
+          mu: mean vector.
+
+        Returns:
+          Centered N, F, S
+        """
+        N, F, S = D
+        Fc = F - np.outer(N, mu)
+        Fmu = np.outer(np.sum(F, axis=0), mu)
+        Sc = S - Fmu - Fmu.T + np.sum(N) * np.outer(mu, mu)
+        return N, Fc, Sc
+
+    def llr_1vs1(self, x1, x2):
+        """log-likelihood ratio between target and non-target hypothesis for
+        the case of one enrollment and one test segment.
+
+        Args:
+          x1: enrollment vectors with shape (num_enroll_segments, x_dim).
+          x2: test vectors with shape (num_test_segments, x_dim).
+
+        Returns:
+          Score matrix with shape (num_enrollment_segments, num_test_segments).
+        """
+        pass
+
+    def llr_NvsM_book(self, D1, D2):
+        """log-likelihood ratio between target and non-target hypothesis for
+        the case of N segments/enrollment-side and M segments/test-side
+        evaluated with the exact formula (by the book).
+
+        Args:
+          D1: tuple of sufficient statistics for the enrollment sides (N1, F1, S1).
+          D2: tuple of sufficient statistics for the test sides (N2, F2, S2).
+
+        Returns:
+          Score matrix with shape (num_enrollment_sides, num_test_sides).
+        """
+        pass
+
+    def llr_NvsM(
+        self, x1, x2, ids1=None, ids2=None, method=PLDALLRNvsMMethod.lnorm_vavg
+    ):
+        """log-likelihood ratio between target and non-target hypothesis for
+        the case of N segments/enrollment-side and M segments/test-side
+
+        Args:
+          x1: enrollment vectors with shape (num_enroll_segments, x_dim).
+          x2: test vectors with shape (num_test_segments, x_dim).
+          ids1: integer array mapping from segments to
+            enrollment-sides in [0, num_enroll_sides-1]
+          ids2: integer array mapping from segments to
+            test-sides in [0, num_test_sides-1]
+          method: evaluation method in ["book" (exact formula),
+            "vavg" (vector averaging), "lnorm-vavg" (vector averaging + lnorm),
+            "savg" (score averaging)]
+
+        Returns:
+          Score matrix with shape (num_enrollment_sides, num_test_sides).
+ """ + if method == PLDALLRNvsMMethod.savg: + return self.llr_NvsM_savg(x1, ids1, x2, ids2) + + D1 = x1 if ids1 is None else self.compute_stats_hard(x1, class_ids=ids1) + D2 = x2 if ids2 is None else self.compute_stats_hard(x2, class_ids=ids2) + + if method == PLDALLRNvsMMethod.book: + return self.llr_NvsM_book(D1, D2) + if method == PLDALLRNvsMMethod.vavg: + return self.llr_NvsM_vavg(D1, D2, do_lnorm=False) + if method == PLDALLRNvsMMethod.lnorm_vavg: + return self.llr_NvsM_vavg(D1, D2, do_lnorm=True) + + raise ValueError(f"wrong llr {method}") + + def llr_NvsM_vavg(self, D1, D2, do_lnorm=True): + """log-likelihood ratio between target and non-target hypothesis for + the case of N segments/enrollment-side and M segments/test-side + evaluated with vector averaging. + + Args: + D1: tuple of sufficient statistics for the enrollment sides (N1, F1, S1). + D2: tuple of sufficient statistics for the test sides (N2, F2, S2). + do_lnorm: whether or not to do length norm. after vector averaging. + + Returns: + Score matrix with shape (num_enrollment_sides, num_test_sides). + """ + x1 = D1[1] / np.expand_dims(D1[0], axis=-1) + x2 = D2[1] / np.expand_dims(D2[0], axis=-1) + if do_lnorm: + lnorm = LNorm() + x1 = lnorm.predict(x1) + x2 = lnorm.predict(x2) + + return self.llr_1vs1(x1, x2) + + def llr_NvsM_savg(self, x1, ids1, x2, ids2): + """log-likelihood ratio between target and non-target hypothesis for + the case of N segments/enrollment-side and M segments/test-side + + Args: + x1: enrollment vectors with shape (num_enroll_segmens, x_dim). + x2: test vectors with shape (num_enroll_segmens, x_dim). + ids1: integer array mapping from segments to + enrollment-sides in [0, num_enroll_sides-1] + ids2: integer array mapping from segments to + test-sides in [0, num_test_sides-1] + + Returns: + Score matrix with shape (num_enrollment_sides, num_test_sides). + """ + scores_1vs1 = self.llr_1vs1(x1, x2) + N, F, _ = self.compute_stats_hard(scores_1vs1, ids1) + scores_Nvs1 = F / N[:, None] + N, F, _ = self.compute_stats_hard(scores_Nvs1.T, ids2) + scores = F.T / N + return scores + + def llr_Nvs1(self, x1, x2, ids1=None, method=PLDALLRNvsMMethod.lnorm_vavg): + """log-likelihood ratio between target and non-target hypothesis for + the case of N segments/enrollment-side and M segments/test-side + + Args: + x1: enrollment vectors with shape (num_enroll_segmens, x_dim). + x2: test vectors with shape (num_test_segmens, x_dim). + ids1: integer array mapping from segments to + enrollment-sides in [0, num_enroll_sides-1] + method: evaluation method in ["book" (exact formula), + "vavg" (vector averaging), "vavg-lnorm" (vector averagin + lnorm), + "savg" (score averaging)] + + Returns: + Score matrix with shape (num_enrollment_sides, num_test_sides). + """ + if method == PLDALLRNvsMMethod.savg: + return self.llr_Nvs1_savg(x1, ids1, x2) + + D1 = x1 if ids1 is None else self.compute_stats_hard(x1, class_ids=ids1) + + if method == PLDALLRNvsMMethod.book: + D2 = self.compute_stats_hard(x2, np.arange(x2.shape[0])) + return self.llr_NvsM_book(D1, D2) + if method == PLDALLRNvsMMethod.vavg: + return self.llr_Nvs1_vavg(D1, x2, do_lnorm=False) + if method == PLDALLRNvsMMethod.lnorm_vavg: + return self.llr_Nvs1_vavg(D1, x2, do_lnorm=True) + + raise ValueError(f"wrong llr {method}") + + def llr_Nvs1_vavg(self, D1, x2, do_lnorm=True): + """log-likelihood ratio between target and non-target hypothesis for + the case of N segments/enrollment-side and M segments/test-side + evaluated with vector averaging. 
+
+        Args:
+          D1: tuple of sufficient statistics for the enrollment sides (N1, F1, S1).
+          x2: test vectors with shape (num_test_segments, x_dim).
+          do_lnorm: whether or not to do length norm. after vector averaging.
+
+        Returns:
+          Score matrix with shape (num_enrollment_sides, num_test_segments).
+        """
+        x1 = D1[1] / np.expand_dims(D1[0], axis=-1)
+        if do_lnorm:
+            lnorm = LNorm()
+            x1 = lnorm.predict(x1)
+            x2 = lnorm.predict(x2)
+
+        return self.llr_1vs1(x1, x2)
+
+    def llr_Nvs1_savg(self, x1, ids1, x2):
+        """log-likelihood ratio between target and non-target hypothesis for
+        the case of N segments/enrollment-side and one segment/test-side,
+        evaluated with score averaging.
+
+        Args:
+          x1: enrollment vectors with shape (num_enroll_segments, x_dim).
+          x2: test vectors with shape (num_test_segments, x_dim).
+          ids1: integer array mapping from segments to
+            enrollment-sides in [0, num_enroll_sides-1]
+
+        Returns:
+          Score matrix with shape (num_enrollment_sides, num_test_segments).
+        """
+        scores_1vs1 = self.llr_1vs1(x1, x2)
+        N, F, _ = self.compute_stats_hard(scores_1vs1, ids1)
+        scores = F / N[:, None]
+        return scores
+
+    def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024):
+        """Draws samples from the PLDA model.
+
+        Args:
+          num_classes: number of classes to sample.
+          num_samples_per_class: number of samples to sample per each class.
+          rng: random number generator.
+          seed: random seed used if rng is None.
+
+        Returns:
+          Generated samples with shape (num_samples, x_dim).
+        """
+        pass
+
+    def get_config(self):
+        """Returns the model configuration dict."""
+        config = {"y_dim": self.y_dim, "update_mu": self.update_mu}
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def weighted_avg_params(self, mu, w_mu):
+        """Performs weighted average of the model parameters
+        and some given parameters.
+
+        Args:
+          mu: other mean vector.
+          w_mu: weight of the given mean vector.
+
+        """
+        self.mu = w_mu * mu + (1 - w_mu) * self.mu
+
+    def weighted_avg_model(self, plda, w_mu, w_B, w_W):
+        """Performs weighted average of the model parameters
+        and those of another model given as input.
+
+        Args:
+          plda: other PLDA model.
+          w_mu: weight of the other model's mean.
+          w_B: weight of the other model's between-class parameters.
+          w_W: weight of the other model's within-class parameters.
+
+        """
+        pass
diff --git a/hyperion/np/pdfs/plda/splda.py b/hyperion/np/pdfs/plda/splda.py
new file mode 100644
index 00000000..32fc4628
--- /dev/null
+++ b/hyperion/np/pdfs/plda/splda.py
@@ -0,0 +1,596 @@
+"""
+ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import numpy as np
+from scipy import linalg as sla
+
+from ....hyp_defs import float_cpu
+from ....utils.math_funcs import invert_pdmat, invert_trimat, logdet_pdmat
+from .plda_base import PLDABase
+
+
+class SPLDA(PLDABase):
+    """Class for Simplified Probabilistic Linear Discriminant Analysis (SPLDA).
+    .. math::
+        \mathbf{x}_{ij} = \mu + \mathbf{V} \mathbf{y}_i + \varepsilon_{ij}
+
+    Attributes:
+      y_dim: speaker factor dimension.
+      mu: class-independent mean.
+      V: speaker factor loading matrix.
+      W: within-class precision.
+      fullcov_W: whether W is a full precision matrix or diagonal.
+      update_mu: whether to update mu or not when training the model.
+      update_V: whether to update V or not when training the model.
+      update_W: whether to update W or not when training the model.
+      x_dim: data dimension.
+ """ + + def __init__( + self, + y_dim=None, + mu=None, + V=None, + W=None, + fullcov_W=True, + update_mu=True, + update_V=True, + update_W=True, + epochs=20, + ml_md="ml+md", + md_epochs=None, + **kwargs + ): + super().__init__( + y_dim=y_dim, + mu=mu, + update_mu=update_mu, + epochs=epochs, + ml_md=ml_md, + md_epochs=md_epochs, + **kwargs + ) + if V is not None: + self.y_dim = V.shape[0] + self.V = V + self.W = W + self.fullcov_W = fullcov_W + self.update_V = update_V + self.update_W = update_W + + def validate(self): + """Validates the model parameters.""" + assert self.mu.shape[0] >= self.V.shape[0] + assert self.mu.shape[0] == self.V.shape[1] + assert self.mu.shape[0] == self.W.shape[0] + assert self.mu.shape[0] == self.W.shape[1] + + @property + def is_init(self): + """Returns True if the model has been initialized.""" + if self._is_init: + return True + if self.mu is not None and self.V is not None and self.W is not None: + self.validate() + self._is_init = True + return self._is_init + + def initialize(self, D): + """initializes the model. + + Args: + D: tuple of sufficient statistics (N, F, S) + """ + N, F, S = D + self.x_dim = F.shape[1] + M = F.shape[0] + N_tot = np.sum(N) + + Vytilde = F / N[:, None] + mu = np.mean(Vytilde, axis=0) + + Vy = Vytilde - mu + U, s, Vt = sla.svd(Vy, full_matrices=False, overwrite_a=True) + V = s[: self.y_dim, None] * Vt[: self.y_dim, :] + NVytilde = N[:, None] * Vytilde + C = (S - np.dot(NVytilde.T, Vytilde)) / N_tot + if self.fullcov_W: + W = invert_pdmat(C, return_inv=True)[-1] + else: + W = 1 / np.diag(C) + + self.mu = mu + self.V = V + self.W = W + + def compute_py_g_x( + self, D, return_cov=False, return_logpy_0=False, return_acc=False + ): + """Computes the posterior P(y|x) + + Args: + D: tuple of sufficient statistics (N, F, S) + return_cov: whether or not to return the posterior covariances. + return_logpy_0: whether or not to return log P(y=0|x). + return_acc: whether or not to return Ry and Py accumulators. 
+ + Returns: + Speaker factor posterior means with shape (num_speakers, y_dim) + Speaker factor posterior convariances with shape (num_speakers, y_dim, y_dim) + log P(y=0|x) with shape (num_spakers,) + Ry accumlator for ML step with shape (y_dim, y_dim) + Py accumlator for MD step with shape (y_dim, y_dim) + """ + if isinstance(D, tuple): + N, F, S = D + else: + F = D + N = np.ones((F.shape[0],), dtype=F.dtype) + S = None + + Fc = F - self.mu + + M = F.shape[0] + y_dim = self.y_dim + + WV = np.dot(self.W, self.V.T) + VV = np.dot(self.V, WV) + + compute_inv = return_cov or return_acc + return_tuple = compute_inv or return_logpy_0 + + N_is_int = False + if np.all(np.ceil(N) == N): + N_is_int = True + + I = np.eye(y_dim, dtype=float_cpu()) + gamma = np.dot(Fc, WV) + if N_is_int: + iterator = np.unique(N) + else: + iterator = range(M) + + y = np.zeros((M, y_dim), dtype=float_cpu()) + if return_cov: + Sigma_y = np.zeros((M, y_dim, y_dim), dtype=float_cpu()) + else: + Sigma_y = None + + if return_logpy_0: + logpy = -0.5 * y_dim * np.log(2 * np.pi) * np.ones((M,), dtype=float_cpu()) + + if return_acc: + Py = np.zeros((y_dim, y_dim), dtype=float_cpu()) + Ry = np.zeros((y_dim, y_dim), dtype=float_cpu()) + + for k in iterator: + if N_is_int: + i = (N == k).nonzero()[0] + N_i = k + M_i = len(i) + else: + i = k + N_i = N[k] + M_i = 1 + + L_i = I + N_i * VV + r = invert_pdmat( + L_i, + right_inv=True, + return_logdet=return_logpy_0, + return_inv=compute_inv, + ) + + mult_iL = r[0] + if return_logpy_0: + logL = r[2] + if compute_inv: + iL = r[-1] + + y[i, :] = mult_iL(gamma[i, :]) + + if return_cov: + Sigma_y[i, :, :] = iL + + if return_logpy_0: + logpy[i] += 0.5 * (logL - np.sum(y[i, :] * gamma[i, :], axis=-1)) + + if return_acc: + Py += M_i * iL + Ry += N_i * M_i * iL + + if not return_tuple: + return y + + r = [y] + if return_cov: + r += [Sigma_y] + if return_logpy_0: + r += [logpy] + if return_acc: + r += [Ry, Py] + return tuple(r) + + def Estep(self, D): + """Expectation step. + + Args: + D: tuple with sufficient statistics (N, F, S) + + Returns: + Tuple of statistics with accumlated expectations. + """ + N, F, S = D + y, logpy, Ry, Py = self.compute_py_g_x(D, return_logpy_0=True, return_acc=True) + + M = F.shape[0] + N_tot = np.sum(N) + F_tot = np.sum(F, axis=0) + + y_acc = np.sum(y, axis=0) + Cy = np.dot(F.T, y) + + Niy = y * N[:, None] + Ry1 = np.sum(Niy, axis=0) + Ry += np.dot(Niy.T, y) + Py += np.dot(y.T, y) + + logpy_acc = np.sum(logpy) + + stats = (N_tot, M, F_tot, S, logpy_acc, y_acc, Ry1, Ry, Cy, Py) + return stats + + def elbo(self, stats): + """Computes the objective function. + + Args: + stats: tuple of expectations computed at the Estep. + + Returns: + log P(X) + """ + N, M, F, S, logpy_x = stats[:5] + + logW = logdet_pdmat(self.W) + Fmu = np.outer(F, self.mu) + Shat = S - Fmu - Fmu.T + N * np.outer(self.mu, self.mu) + + logpx_y = 0.5 * ( + -N * self.x_dim * np.log(2 * np.pi) + + N * logW + - np.inner(self.W.ravel(), Shat.ravel()) + ) + logpy = -0.5 * M * self.y_dim * np.log(2 * np.pi) + + elbo = logpx_y + logpy - logpy_x + return elbo + + def MstepML(self, stats): + """Maximum likelihood estimation step. + + Args: + stats: tuple of expectations computed at the Estep. 
+ + """ + N, M, F, S, _, y_acc, Ry1, Ry, Cy, Py = stats + + a = np.hstack((Ry, Ry1[:, None])) + b = np.hstack((Ry1, N)) + Rytilde = np.vstack((a, b)) + + Cytilde = np.hstack((Cy, F[:, None])) + + if self.update_mu and not self.update_V: + self.mu = (F - np.dot(Ry1, self.V)) / N + + if not self.update_mu and self.update_V: + iRy_mult = invert_pdmat(Ry, right_inv=False)[0] + self.V = iRy_mult(Cy.T - np.outer(Ry1, self.mu)) + + if self.update_mu and self.update_V: + iRytilde_mult = invert_pdmat(Rytilde, right_inv=False)[0] + Vtilde = iRytilde_mult(Cytilde.T) + self.V = Vtilde[:-1, :] + self.mu = Vtilde[-1, :] + + if self.update_W: + if self.update_mu and self.update_V: + iW = (S - np.dot(Cy, self.V) - np.outer(F, self.mu)) / N + else: + Vtilde = np.vstack((self.V, self.mu)) + CVt = np.dot(Cytilde, Vtilde) + iW = (S - CVt - CVt.T + np.dot(np.dot(Vtilde.T, Rytilde), Vtilde)) / N + if self.fullcov_W: + self.W = invert_pdmat(iW, return_inv=True)[-1] + else: + self.W = np.diag(1 / np.diag(iW)) + + def MstepMD(self, stats): + """Minimum divergence estimation step. + + Args: + stats: tuple of expectations computed at the Estep. + + """ + N, M, F, S, _, y_acc, Ry1, Ry, Cy, Py = stats + mu_y = y_acc / M + + if self.update_mu: + self.mu += np.dot(mu_y, self.V) + + if self.update_V: + Cov_y = Py / M - np.outer(mu_y, mu_y) + chol_Cov_y = sla.cholesky(Cov_y, lower=False, overwrite_a=True) + self.V = np.dot(chol_Cov_y, self.V) + + def get_config(self): + """Returns the model configuration dict.""" + config = { + "update_W": self.update_W, + "update_V": self.update_V, + "fullcov_W": self.fullcov_W, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ + params = {"mu": self.mu, "V": self.V, "W": self.W} + self._save_params_from_dict(f, params) + + @classmethod + def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ + param_list = ["mu", "V", "W"] + params = cls._load_params_to_dict(f, config["name"], param_list) + kwargs = dict(list(config.items()) + list(params.items())) + return cls(**kwargs) + + def log_probx_g_y(self, x, y): + """Computes logP(X|Y) + + Args: + x: data samples with shape (num_samples, x_dim). + y: speaker factors for each sample with shape (num_samples, y_dim). + + Returns: + log P(X|Y) array with shape (num_samples,) + """ + logW = logdet_pdmat(self.W) + delta = x - self.mu - np.dot(y, self.V) + logp = ( + -x.shape[-1] * np.log(2 * np.pi) + + logW + - np.sum(np.dot(delta, self.W) * delta, axis=-1) + ) + logp /= 2 + return logp + + def llr_1vs1(self, x1, x2): + """log-likelihood ratio between target and non-target hypothesis for + the case of one enrollment and one test segments. + + Args: + x1: enrollment vectors with shape (num_enroll_segmens, x_dim). + x2: test vectors with shape (num_enroll_segmens, x_dim). + + Returns: + Score matrix with shape (num_enrollment_segments, num_test_segments). 
+ """ + WV = np.dot(self.W, self.V.T) + VV = np.dot(self.V, WV) + I = np.eye(self.y_dim, dtype=float_cpu()) + + Lnon = I + VV + mult_icholLnon, logcholLnon = invert_trimat( + sla.cholesky(Lnon, lower=False, overwrite_a=True), + right_inv=True, + return_logdet=True, + )[:2] + logLnon = 2 * logcholLnon + + Ltar = I + 2 * VV + mult_icholLtar, logcholLtar = invert_trimat( + sla.cholesky(Ltar, lower=False, overwrite_a=True), + right_inv=True, + return_logdet=True, + )[:2] + logLtar = 2 * logcholLtar + + VWF1 = np.dot(x1 - self.mu, WV) + VWF2 = np.dot(x2 - self.mu, WV) + + gamma_non_1 = mult_icholLnon(VWF1) + gamma_non_2 = mult_icholLnon(VWF2) + + Qnon_1 = np.sum(gamma_non_1 * gamma_non_1, axis=1)[:, None] + Qnon_2 = np.sum(gamma_non_2 * gamma_non_2, axis=1) + + gamma_tar_1 = mult_icholLtar(VWF1) + gamma_tar_2 = mult_icholLtar(VWF2) + + Qtar_1 = np.sum(gamma_tar_1 * gamma_tar_1, axis=1)[:, None] + Qtar_2 = np.sum(gamma_tar_2 * gamma_tar_2, axis=1) + + scores = 2 * np.dot(gamma_tar_1, gamma_tar_2.T) + scores += Qtar_1 - Qnon_1 + Qtar_2 - Qnon_2 + scores += 2 * logLnon - logLtar + scores *= 0.5 + return scores + + def llr_NvsM_book(self, D1, D2): + """log-likelihood ratio between target and non-target hypothesis for + the case of N segments/enrollment-side and M segments/test-side + evaluated with the exact formula (by the book). + + Args: + D1: tuple of sufficient statistics for the enrollment sides (N1, F1, S1). + D2: tuple of sufficient statistics for the test sides (N2, F2, S2). + + Returns: + Score matrix with shape (num_enrollment_sides, num_test_sides). + """ + N1, F1, _ = D1 + N2, F2, _ = D2 + + WV = np.dot(self.W, self.V.T) + VV = np.dot(self.V, WV) + I = np.eye(self.y_dim, dtype=float_cpu()) + + F1 -= N1[:, None] * self.mu + F2 -= N2[:, None] * self.mu + + scores = np.zeros((len(N1), len(N2)), dtype=float_cpu()) + for N1_i in np.unique(N1): + for N2_j in np.unique(N2): + i = np.where(N1 == N1_i)[0] + j = np.where(N2 == N2_j)[0] + L1 = I + N1_i * VV + mult_icholL1, logcholL1 = invert_trimat( + sla.cholesky(L1, lower=False, overwrite_a=True), + right_inv=True, + return_logdet=True, + )[:2] + logL1 = 2 * logcholL1 + + L2 = I + N2_j * VV + mult_icholL2, logcholL2 = invert_trimat( + sla.cholesky(L2, lower=False, overwrite_a=True), + right_inv=True, + return_logdet=True, + )[:2] + logL2 = 2 * logcholL2 + + Ltar = I + (N1_i + N2_j) * VV + mult_icholLtar, logcholLtar = invert_trimat( + sla.cholesky(Ltar, lower=False, overwrite_a=True), + right_inv=True, + return_logdet=True, + )[:2] + logLtar = 2 * logcholLtar + + VWF1 = np.dot(F1[i, :], WV) + VWF2 = np.dot(F2[j, :], WV) + + gamma_non_1 = mult_icholL1(VWF1) + gamma_non_2 = mult_icholL2(VWF2) + + Qnon_1 = np.sum(gamma_non_1 * gamma_non_1, axis=1)[:, None] + Qnon_2 = np.sum(gamma_non_2 * gamma_non_2, axis=1) + + gamma_tar_1 = mult_icholLtar(VWF1) + gamma_tar_2 = mult_icholLtar(VWF2) + + Qtar_1 = np.sum(gamma_tar_1 * gamma_tar_1, axis=1)[:, None] + Qtar_2 = np.sum(gamma_tar_2 * gamma_tar_2, axis=1) + + scores_ij = 2 * np.dot(gamma_tar_1, gamma_tar_2.T) + scores_ij += Qtar_1 - Qnon_1 + Qtar_2 - Qnon_2 + scores_ij += logL1 + logL2 - logLtar + scores[np.ix_(i, j)] = scores_ij + + scores *= 0.5 + return scores + + def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): + """Draws samples from the PLDA model. + + Args: + num_classes: number of classes to sample. + num_samples_per_class: number of samples to sample per each class. + rng: random number generator. + seed: random seed used if rng is None. 
+
+        Returns:
+          Generated samples with shape (num_classes * num_samples_per_class, x_dim).
+        """
+        if rng is None:
+            rng = np.random.default_rng(seed=seed)
+
+        Sw = invert_pdmat(self.W, return_inv=True)[-1]
+        chol_Sw = sla.cholesky(Sw, lower=False)
+
+        x_dim = self.mu.shape[0]
+        z = rng.normal(size=(num_classes * num_samples_per_class, x_dim)).astype(
+            dtype=float_cpu(), copy=False
+        )
+        z = np.dot(z, chol_Sw)
+        y = rng.normal(size=(num_classes, self.y_dim)).astype(
+            dtype=float_cpu(), copy=False
+        )
+        y = np.dot(y, self.V) + self.mu
+        y = np.repeat(y, num_samples_per_class, axis=0)
+
+        return y + z
+
+    def weighted_avg_params(self, mu, V, W, w_mu, w_B, w_W):
+        """Performs weighted average of the model parameters
+        and some given parameters.
+
+        Args:
+          mu: other mean vector.
+          V: other speaker factor loading matrix.
+          W: other within-class precision.
+          w_mu: weight of the given mean vector.
+          w_B: weight of the between-class covariance given by V.
+          w_W: weight of the within-class covariance given by W.
+
+        """
+        super().weighted_avg_params(mu, w_mu)
+        if w_B > 0:
+            Sb0 = np.dot(self.V.T, self.V)
+            Sb = np.dot(V.T, V)
+            Sb = w_B * Sb + (1 - w_B) * Sb0
+            w, V = sla.eigh(Sb, overwrite_a=True)
+            w = w[-self.y_dim :]
+            V = np.sqrt(w) * V[:, -self.y_dim :]
+            self.V = V.T
+
+        if w_W > 0:
+            Sw0 = invert_pdmat(self.W, return_inv=True)[-1]
+            Sw = invert_pdmat(W, return_inv=True)[-1]
+            Sw = w_W * Sw + (1 - w_W) * Sw0
+            self.W = invert_pdmat(Sw, return_inv=True)[-1]
+
+    def weighted_avg_model(self, plda, w_mu, w_B, w_W):
+        """Performs weighted average of the model parameters
+        and those of another model given as input.
+
+        Args:
+          plda: other PLDA model.
+          w_mu: weight of the other model's mean.
+          w_B: weight of the other model's between-class covariance.
+          w_W: weight of the other model's within-class covariance.
+
+        """
+        self.weighted_avg_params(plda.mu, plda.V, plda.W, w_mu, w_B, w_W)
+
+    def project(self, T, delta_mu=None):
+        """Transforms the PLDA parameters given an affine transformation
+        of the data.
+
+        Args:
+          T: data projection matrix.
+          delta_mu: data shift vector.
+
+        Returns:
+          Projected PLDA model.
+        """
+        mu = self.mu
+        if mu is not None:
+            if delta_mu is not None:
+                # out-of-place subtraction so we do not modify self.mu
+                mu = mu - delta_mu
+            mu = np.dot(mu, T)
+        V = np.dot(self.V, T)
+        Sw = invert_pdmat(self.W, return_inv=True)[-1]
+        Sw = np.dot(T.T, np.dot(Sw, T))
+        W = invert_pdmat(Sw, return_inv=True)[-1]
+
+        return SPLDA(mu=mu, V=V, W=W, fullcov_W=True)
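With `splda.py` in place, scoring follows the `PLDABase` API above. A short, illustrative sketch of the LLR entry points (toy data, illustrative names):

```python
import numpy as np
from hyperion.np.pdfs.plda.splda import SPLDA

rng = np.random.default_rng(0)
x = rng.normal(size=(800, 64))                      # toy dev vectors
plda = SPLDA(y_dim=32)
plda.fit(x, class_ids=np.repeat(np.arange(80), 10), epochs=10)

x_enr = rng.normal(size=(6, 64))                    # 3 enroll sides x 2 segments
x_tst = rng.normal(size=(4, 64))
ids_enr = np.array([0, 0, 1, 1, 2, 2])

llr_11 = plda.llr_1vs1(x_enr, x_tst)                # (6, 4) segment-level LLRs
llr_n1 = plda.llr_Nvs1(x_enr, x_tst, ids1=ids_enr)  # (3, 4), lnorm + vector avg.
```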
diff --git a/hyperion/np/preprocessing/__init__.py b/hyperion/np/preprocessing/__init__.py
new file mode 100644
index 00000000..8cbe932a
--- /dev/null
+++ b/hyperion/np/preprocessing/__init__.py
@@ -0,0 +1,6 @@
+"""
+ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+from .resampler import Resampler
diff --git a/hyperion/np/preprocessing/resampler.py b/hyperion/np/preprocessing/resampler.py
new file mode 100644
index 00000000..1c3e5901
--- /dev/null
+++ b/hyperion/np/preprocessing/resampler.py
@@ -0,0 +1,46 @@
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+
+class Resampler:
+    """Resamples audio to a target sampling frequency, caching one
+    torchaudio resampler per input frequency."""
+
+    def __init__(self, target_sample_freq: float):
+        self.target_sample_freq = target_sample_freq
+        self.resamplers = {}
+
+    def _get_resampler(self, input_sample_freq):
+        if input_sample_freq in self.resamplers:
+            return self.resamplers[input_sample_freq]
+
+        import torch
+        import torchaudio.transforms as tat
+
+        try:
+            resampler = tat.Resample(
+                int(input_sample_freq),
+                int(self.target_sample_freq),
+                lowpass_filter_width=64,
+                rolloff=0.9475937167399596,
+                resampling_method="sinc_interp_kaiser",
+                beta=14.769656459379492,
+            )
+        except ValueError:
+            # older torchaudio versions use "kaiser_window" instead of
+            # "sinc_interp_kaiser"
+            resampler = tat.Resample(
+                int(input_sample_freq),
+                int(self.target_sample_freq),
+                lowpass_filter_width=64,
+                rolloff=0.9475937167399596,
+                resampling_method="kaiser_window",
+                beta=14.769656459379492,
+            )
+        resampler_f = lambda x: resampler(torch.from_numpy(x)).numpy()
+        self.resamplers[input_sample_freq] = resampler_f
+        return resampler_f
+
+    def __call__(self, x, sample_freq: float):
+        if sample_freq == self.target_sample_freq:
+            return x, sample_freq
+
+        resampler = self._get_resampler(sample_freq)
+        return resampler(x), self.target_sample_freq
diff --git a/hyperion/np/score_norm/__init__.py b/hyperion/np/score_norm/__init__.py
new file mode 100644
index 00000000..7707b669
--- /dev/null
+++ b/hyperion/np/score_norm/__init__.py
@@ -0,0 +1,12 @@
+"""
+ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+
+from .adapt_s_norm import AdaptSNorm
+from .s_norm import SNorm
+from .t_norm import TNorm
+from .tz_norm import TZNorm
+from .z_norm import ZNorm
+from .zt_norm import ZTNorm
diff --git a/hyperion/np/score_norm/adapt_s_norm.py b/hyperion/np/score_norm/adapt_s_norm.py
new file mode 100644
index 00000000..294893ae
--- /dev/null
+++ b/hyperion/np/score_norm/adapt_s_norm.py
@@ -0,0 +1,500 @@
+"""
+ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import math
+
+import h5py
+import numpy as np
+
+from .score_norm import ScoreNorm
+
+
+class AdaptSNorm(ScoreNorm):
+    """Class for adaptive S-Norm.
+
+    Attributes:
+      nbest: number of cohort samples selected to compute the statistics for
+        each trial by the adaptive algorithm.
+      nbest_discard: discard the nbest_discard cohort samples with the highest
+        scores, which could be actual target trials.
+      std_floor: floor for standard deviations.
+    """
+
+    def __init__(
+        self,
+        nbest=100,
+        nbest_discard=0,
+        nbest_sel_method="highest-other-side",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.nbest = nbest
+        self.nbest_discard = nbest_discard
+        self.nbest_sel_method = nbest_sel_method
+
+    def __call__(
+        self,
+        scores,
+        scores_coh_test,
+        scores_enr_coh,
+        mask_coh_test=None,
+        mask_enr_coh=None,
+        return_stats=False,
+    ):
+        return self.predict(
+            scores,
+            scores_coh_test,
+            scores_enr_coh,
+            mask_coh_test,
+            mask_enr_coh,
+            return_stats,
+        )
+
+    def predict(
+        self,
+        scores,
+        scores_coh_test,
+        scores_enr_coh,
+        mask_coh_test=None,
+        mask_enr_coh=None,
+        return_stats=False,
+    ):
+        """Normalizes the scores.
+
+        Args:
+          scores: score matrix enroll vs. test.
+          scores_coh_test: score matrix cohort vs. test.
+          scores_enr_coh: score matrix enroll vs. cohort.
+          mask_coh_test: binary matrix to mask out target trials
+            from cohort vs. test matrix.
+          mask_enr_coh: binary matrix to mask out target trials
+            from enroll vs. cohort matrix.
+          return_stats: whether to also return the normalization means and stds.
+ + """ + + assert scores_enr_coh.shape[1] == scores_coh_test.shape[0] + assert self.nbest_discard < scores_enr_coh.shape[1] + if self.nbest > scores_enr_coh.shape[1] - self.nbest_discard: + nbest = scores_enr_coh.shape[1] - self.nbest_discard + else: + nbest = self.nbest + + if mask_coh_test is not None: + scores_coh_test[~mask_coh_test] = 0 + if mask_enr_coh is not None: + scores_enr_coh[~mask_enr_coh] = 0 + + if self.nbest_sel_method == "highest-other-side": + return self._norm_highest_other_side( + scores, + scores_coh_test, + scores_enr_coh, + mask_coh_test, + mask_enr_coh, + return_stats, + nbest, + ) + elif self.nbest_sel_method == "highest-same-side": + return self._norm_highest_same_side( + scores, + scores_coh_test, + scores_enr_coh, + mask_coh_test, + mask_enr_coh, + return_stats, + nbest, + ) + else: + raise Exception(f"invalid cohort selection method {self.nbest_sel_method}") + + def _norm_highest_other_side0( + self, + scores, + scores_coh_test, + scores_enr_coh, + mask_coh_test, + mask_enr_coh, + return_stats, + nbest, + ): + + if return_stats: + mu_z = np.zeros_like(scores) + mu_t = np.zeros_like(scores) + if self.norm_var: + s_z = np.zeros_like(scores) + s_t = np.zeros_like(scores) + else: + s_z = s_t = 1.0 + + scores_z_norm = np.zeros_like(scores) + best_idx = np.flipud(np.argsort(scores_coh_test, axis=0))[ + self.nbest_discard : self.nbest_discard + nbest + ] + for i in range(scores.shape[1]): + best_idx_i = best_idx[:, i] + + best_scores_i = scores_enr_coh[:, best_idx_i] + mu_z_i = np.mean(best_scores_i, axis=1, keepdims=False) + + if mask_enr_coh is None: + s_z_i = np.std(best_scores_i, axis=1, keepdims=False) + else: + norm = np.mean(mask_enr_coh[:, best_idx_i], axis=1, keepdims=False) + mu_z_i /= norm + s_z_i = np.sqrt( + np.mean(best_scores_i ** 2, axis=1, keepdims=False) / norm + - mu_z_i ** 2 + ) + + s_z_i = np.clip(s_z_i, a_min=1e-5, a_max=None) + if not self.norm_var: + s_z_i = 1.0 + + scores_z_norm[:, i] = (scores[:, i] - mu_z_i) / s_z_i + if return_stats: + mu_z[:, i] = mu_z_i + if self.norm_var: + s_z[:, i] = s_z_i + + scores_t_norm = np.zeros_like(scores) + best_idx = np.fliplr(np.argsort(scores_enr_coh, axis=1))[ + :, self.nbest_discard : self.nbest_discard + nbest + ] + for i in range(scores.shape[0]): + best_idx_i = best_idx[i] + best_scores_i = scores_coh_test[best_idx_i, :] + mu_t_i = np.mean(best_scores_i, axis=0, keepdims=False) + + if mask_coh_test is None: + s_t_i = np.std(best_scores_i, axis=0, keepdims=False) + else: + norm = np.mean(mask_coh_test[best_idx_i, :], axis=0, keepdims=False) + mu_t_i /= norm + s_t_i = np.sqrt( + np.mean(best_scores_i ** 2, axis=0, keepdims=False) / norm + - mu_t_i ** 2 + ) + + s_t_i = np.clip(s_t_i, a_min=1e-5, a_max=None) + if not self.norm_var: + s_t_i = 1.0 + + scores_t_norm[i, :] = (scores[i, :] - mu_t_i) / s_t_i + if return_stats: + mu_t[i, :] = mu_t_i + if self.norm_var: + s_t[i, :] = s_t_i + + scores_norm = (scores_z_norm + scores_t_norm) / np.sqrt(2) + if return_stats: + return scores_norm, mu_z, s_z, mu_t, s_t + else: + return scores_norm + + def _norm_highest_other_side( + self, + scores, + scores_coh_test, + scores_enr_coh, + mask_coh_test, + mask_enr_coh, + return_stats, + nbest, + ): + + # this is very memory intensive, so we pass to f32 + scores_coh_test = scores_coh_test.astype("float32", copy=False) + scores_enr_coh = scores_enr_coh.astype("float32", copy=False) + + best_idx = np.argsort(-scores_coh_test, axis=0)[ + self.nbest_discard : self.nbest_discard + nbest + ].T # (n_test, n_best) + + mem = nbest 
* scores_enr_coh.shape[0] * scores.shape[1] * 4 / 2 ** 30
+        # limit mem to 10 GB
+        num_groups = math.ceil(mem / 10)
+        num_el_group = int(math.ceil(scores.shape[1] / num_groups))
+        scores_enr_coh = np.expand_dims(scores_enr_coh, 0)
+        if mask_enr_coh is not None:
+            mask_enr_coh = np.expand_dims(mask_enr_coh, 0)
+
+        mu_z = []
+        s_z = []
+        for start in range(0, scores.shape[1], num_el_group):
+            stop = min(start + num_el_group, scores.shape[1])
+            best_idx_i = np.expand_dims(best_idx[start:stop], 1)
+            best_scores_i = np.take_along_axis(scores_enr_coh, best_idx_i, axis=-1)
+            mu_z_i = best_scores_i.mean(axis=-1)
+
+            if mask_enr_coh is None:
+                s_z_i = np.std(best_scores_i, axis=-1)
+            else:
+                mask_i = np.take_along_axis(mask_enr_coh, best_idx_i, axis=-1)
+                norm = mask_i.mean(axis=-1)
+                mu_z_i /= norm
+                s_z_i = np.sqrt(
+                    np.mean(best_scores_i ** 2, axis=-1) / norm - mu_z_i ** 2
+                )
+
+            del best_scores_i
+            mu_z.append(mu_z_i.T)
+            s_z.append(s_z_i.T)
+
+        mu_z = np.concatenate(mu_z, axis=-1)
+        s_z = np.concatenate(s_z, axis=-1)
+
+        s_z = np.clip(s_z, a_min=1e-5, a_max=None)
+        if not self.norm_var:
+            s_z = 1.0
+
+        scores_z_norm = (scores - mu_z) / s_z
+
+        scores_enr_coh = scores_enr_coh[0]  # undo the expand_dims above
+        best_idx = np.argsort(-scores_enr_coh, axis=1)[
+            :, self.nbest_discard : self.nbest_discard + nbest
+        ].T
+
+        mem = nbest * scores.shape[0] * scores_coh_test.shape[1] * 4 / 2 ** 30
+        # limit mem to 10 GB
+        num_groups = math.ceil(mem / 10)
+        num_el_group = int(math.ceil(scores.shape[0] / num_groups))
+        scores_coh_test = np.expand_dims(scores_coh_test, -1)
+        if mask_coh_test is not None:
+            mask_coh_test = np.expand_dims(mask_coh_test, -1)
+
+        mu_t = []
+        s_t = []
+        for start in range(0, scores.shape[0], num_el_group):
+            stop = min(start + num_el_group, scores.shape[0])
+            best_idx_i = np.expand_dims(best_idx[:, start:stop], 1)
+            best_scores_i = np.take_along_axis(scores_coh_test, best_idx_i, axis=0)
+            mu_t_i = best_scores_i.mean(axis=0)
+            if mask_coh_test is None:
+                s_t_i = np.std(best_scores_i, axis=0)
+            else:
+                mask_i = np.take_along_axis(mask_coh_test, best_idx_i, axis=0)
+                norm = mask_i.mean(axis=0)
+                mu_t_i /= norm
+                s_t_i = np.sqrt(
+                    np.mean(best_scores_i ** 2, axis=0) / norm - mu_t_i ** 2
+                )
+
+            del best_scores_i
+            mu_t.append(mu_t_i.T)
+            s_t.append(s_t_i.T)
+
+        mu_t = np.concatenate(mu_t, axis=0)
+        s_t = np.concatenate(s_t, axis=0)
+
+        s_t = np.clip(s_t, a_min=1e-5, a_max=None)
+        if not self.norm_var:
+            s_t = 1.0
+
+        scores_t_norm = (scores - mu_t) / s_t
+
+        scores_norm = (scores_z_norm + scores_t_norm) / np.sqrt(2)
+        if return_stats:
+            return scores_norm, mu_z, s_z, mu_t, s_t
+        else:
+            return scores_norm
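The chunked selection above hinges on `np.take_along_axis` broadcasting. A small, self-contained illustration of the same gather (shapes are illustrative, not the method's API):

```python
import numpy as np

scores_enr_coh = np.random.randn(5, 50)     # enroll x cohort
scores_coh_test = np.random.randn(50, 7)    # cohort x test
nbest = 10

# for each test segment, indices of its nbest highest-scoring cohort members
best_idx = np.argsort(-scores_coh_test, axis=0)[:nbest].T   # (n_test, nbest)

# gather, for every (enroll, test) pair, the enroll-vs-cohort scores of the
# cohort members selected by the test segment
sel = np.take_along_axis(
    scores_enr_coh[None],        # (1, n_enr, n_coh)
    best_idx[:, None, :],        # (n_test, 1, nbest)
    axis=-1,
)                                # -> (n_test, n_enr, nbest)
mu_z = sel.mean(axis=-1).T       # (n_enr, n_test) adaptive z-norm means
```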
+
+    def _norm_highest_same_side0(
+        self,
+        scores,
+        scores_coh_test,
+        scores_enr_coh,
+        mask_coh_test,
+        mask_enr_coh,
+        return_stats,
+        nbest,
+    ):
+
+        if return_stats:
+            mu_z = np.zeros_like(scores)
+            mu_t = np.zeros_like(scores)
+            if self.norm_var:
+                s_z = np.zeros_like(scores)
+                s_t = np.zeros_like(scores)
+            else:
+                s_z = s_t = 1.0
+
+        best_idx = np.fliplr(np.argsort(scores_enr_coh, axis=1))[
+            :, self.nbest_discard : self.nbest_discard + nbest
+        ]
+
+        scores_z_norm = np.zeros_like(scores)
+        for i in range(scores.shape[0]):
+            best_idx_i = best_idx[i]
+            best_scores_i = scores_enr_coh[:, best_idx_i]
+            mu_z_i = np.mean(best_scores_i, axis=1, keepdims=False)
+
+            if mask_enr_coh is None:
+                s_z_i = np.std(best_scores_i, axis=1, keepdims=False)
+            else:
+                norm = np.mean(mask_enr_coh[:, best_idx_i], axis=1, keepdims=False)
+                mu_z_i /= norm
+                s_z_i = np.sqrt(
+                    np.mean(best_scores_i ** 2, axis=1, keepdims=False) / norm
+                    - mu_z_i ** 2
+                )
+
+            s_z_i = np.clip(s_z_i, a_min=1e-5, a_max=None)
+            if not self.norm_var:
+                s_z_i = 1.0
+
+            scores_z_norm[:, i] = (scores[:, i] - mu_z_i) / s_z_i
+            if return_stats:
+                mu_z[:, i] = mu_z_i
+                if self.norm_var:
+                    s_z[:, i] = s_z_i
+
+        best_idx = np.flipud(np.argsort(scores_coh_test, axis=0))[
+            self.nbest_discard : self.nbest_discard + nbest
+        ]
+        scores_t_norm = np.zeros_like(scores)
+        for i in range(scores.shape[1]):
+            best_idx_i = best_idx[:, i]
+
+            best_scores_i = scores_coh_test[best_idx_i, :]
+            mu_t_i = np.mean(best_scores_i, axis=0, keepdims=False)
+
+            if mask_coh_test is None:
+                s_t_i = np.std(best_scores_i, axis=0, keepdims=False)
+            else:
+                norm = np.mean(mask_coh_test[best_idx_i, :], axis=0, keepdims=False)
+                mu_t_i /= norm
+                s_t_i = np.sqrt(
+                    np.mean(best_scores_i ** 2, axis=0, keepdims=False) / norm
+                    - mu_t_i ** 2
+                )
+
+            s_t_i = np.clip(s_t_i, a_min=1e-5, a_max=None)
+            if not self.norm_var:
+                s_t_i = 1.0
+
+            scores_t_norm[i, :] = (scores[i, :] - mu_t_i) / s_t_i
+            if return_stats:
+                mu_t[i, :] = mu_t_i
+                if self.norm_var:
+                    s_t[i, :] = s_t_i
+
+        scores_norm = (scores_z_norm + scores_t_norm) / np.sqrt(2)
+        if return_stats:
+            return scores_norm, mu_z, s_z, mu_t, s_t
+        else:
+            return scores_norm
+
+    def _norm_highest_same_side(
+        self,
+        scores,
+        scores_coh_test,
+        scores_enr_coh,
+        mask_coh_test,
+        mask_enr_coh,
+        return_stats,
+        nbest,
+    ):
+
+        # this is very memory intensive, so we pass to f32
+        scores_coh_test = scores_coh_test.astype("float32", copy=False)
+        scores_enr_coh = scores_enr_coh.astype("float32", copy=False)
+
+        best_idx = np.argsort(-scores_enr_coh, axis=1)[
+            :, self.nbest_discard : self.nbest_discard + nbest
+        ]
+
+        mem = nbest * scores_enr_coh.shape[0] * scores.shape[0] * 4 / 2 ** 30
+        # limit mem to 10 GB
+        num_groups = math.ceil(mem / 10)
+        num_el_group = int(math.ceil(scores.shape[0] / num_groups))
+        scores_enr_coh = np.expand_dims(scores_enr_coh, 0)
+        if mask_enr_coh is not None:
+            mask_enr_coh = np.expand_dims(mask_enr_coh, 0)
+
+        mu_z = []
+        s_z = []
+        for start in range(0, scores.shape[0], num_el_group):
+            stop = min(start + num_el_group, scores.shape[0])
+            best_idx_i = np.expand_dims(best_idx[start:stop], 1)
+            best_scores_i = np.take_along_axis(scores_enr_coh, best_idx_i, axis=-1)
+            mu_z_i = best_scores_i.mean(axis=-1)
+
+            if mask_enr_coh is None:
+                s_z_i = np.std(best_scores_i, axis=-1)
+            else:
+                mask_i = np.take_along_axis(mask_enr_coh, best_idx_i, axis=-1)
+                norm = mask_i.mean(axis=-1)
+                mu_z_i /= norm
+                s_z_i = np.sqrt(
+                    np.mean(best_scores_i ** 2, axis=-1) / norm - mu_z_i ** 2
+                )
+
+            del best_scores_i
+            mu_z.append(mu_z_i.T)
+            s_z.append(s_z_i.T)
+
+        mu_z = np.concatenate(mu_z, axis=-1)
+        s_z = np.concatenate(s_z, axis=-1)
+
+        s_z = np.clip(s_z, a_min=1e-5, a_max=None)
+        if not self.norm_var:
+            s_z = 1.0
+
+        scores_z_norm = (scores - mu_z) / s_z
+
+        best_idx = np.argsort(-scores_coh_test, axis=0)[
+            self.nbest_discard : self.nbest_discard + nbest
+        ]  # (n_best, n_test)
+
+        mem = nbest * scores.shape[1] * scores_coh_test.shape[1] * 4 / 2 ** 30
+        # limit mem to 10 GB
+        num_groups = math.ceil(mem / 10)
+        num_el_group = int(math.ceil(scores.shape[1] / num_groups))
+        scores_coh_test = np.expand_dims(scores_coh_test, -1)
+        if mask_coh_test is not None:
+            mask_coh_test = np.expand_dims(mask_coh_test, -1)
+
+        mu_t = []
+        s_t = []
+        for start in range(0, scores.shape[1], num_el_group):
+            stop = min(start + num_el_group, scores.shape[1])
+            best_idx_i = np.expand_dims(best_idx[:, start:stop], 1)
+            best_scores_i = np.take_along_axis(scores_coh_test, best_idx_i, axis=0)
+            mu_t_i = best_scores_i.mean(axis=0)
+            if mask_coh_test is None:
+                s_t_i = np.std(best_scores_i, axis=0)
+            else:
+                mask_i = np.take_along_axis(mask_coh_test, best_idx_i, axis=0)
+                norm = mask_i.mean(axis=0)
+                mu_t_i /= norm
+                s_t_i = np.sqrt(
+                    np.mean(best_scores_i ** 2, axis=0) / norm - mu_t_i ** 2
+                )
+
+            del best_scores_i
+            mu_t.append(mu_t_i.T)
+            s_t.append(s_t_i.T)
+
+        mu_t = np.concatenate(mu_t, axis=0)
+        s_t = np.concatenate(s_t, axis=0)
+
+        s_t = np.clip(s_t, a_min=1e-5, a_max=None)
+        if not self.norm_var:
+            s_t = 1.0
+
+        scores_t_norm = (scores - mu_t) / s_t
+
+        scores_norm = (scores_z_norm + scores_t_norm) / np.sqrt(2)
+        if return_stats:
+            return scores_norm, mu_z, s_z, mu_t, s_t
+        else:
+            return scores_norm
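A minimal usage sketch for the adaptive S-norm above (cohort sizes and scores are illustrative toy data):

```python
import numpy as np
from hyperion.np.score_norm import AdaptSNorm

scores = np.random.randn(10, 20)            # enroll x test
scores_coh_test = np.random.randn(500, 20)  # cohort x test
scores_enr_coh = np.random.randn(10, 500)   # enroll x cohort

# for each trial, use the 100 best cohort scores, skipping the top 5,
# which could be actual target trials
snorm = AdaptSNorm(nbest=100, nbest_discard=5)
scores_norm = snorm(scores, scores_coh_test, scores_enr_coh)
```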
+ """ + + def __init__(self, norm_var=True, std_floor=1e-5, **kwargs): + super().__init__(*kwargs) + self.norm_var = norm_var + self.std_floor = std_floor + + def forward(self, **kwargs): + """Overloads predict function.""" + return self.predict(**kwargs) + + def __call__(self, *args, **kwargs): + """Overloads predict function.""" + return self.predict(*args, **kwargs) diff --git a/hyperion/np/score_norm/t_norm.py b/hyperion/np/score_norm/t_norm.py new file mode 100644 index 00000000..bf514b3d --- /dev/null +++ b/hyperion/np/score_norm/t_norm.py @@ -0,0 +1,45 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import h5py +import numpy as np + +from .score_norm import ScoreNorm + + +class TNorm(ScoreNorm): + """Class for T-Norm score normalization.""" + + def predict(self, scores, scores_coh_test, mask=None): + """Normalizes the scores. + + Args: + scores: score matrix enroll vs. test. + scores_coh_test: score matrix cohort vs. test. + mask: binary matrix to mask out target trials + from cohort vs test matrix. + + """ + if mask is None: + mu_t = np.mean(scores_coh_test, axis=0, keepdims=True) + if self.norm_var: + s_t = np.std(scores_coh_test, axis=0, keepdims=True) + else: + scores_coh_test[mask == False] = 0 + n_t = np.mean(mask, axis=0, keepdims=True) + mu_t = np.mean(scores_coh_test, axis=0, keepdims=True) / n_t + if self.norm_var: + s_t = np.sqrt( + np.mean(scores_coh_test ** 2, axis=0, keepdims=True) / n_t + - mu_t ** 2 + ) + + if self.norm_var: + s_t[s_t < self.std_floor] = self.std_floor + else: + s_t = 1.0 + + scores_norm = (scores - mu_t) / s_t + return scores_norm diff --git a/hyperion/np/score_norm/tz_norm.py b/hyperion/np/score_norm/tz_norm.py new file mode 100644 index 00000000..6127091d --- /dev/null +++ b/hyperion/np/score_norm/tz_norm.py @@ -0,0 +1,54 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import numpy as np + +from .score_norm import ScoreNorm +from .t_norm import TNorm +from .z_norm import ZNorm + + +class TZNorm(ScoreNorm): + """Class for TZ-Norm score normalization.""" + + def __init__(self, **kwargs): + super().__init__(*kwargs) + self.t_norm = TNorm(**kwargs) + self.z_norm = ZNorm(**kwargs) + + def predict( + self, + scores, + scores_coh_test, + scores_enr_coh, + scores_coh_coh, + mask_coh_test=None, + mask_enr_coh=None, + mask_coh_coh=None, + ): + """Normalizes the scores. + + Args: + scores: score matrix enroll vs. test. + scores_coh_test: score matrix cohort vs. test. + scores_enr_coh: score matrix enroll vs cohort. + scores_coh_coh: score matrix cohort vs cohort. + mask_coh_test: binary matrix to mask out target trials + from cohort vs test matrix. + mask_enr_coh: binary matrix to mask out target trials + from enroll vs. cohort matrix. + mask_coh_coh: binary matrix to mask out target trials + from cohort vs. cohort matrix. 
+ """ + + scores_t_norm = self.t_norm.predict(scores, scores_coh_test, mask_coh_test) + scores_enr_coh_t_norm = self.t_norm.predict( + scores_enr_coh, scores_coh_coh, mask_coh_coh + ) + scores_tz_norm = self.z_norm.predict( + scores_t_norm, scores_enr_coh_t_norm, mask_enr_coh + ) + + return scores_tz_norm diff --git a/hyperion/np/score_norm/z_norm.py b/hyperion/np/score_norm/z_norm.py new file mode 100644 index 00000000..7b9e32d8 --- /dev/null +++ b/hyperion/np/score_norm/z_norm.py @@ -0,0 +1,46 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import numpy as np + +from .score_norm import ScoreNorm + + +class ZNorm(ScoreNorm): + """ + Class for Z-Norm score normalization. + """ + + def predict(self, scores, scores_enr_coh, mask=None): + """Normalizes the scores. + + Args: + scores: score matrix enroll vs. test. + scores_enr_coh: score matrix enroll vs cohort. + mask: binary matrix to mask out target trials + from enroll vs. cohort matrix. + + """ + if mask is None: + mu_z = np.mean(scores_enr_coh, axis=1, keepdims=True) + if self.norm_var: + s_z = np.std(scores_enr_coh, axis=1, keepdims=True) + else: + scores_enr_coh[mask == False] = 0 + n_z = np.mean(mask, axis=1, keepdims=True) + mu_z = np.mean(scores_enr_coh, axis=1, keepdims=True) / n_z + if self.norm_var: + s_z = np.sqrt( + np.mean(scores_enr_coh ** 2, axis=1, keepdims=True) / n_z + - mu_z ** 2 + ) + + if self.norm_var: + s_z[s_z < self.std_floor] = self.std_floor + else: + s_z = 1.0 + + scores_norm = (scores - mu_z) / s_z + return scores_norm diff --git a/hyperion/np/score_norm/zt_norm.py b/hyperion/np/score_norm/zt_norm.py new file mode 100644 index 00000000..078dd8ce --- /dev/null +++ b/hyperion/np/score_norm/zt_norm.py @@ -0,0 +1,55 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import h5py +import numpy as np + +from .score_norm import ScoreNorm +from .t_norm import TNorm +from .z_norm import ZNorm + + +class ZTNorm(ScoreNorm): + """Class ZT-Norm score-normalization.""" + + def __init__(self, **kwargs): + super().__init__(*kwargs) + self.t_norm = TNorm(**kwargs) + self.z_norm = ZNorm(**kwargs) + + def predict( + self, + scores, + scores_coh_test, + scores_enr_coh, + scores_coh_coh, + mask_coh_test=None, + mask_enr_coh=None, + mask_coh_coh=None, + ): + """Normalizes the scores. + + Args: + scores: score matrix enroll vs. test. + scores_coh_test: score matrix cohort vs. test. + scores_enr_coh: score matrix enroll vs cohort. + scores_coh_coh: score matrix cohort vs cohort. + mask_coh_test: binary matrix to mask out target trials + from cohort vs test matrix. + mask_enr_coh: binary matrix to mask out target trials + from enroll vs. cohort matrix. + mask_coh_coh: binary matrix to mask out target trials + from cohort vs. cohort matrix. 
+ """ + + scores_z_norm = self.z_norm.predict(scores, scores_enr_coh, mask_enr_coh) + scores_coh_test_z_norm = self.z_norm.predict( + scores_coh_test, scores_coh_coh, mask_coh_coh + ) + scores_zt_norm = self.t_norm.predict( + scores_z_norm, scores_coh_test_z_norm, mask_coh_test + ) + + return scores_zt_norm diff --git a/hyperion/np/transforms/__init__.py b/hyperion/np/transforms/__init__.py new file mode 100644 index 00000000..c963e32b --- /dev/null +++ b/hyperion/np/transforms/__init__.py @@ -0,0 +1,19 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from .cent_whiten import CentWhiten +from .cent_whiten_up import CentWhitenUP +from .coral import CORAL +from .gaussianizer import Gaussianizer +from .lda import LDA +from .lnorm import LNorm +from .lnorm_up import LNormUP +from .mvn import MVN +from .nap import NAP +from .nda import NDA +from .pca import PCA +from .sb_sw import SbSw +from .skl_tsne import SklTSNE +from .transform_list import TransformList diff --git a/hyperion/np/transforms/cent_whiten.py b/hyperion/np/transforms/cent_whiten.py new file mode 100644 index 00000000..35e79d80 --- /dev/null +++ b/hyperion/np/transforms/cent_whiten.py @@ -0,0 +1,181 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import h5py +import numpy as np +import scipy.linalg as la +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ..np_model import NPModel +from ..pdfs import Normal + + +class CentWhiten(NPModel): + """Class to do centering and whitening of i-vectors. + + Attributes: + mu: data mean vector + T: whitening projection. + update_mu: whether or not to update the mean when training. + update_T: wheter or not to update T when training. + """ + + def __init__(self, mu=None, T=None, update_mu=True, update_T=True, **kwargs): + super().__init__(**kwargs) + self.mu = mu + self.T = T + self.update_mu = update_mu + self.update_T = update_T + + def __call__(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def predict(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + if self.mu is not None: + x = x - self.mu + if self.T is not None: + if self.T.ndim == 1: + x = x * self.T + else: + x = np.dot(x, self.T) + return x + + def fit(self, x=None, sample_weight=None, mu=None, S=None): + """Trains the model. + + Args: + x: training data samples with shape (num_samples, x_dim). + sample_weight: weight for each training sample. + mu: precomputed mean (used if x is None). + S: precomputed convariances (used if x is None). 
+ """ + if x is not None: + if x.shape[0] > x.shape[1]: + gauss = Normal(x_dim=x.shape[1]) + gauss.fit(x=x, sample_weight=sample_weight) + mu = gauss.mu + S = gauss.Sigma + else: + mu = np.mean(x, axis=0) + S = np.eye(x.shape[1]) + + if self.update_mu: + self.mu = mu + + if self.update_T: + d, V = la.eigh(S) + V *= np.sqrt(1 / d) + V = np.fliplr(V) + + p = V[0, :] < 0 + V[:, p] *= -1 + + nonzero = d > 0 + if not np.all(nonzero): + V = V[:, nonzero[::-1]] + + self.T = V + + def get_config(self): + """Returns the model configuration dict.""" + config = {"update_mu": self.update_mu, "update_t": self.update_T} + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ + params = {"mu": self.mu, "T": self.T} + self._save_params_from_dict(f, params) + + @classmethod + def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ + param_list = ["mu", "T"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls(mu=params["mu"], T=params["T"], **config) + + @classmethod + def load_mat(cls, file_path): + with h5py.File(file_path, "r") as f: + mu = np.asarray(f["mu"], dtype="float32") + T = np.asarray(f["T"], dtype="float32") + return cls(mu, T) + + def save_mat(self, file_path): + with h5py.File(file_path, "w") as f: + f.create_dataset("mu", data=self.mu) + f.create_dataset("T", data=self.T) + + @staticmethod + def filter_args(**kwargs): + valid_args = ("update_mu", "update_T", "name") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--update-mu", + default=ActionYesNo, + type=bool, + help=("updates centering parameter"), + ) + + parser.add_argument( + "--update-T", + default=True, + type=ActionYesNo, + help=("updates whitening parameter"), + ) + + parser.add_argument("--name", default="lnorm") + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + + add_argparse_args = add_class_args diff --git a/hyperion/np/transforms/cent_whiten_up.py b/hyperion/np/transforms/cent_whiten_up.py new file mode 100644 index 00000000..7e677d16 --- /dev/null +++ b/hyperion/np/transforms/cent_whiten_up.py @@ -0,0 +1,74 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import h5py +import numpy as np +import scipy.linalg as la + +from ..np_model import NPModel +from ..pdfs import Normal +from .cent_whiten import CentWhiten + + +class CentWhitenUP(CentWhiten): + """Class to do centering and whitening with uncertainty propagation. + + Attributes: + mu: data mean vector + T: whitening projection. + update_mu: whether or not to update the mean when training. + update_T: wheter or not to update T when training. + """ + + def __init__(self, mu=None, T=None, update_mu=True, update_T=True, **kwargs): + super().__init__(mu, T, update_mu, update_T, **kwargs) + + def __call__(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. 
+ """ + return self.predict(x) + + def forward(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def predict(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + x_dim = int(x.shape[-1] / 2) + m_x = x[:, :x_dim] + s2_x = x[:, x_dim:] + m_x = super().predict(m_x) + for i in range(x.shape[0]): + s2_x[i] = np.diag(np.dot(self.T.T * s2_x[i], self.T)) + return np.hstack((m_x, s2_x)) + + def fit(self, x, sample_weight=None): + """Trains the transformation parameters. + + Args: + x: training samples with shape (num_samples, x_dim) + """ + x = x[:, : int(x.shape[-1] / 2)] + super().fit(x, sample_weight=sample_weight) diff --git a/hyperion/np/transforms/coral.py b/hyperion/np/transforms/coral.py new file mode 100644 index 00000000..90cc9774 --- /dev/null +++ b/hyperion/np/transforms/coral.py @@ -0,0 +1,171 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import h5py +import numpy as np +import scipy.linalg as la + +from ..np_model import NPModel + + +class CORAL(NPModel): + """Class to do CORAL. + + https://arxiv.org/abs/1612.01939 + + Attributes: + mu: mean shift between both domains. + T_col: recoloring projection. + T_white: whitening projection. + update_mu: whether or not to update mu when training. + update_T: wheter or not to update T_col and T_white when training. + alpha_mu: weight of the in-domain data when computing in-domain mean. + alpha_T: weight of the in-domain data when computing in-domain covariance. + """ + + def __init__( + self, + mu=None, + T_col=None, + T_white=None, + update_mu=True, + update_T=True, + alpha_mu=1, + alpha_T=1, + **kwargs + ): + super().__init__(**kwargs) + self.mu = mu + self.T_col = T_col + self.T_white = T_white + self.T = None + self.update_mu = update_mu + self.update_T = update_T + self.alpha_mu = alpha_mu + self.alpha_T = alpha_T + + def get_config(self): + """Returns the model configuration dict.""" + config = { + "update_mu": self.update_mu, + "update_t": self.update_T, + "alpha_mu": self.alpha_mu, + "alpha_T": self.alpha_T, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def _compute_T(self): + if self.T_col is not None and self.T_white is not None: + self.T = np.dot(self.T_white, self.T_col) + + def __call__(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def predict(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + if self.T is None: + self._compute_T() + if self.mu is not None: + x = x - self.mu + + if self.T is not None: + x = np.dot(x, self.T) + + return x + + def fit(self, x, sample_weight=None, x_out=None, sample_weight_out=None): + """Trains the model. + + Args: + x: in-domain data samples with shape (num_samples, x_dim). + sample_weight: weight for each in-domain training sample. + x_out: out-domain data samples with shape (num_samples, x_dim). + sample_weight_out: weight for each out-domain training sample. 
+ """ + if x_out is None: + assert self.T_white is not None + else: + mu_out = np.mean(x_out, axis=0) + if self.update_T: + delta = x_out - mu_out + S_out = np.dot(delta.T, delta) / x_out.shape[0] + # zero-phase component analysis (ZCA) + d, V = la.eigh(S_out) + self.T_white = np.dot(V * (1 / np.sqrt(d)), V.T) + + mu_in = np.mean(x, axis=0) + if self.update_T: + delta = x - mu_in + S_in = np.dot(delta.T, delta) / x.shape[0] + if self.alpha_T < 1: + S_in = self.alpha_T * S_in + (1 - self.alpha_T) * S_out + # zero-phase component analysis (ZCA) + d, V = la.eigh(S_in) + d[d < 0] = 0 + self.T_col = np.dot(V * np.sqrt(d), V.T) + + if self.update_mu: + self.mu = self.alpha_mu * (mu_out - mu_in) + + @classmethod + def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ + param_list = ["mu", "T_col", "T_white"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls( + mu=params["mu"], + T_col=params["T_col"], + T_white=params["T_white"], + **config, + ) + + def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ + params = { + "mu": self.mu, + "T_col": self.T_col, + "T_white": self.T_white, + } + self._save_params_from_dict(f, params) diff --git a/hyperion/np/transforms/gaussianizer.py b/hyperion/np/transforms/gaussianizer.py new file mode 100644 index 00000000..2c208e02 --- /dev/null +++ b/hyperion/np/transforms/gaussianizer.py @@ -0,0 +1,159 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging + +import h5py +import numpy as np +import scipy.linalg as la +from scipy.special import erfinv + +from ...hyp_defs import float_cpu +from ..np_model import NPModel + + +class Gaussianizer(NPModel): + """Class to make i-vector distribution standard Normal. + + Args: + max_vectors: maximum number of background vectors needed to + compute the Gaussianization. + r: background vector matrix obtained by fit function. + """ + + def __init__(self, max_vectors=None, r=None, **kwargs): + super().__init__(**kwargs) + self.max_vectors = max_vectors + self.r = r + + def __call__(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def predict(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + # px_cum = np.linspace(0, 1, self.r.shape[0] + 2)[1:-1] + px_cum = np.linspace(0, 1, self.r.shape[0] + 3)[1:-1] + y_map = erfinv(2 * px_cum - 1) * np.sqrt(2) + + # r = self.r[1:] + r = self.r + y = np.zeros_like(x) + for i in range(x.shape[1]): + y_index = np.searchsorted(r[:, i], x[:, i]) + logging.debug(y_index) + y[:, i] = y_map[y_index] + + return y + + def fit(self, x): + """Trains the model. + + Args: + x: training data samples with shape (num_samples, x_dim). 
+ """ + r = np.sort(x, axis=0, kind="heapsort") + # x = np.zeros((1, x.shape[-1]), dtype=float_cpu()) + + if r.shape[0] > self.max_vectors: + index = np.round( + np.linspace(0, r.shape[0] - 1, self.max_vectors, dtype=float) + ).astype(int) + r = r[index, :] + + # self.r = np.vstack((x, r)) + self.r = r + + def get_config(self): + """Returns the model configuration dict.""" + config = {"max_vectors": self.max_vectors} + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ + params = {"r": self.r} + self._save_params_from_dict(f, params) + + @classmethod + def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ + param_list = ["r"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls( + r=params["r"], max_vectors=config["max_vectors"], name=config["name"] + ) + + @classmethod + def load_mat(cls, file_path): + with h5py.File(file_path, "r") as f: + r = np.asarray(f["r"], dtype="float32") + return cls(r=r) + + def save_mat(self, file_path): + with h5py.File(file_path, "w") as f: + f.create_dataset("r", data=self.r) + + @staticmethod + def filter_args(**kwargs): + valid_args = ("max_vectors", "name") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is None: + p1 = "--" + else: + p1 = "--" + prefix + "." + + parser.add_argument( + p1 + "max-vectors", + default=None, + type=int, + help=("maximum number of background vectors"), + ) + + parser.add_argument(p1 + "name", default="gauss") + + add_arparse_args = add_class_args diff --git a/hyperion/np/transforms/lda.py b/hyperion/np/transforms/lda.py new file mode 100644 index 00000000..b7a50f80 --- /dev/null +++ b/hyperion/np/transforms/lda.py @@ -0,0 +1,195 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import h5py +import numpy as np +import scipy.linalg as la +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ..np_model import NPModel +from .sb_sw import SbSw + + +class LDA(NPModel): + """Class to do linear discriminant analysis. + + Attributes: + mu: data mean vector + T: LDA projection. + lda_dim: LDA dimension. + update_mu: whether or not to update the mean when training. + update_T: wheter or not to update T when training. + """ + + def __init__( + self, mu=None, T=None, lda_dim=None, update_mu=True, update_T=True, **kwargs + ): + super().__init__(**kwargs) + self.mu = mu + self.T = T + if T is None: + self.lda_dim = lda_dim + else: + self.lda_dim = T.shape[1] + self.update_mu = update_mu + self.update_T = update_T + + def __call__(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def predict(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. 
+ """ + if self.mu is not None: + x = x - self.mu + return np.dot(x, self.T) + + def fit(self, x, y, mu=None, Sb=None, Sw=None): + """Trains the model. + + Args: + x: training data samples with shape (num_samples, x_dim). + y: training labels as integers in [0, num_classes-1] with shape (num_samples,) + mu: precomputed mean. + Sb: precomputed between-class covariance. + Sw: precomputed within-class covariance. + """ + if mu is None or Sb is None or Sw is None: + sbsw = SbSw() + sbsw.fit(x, y) + mu = sbsw.mu + Sb = sbsw.Sb + Sw = sbsw.Sw + + if self.update_mu: + self.mu = mu + + if not self.update_T: + return + + assert Sb.shape == Sw.shape + + try: + d, V = la.eigh(Sb, Sw) + except: + alpha = 1e-2 * np.max(np.diag(Sw)) + d, V = la.eigh(Sb, alpha * np.eye(Sw.shape[0]) + Sw) + V = np.fliplr(V) + + p = V[0, :] < 0 + V[:, p] *= -1 + + if self.lda_dim is not None: + assert self.lda_dim <= V.shape[1] + V = V[:, : self.lda_dim] + + self.T = V + + def get_config(self): + """Returns the model configuration dict.""" + config = { + "lda_dim": self.lda_dim, + "update_mu": self.update_mu, + "update_t": self.update_T, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ + params = {"mu": self.mu, "T": self.T} + self._save_params_from_dict(f, params) + + @classmethod + def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ + param_list = ["mu", "T"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls(mu=params["mu"], T=params["T"], **config) + + @classmethod + def load_mat(cls, file_path): + with h5py.File(file_path, "r") as f: + mu = np.asarray(f["mu"], dtype="float32") + T = np.asarray(f["T"], dtype="float32") + return cls(mu, T) + + def save_mat(self, file_path): + with h5py.File(file_path, "w") as f: + f.create_dataset("mu", data=self.mu) + f.create_dataset("T", data=self.T) + + @staticmethod + def filter_args(**kwargs): + valid_args = ("update_mu", "update_T", "name", "lda_dim") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--update-mu", + default=True, + action=ActionYesNo, + help=("updates centering parameter"), + ) + parser.add_argument( + "--update-T", + default=True, + action=ActionYesNo, + help=("updates projection parameter"), + ) + + parser.add_argument( + "--lda-dim", required=True, help=("output dimension of LDA") + ) + + parser.add_argument("--name", dest="name", default="lda") + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) diff --git a/hyperion/np/transforms/lnorm.py b/hyperion/np/transforms/lnorm.py new file mode 100644 index 00000000..302dedbe --- /dev/null +++ b/hyperion/np/transforms/lnorm.py @@ -0,0 +1,24 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import h5py +import numpy as np + +from .cent_whiten import CentWhiten + + +class LNorm(CentWhiten): + """Class to do length normalization. + + Attributes: + mu: data mean vector + T: whitening projection. 
+      update_mu: whether or not to update the mean when training.
+      update_T: whether or not to update T when training.
+    """
+
+    def predict(self, x):
+        """Centers, whitens and length-normalizes the data.
+
+        Args:
+          x: data samples.
+
+        Returns:
+          Transformed data samples.
+        """
+        x = super().predict(x)
+        mx = np.sqrt(np.sum(x ** 2, axis=1, keepdims=True)) + 1e-10
+        return np.sqrt(x.shape[1]) * x / mx
diff --git a/hyperion/np/transforms/lnorm_up.py b/hyperion/np/transforms/lnorm_up.py
new file mode 100644
index 00000000..2f3c1baf
--- /dev/null
+++ b/hyperion/np/transforms/lnorm_up.py
@@ -0,0 +1,32 @@
+"""
+ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import h5py
+import numpy as np
+
+from .cent_whiten_up import CentWhitenUP
+
+
+class LNormUP(CentWhitenUP):
+    """Class to do Length Normalization with uncertainty propagation.
+
+    Attributes:
+      mu: data mean vector
+      T: whitening projection.
+      update_mu: whether or not to update the mean when training.
+      update_T: whether or not to update T when training.
+    """
+
+    def predict(self, x):
+        """Applies the transformation to the data.
+
+        Args:
+          x: data samples.
+
+        Returns:
+          Transformed data samples.
+        """
+        x = super().predict(x)
+        x_dim = int(x.shape[-1] / 2)
+        m_x = x[:, :x_dim]
+        s2_x = x[:, x_dim:]
+
+        mx2 = np.sum(m_x ** 2, axis=1, keepdims=True) + 1e-10
+        m_x /= np.sqrt(mx2)
+        s2_x /= mx2
+
+        return np.hstack((m_x, s2_x))
diff --git a/hyperion/np/transforms/mvn.py b/hyperion/np/transforms/mvn.py
new file mode 100644
index 00000000..f8154148
--- /dev/null
+++ b/hyperion/np/transforms/mvn.py
@@ -0,0 +1,96 @@
+"""
+ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import h5py
+import numpy as np
+import scipy.linalg as la
+
+from ..np_model import NPModel
+
+
+class MVN(NPModel):
+    """Class to do global mean and variance normalization.
+
+    Attributes:
+      mu: data mean vector
+      s: standard deviation vector.
+
+    """
+
+    def __init__(self, mu=None, s=None, **kwargs):
+        super().__init__(**kwargs)
+        self.mu = mu
+        self.s = s
+
+    def __call__(self, x):
+        """Applies the transformation to the data.
+
+        Args:
+          x: data samples.
+
+        Returns:
+          Transformed data samples.
+        """
+        return self.predict(x)
+
+    def forward(self, x):
+        """Applies the transformation to the data.
+
+        Args:
+          x: data samples.
+
+        Returns:
+          Transformed data samples.
+        """
+        return self.predict(x)
+
+    def predict(self, x):
+        """Applies the transformation to the data.
+
+        Args:
+          x: data samples.
+
+        Returns:
+          Transformed data samples.
+        """
+        if self.mu is not None:
+            x = x - self.mu
+        if self.s is not None:
+            x = x / self.s
+        return x
+
+    def fit(self, x):
+        """Trains the model.
+
+        Args:
+          x: training data samples with shape (num_samples, x_dim).
+        """
+        self.mu = np.mean(x, axis=0)
+        self.s = np.std(x, axis=0)
+
+    def save_params(self, f):
+        """Saves the model parameters into the file.
+
+        Args:
+          f: file handle.
+        """
+        params = {"mu": self.mu, "s": self.s}
+        self._save_params_from_dict(f, params)
+
+    @classmethod
+    def load_params(cls, f, config):
+        """Initializes the model from the configuration and loads the model
+        parameters from file.
+
+        Args:
+          f: file handle.
+          config: configuration dictionary.
+
+        Returns:
+          Model object.
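+
+        Example (illustrative, following the pattern of SbSw.load;
+        "mvn.h5" is a made-up path)::
+
+            with h5py.File("mvn.h5", "r") as f:
+                config = MVN.load_config_from_json(f["config"])
+                mvn = MVN.load_params(f, config)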
+ """ + param_list = ["mu", "s"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls(mu=params["mu"], s=params["s"], name=config["name"]) diff --git a/hyperion/np/transforms/nap.py b/hyperion/np/transforms/nap.py new file mode 100644 index 00000000..c826e887 --- /dev/null +++ b/hyperion/np/transforms/nap.py @@ -0,0 +1,121 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import h5py +import numpy as np +import scipy.linalg as la + +from ..np_model import NPModel + + +class NAP(NPModel): + """Class to do nuissance attribute projection. + + Attributes: + U: NAP projection. + """ + + def __init__(self, U=None, U_dim=None, **kwargs): + super().__init__(**kwargs) + self.U = U + if U is None: + self.U_dim = U_dim + else: + self.U_dim = U.shape[0] + + def __call__(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def predict(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return x - np.dot(np.dot(x, self.U.T), self.U) + + def fit(self, x, y): + """Trains the model. + + Args: + x: training data samples with shape (num_samples, x_dim). + y: training labels as integers in [0, num_classes-1] with shape (num_samples,) + """ + u_ids = np.unique(y) + M = np.sqrt(len(u_ids)) + for i in u_ids: + idx = y == i + N = np.sqrt(len(idx)) + mu_i = np.mean(x[idx, :], axis=0) + xx[idx, :] = (x[idx, :] - mu_i) / N + xx /= M + _, s, Vt = np.svd(xx, full_matrices=False, overwrite_a=True) + idx = (np.argsort(s)[::-1])[: self.U_dim] + self.U = Vt[idx, :] + + def get_config(self): + """Returns the model configuration dict.""" + config = { + "U_dim": self.U_dim, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ + params = {"U": self.U} + self._save_params_from_dict(f, params) + + @classmethod + def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. 
+ """ + param_list = ["U"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls(U=params["U"], name=config["name"]) + + @classmethod + def load_mat(cls, file_path): + with h5py.File(file_path, "r") as f: + U = np.asarray(f["U"], dtype="float32") + return cls(U) + + def save_mat(self, file_path): + with h5py.File(file_path, "w") as f: + f.create_dataset("U", data=self.U) diff --git a/hyperion/np/transforms/nda.py b/hyperion/np/transforms/nda.py new file mode 100644 index 00000000..13fe6aef --- /dev/null +++ b/hyperion/np/transforms/nda.py @@ -0,0 +1,157 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import h5py +import numpy as np +import scipy.linalg as la + +from ...hyp_defs import float_cpu +from ..np_model import NPModel +from .sb_sw import NSbSw + + +class NDA(NPModel): + """Class to do nearest-neighbors discriminant analysis + + Attributes: + mu: data mean vector + T: NDA projection. + """ + + def __init__( + self, mu=None, T=None, nda_dim=None, update_mu=True, update_T=True, **kwargs + ): + super().__init__(**kwargs) + self.mu = mu + self.T = T + if T is None: + self.nda_dim = nda_dim + else: + self.nda_dim = T.shape[1] + self.update_mu = update_mu + self.update_T = update_T + + def __call__(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def predict(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + if self.mu is not None: + x = x - self.mu + return np.dot(x, self.T) + + def fit(self, x, y, mu=None, Sb=None, Sw=None): + """Trains the model. + + Args: + x: training data samples with shape (num_samples, x_dim). + y: training labels as integers in [0, num_classes-1] with shape (num_samples,) + mu: precomputed mean. + Sb: precomputed between-class covariance. + Sw: precomputed within-class covariance. + """ + if mu is None or Sb is None or Sw is None: + sbsw = NSbSw() + sbsw.fit(x, y) + mu = sbsw.mu + Sb = sbsw.Sb + Sw = sbsw.Sw + + if self.update_mu: + self.mu = mu + + if not self.update_T: + return + + assert Sb.shape == Sw.shape + + try: + d, V = la.eigh(Sb, Sw) + except: + alpha = 1e-2 * np.max(np.diag(Sw)) + d, V = la.eigh(Sb, alpha * np.eye(Sw.shape[0]) + Sw) + V = np.fliplr(V) + + p = V[0, :] < 0 + V[:, p] *= -1 + + if self.nda_dim is not None: + assert self.nda_dim <= V.shape[1] + V = V[:, : self.nda_dim] + + self.T = V + + def get_config(self): + """Returns the model configuration dict.""" + config = { + "nda_dim": self.nda_dim, + "update_mu": self.update_mu, + "update_t": self.update_T, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ + params = {"mu": self.mu, "T": self.T} + self._save_params_from_dict(f, params) + + @classmethod + def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. 
+ """ + param_list = ["mu", "T"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls(mu=params["mu"], T=params["T"], **config) + + @classmethod + def load_mat(cls, file_path): + with h5py.File(file_path, "r") as f: + mu = np.asarray(f["mu"], dtype="float32") + T = np.asarray(f["T"], dtype="float32") + return cls(mu, T) + + def save_mat(self, file_path): + with h5py.File(file_path, "w") as f: + f.create_dataset("mu", data=self.mu) + f.create_dataset("T", data=self.T) diff --git a/hyperion/np/transforms/pca.py b/hyperion/np/transforms/pca.py new file mode 100644 index 00000000..98b6c192 --- /dev/null +++ b/hyperion/np/transforms/pca.py @@ -0,0 +1,267 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import h5py +import numpy as np +import scipy.linalg as la +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from numpy.linalg import matrix_rank + +from ..np_model import NPModel + + +class PCA(NPModel): + """Class to do principal component analysis + + Attributes: + mu: data mean vector + T: LDA projection. + update_mu: whether or not to update the mean when training. + update_T: wheter or not to update T when training. + pca_dim: pca dimension (optional). + pca_var_r: pca variance ratio to retain, overrides pca_dim (optional). + pca_min_dim: minimum dimension of PCA when using pca_var_r. + whiten: whitens the data after PCA. + """ + + def __init__( + self, + mu=None, + T=None, + update_mu=True, + update_T=True, + pca_dim=None, + pca_var_r=None, + pca_min_dim=2, + whiten=False, + **kwargs + ): + super().__init__(**kwargs) + self.mu = mu + self.T = T + self.update_mu = update_mu + self.update_T = update_T + self.pca_dim = pca_dim + self.pca_var_r = pca_var_r + self.pca_min_dim = pca_min_dim + self.whiten = whiten + + def __call__(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def predict(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + if self.mu is not None: + x = x - self.mu + return np.dot(x, self.T) + + @staticmethod + def get_pca_dim_for_var_ratio(x, var_r=1, min_dim=2): + if var_r == 1: + rank = matrix_rank(x) + if rank <= min_dim: + # it may have failed, let's try the cov + rank = matrix_rank(np.dot(x.T, x)) + else: + sv = la.svd(x, compute_uv=False) + Ecc = np.cumsum(sv**2) + Ecc = Ecc / Ecc[-1] + rank = np.where(Ecc > var_r)[0][0] + + rank = max(min_dim, rank) + return rank + + def fit(self, x=None, mu=None, S=None): + """Trains the model. + + Args: + x: training data samples with shape (num_samples, x_dim). + y: training labels as integers in [0, num_classes-1] with shape (num_samples,) + mu: precomputed mean. + S: precomputed total covariance. 
+ """ + if x is not None: + mu = np.mean(x, axis=0) + delta = x - mu + S = np.dot(delta.T, delta) / x.shape[0] + + if self.update_mu: + self.mu = mu + + if self.update_T: + d, V = la.eigh(S) + d = np.flip(d) + V = np.fliplr(V) + + # This makes the Transform unique + p = V[0, :] < 0 + V[:, p] *= -1 + + if self.pca_var_r is not None: + var_acc = np.cumsum(d) + var_r = var_acc / var_acc[-1] + self.pca_dim = max( + np.where(var_r > self.pca_var_r)[0][0], self.pca_min_dim + ) + + if self.whiten: + # the projected features will be whitened + # do not whithen dimension with eigenvalue eq. to 0. + is_zero = d <= 0 + if np.any(is_zero): + max_dim = np.where(is_zero)[0][0] + V = V[:, :max_dim] * 1 / np.sqrt(d[:max_dim]) + if self.pca_dim is None: + self.pca_dim = max_dim + else: + self.pca_dim = min(max_dim, self.pca_dim) + else: + V = V * 1 / np.sqrt(d) + + if self.pca_dim is not None: + assert self.pca_dim <= V.shape[1] + V = V[:, : self.pca_dim] + + self.T = V + + def get_config(self): + """Returns the model configuration dict.""" + config = { + "update_mu": self.update_mu, + "update_t": self.update_T, + "pca_dim": self.pca_dim, + "pca_var_r": self.pca_var_r, + "pca_min_dim": self.pca_min_dim, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ + params = {"mu": self.mu, "T": self.T} + self._save_params_from_dict(f, params) + + @classmethod + def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ + param_list = ["mu", "T"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls( + mu=params["mu"], + T=params["T"], + **config, + ) + + @classmethod + def load_mat(cls, file_path): + with h5py.File(file_path, "r") as f: + mu = np.asarray(f["mu"], dtype="float32") + T = np.asarray(f["T"], dtype="float32") + return cls(mu, T) + + def save_mat(self, file_path): + with h5py.File(file_path, "w") as f: + f.create_dataset("mu", data=self.mu) + f.create_dataset("T", data=self.T) + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "update_mu", + "update_T", + "name", + "pca_dim", + "pca_var_r", + "pca_min_dim", + "whiten", + ) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--update-mu", + default=True, + action=ActionYesNo, + help=("updates centering parameter"), + ) + parser.add_argument( + "--update-T", + default=True, + action=ActionYesNo, + help=("updates whitening parameter"), + ) + parser.add_argument( + "--whiten", + default=False, + action=ActionYesNo, + help=("whitens the data after projection"), + ) + + parser.add_argument( + "--pca-dim", default=None, type=int, help=("output dimension of PCA") + ) + + parser.add_argument( + "--pca-var-r", + default=None, + type=float, + help=("proportion of variance to keep when choosing the PCA dimension"), + ) + + parser.add_argument( + "--pca-min-dim", default=2, type=int, help=("min. 
output dimension of PCA") + ) + + parser.add_argument("--name", dest="name", default="pca") + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) + + add_argparse_args = add_class_args diff --git a/hyperion/transforms/sb_sw.py b/hyperion/np/transforms/sb_sw.py similarity index 79% rename from hyperion/transforms/sb_sw.py rename to hyperion/np/transforms/sb_sw.py index 83c8d185..e182c8e6 100644 --- a/hyperion/transforms/sb_sw.py +++ b/hyperion/np/transforms/sb_sw.py @@ -2,18 +2,24 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import h5py - +import numpy as np import scipy.linalg as la from sklearn.neighbors import BallTree -from ..hyp_model import HypModel -from ..hyp_defs import float_cpu +from ...hyp_defs import float_cpu +from ..np_model import NPModel -class SbSw(HypModel): - """Class to compute between and within class matrices""" +class SbSw(NPModel): + """Class to compute between and within class covariance matrices. + + Args: + Sb: between-class cov. matrix. + Sw: within-class cov. matrix. + mu: data mean vector. + num_classes: number of classes. + """ def __init__(self, Sb=None, Sw=None, mu=None, num_classes=0, **kwargs): super(SbSw, self).__init__(**kwargs) @@ -22,7 +28,7 @@ def __init__(self, Sb=None, Sw=None, mu=None, num_classes=0, **kwargs): self.mu = None self.num_classes = num_classes - def fit(self, x, class_ids, sample_weight=None, class_weights=None, normalize=True): + def fit(self, x, class_ids, normalize=True): dim = x.shape[1] if self.Sb is None: self.Sb = np.zeros((dim, dim)) @@ -75,7 +81,7 @@ def save_params(self, f): @classmethod def load(cls, file_path): with h5py.File(file_path, "r") as f: - config = self.load_config_from_json(f["config"]) + config = cls.load_config_from_json(f["config"]) param_list = ["mu", "Sb", "Sw", "num_classes"] params = cls._load_params_to_dict(f, config["name"], param_list) kwargs = dict(list(config.items()) + list(params.items())) @@ -83,12 +89,26 @@ def load(cls, file_path): class NSbSw(SbSw): + """Class to compute nearest neighbour between and within class + covariance matrices. + https://www.isca-speech.org/archive/pdfs/interspeech_2014/sadjadi14_interspeech.pdf + + Args: + K: number of neighbours. + alpha: distance exponent that determines how fast the weight of the samples decays + when they get far from the classification boundary. + Sb: between-class cov. matrix. + Sw: within-class cov. matrix. + mu: data mean vector. + num_classes: number of classes. 
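+
+    Example (illustrative, the stats can be fed to NDA)::
+
+        sbsw = NSbSw(K=10, alpha=1)
+        sbsw.fit(x, class_ids)
+        nda = NDA(nda_dim=150)
+        nda.fit(x, class_ids, mu=sbsw.mu, Sb=sbsw.Sb, Sw=sbsw.Sw)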
+ """ + def __init__(self, K=10, alpha=1, **kwargs): - super(NSbSw, self).__init__(**kwargs) + super().__init__(**kwargs) self.K = K self.alpha = alpha - def fit(self, x, class_ids, sample_weight=None, class_weights=None, normalize=True): + def fit(self, x, class_ids, normalize=True): dim = x.shape[1] self.Sb = np.zeros((dim, dim), dtype=float_cpu()) self.Sw = np.zeros((dim, dim), dtype=float_cpu()) @@ -139,6 +159,7 @@ def normalize(self): self.Sw /= self.num_classes def get_config(self): + """Returns the model configuration dict.""" config = {"K": self.K, "alpha": self.alpha} base_config = super(NSbSw, self).get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/hyperion/transforms/skl_tsne.py b/hyperion/np/transforms/skl_tsne.py similarity index 84% rename from hyperion/transforms/skl_tsne.py rename to hyperion/np/transforms/skl_tsne.py index 048be0c7..fbff7df3 100644 --- a/hyperion/transforms/skl_tsne.py +++ b/hyperion/np/transforms/skl_tsne.py @@ -2,15 +2,14 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser import numpy as np - +from jsonargparse import ActionParser, ArgumentParser from sklearn.manifold import TSNE -from ..hyp_model import HypModel +from ..np_model import NPModel -class SklTSNE(HypModel): +class SklTSNE(NPModel): """Wrapper class for sklearn TSNE manifold learner Attributes: @@ -24,7 +23,7 @@ class SklTSNE(HypModel): metric: the metric to use when calculating distance between instances in ['cosine', 'euclidean', 'l1', 'l2', 'precomputed'] or callable function. init: initialization method in ['random', 'pca'] or embedding matrix of shape (num_samples, num_comp) verbose: verbosity level. - rng: RandomState instance + rng: default_rng instance rng_seed: seed for random number generator method: gradient calculation method in [‘barnes_hut’, 'exact'] angle: angle thetha in Barnes-Hut TSNE @@ -54,6 +53,7 @@ def __init__( super().__init__(**kwargs) self.rng_seed = rng_seed if rng is None: + #rng = np.random.default_rng(seed=rng_seed) rng = np.random.RandomState(seed=rng_seed) self._tsne = TSNE( @@ -121,10 +121,48 @@ def angle(self): def num_jobs(self): return self._tsne.n_jobs + def __call__(self, x): + """Trains and applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Trains and applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + def predict(self, x): + """Trains and applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ return self._tsne.fit_transform(x) def fit(self, x): + """Trains and applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ return self._tsne.fit_transform(x) def save_params(self, f): @@ -135,6 +173,7 @@ def load_params(cls, f, config): return cls(**config) def get_config(self): + """Returns the model configuration dict.""" config = { "tsne_dim": self.tsne_dim, "perplexity": self.perplexity, @@ -155,6 +194,11 @@ def get_config(self): @staticmethod def filter_args(**kwargs): + """Filters the arguments corresponding to this model from a dictionary. + + Returns + Dictionary containing valid options to initialize the model. 
+ """ valid_args = ( "tsne_dim", "perplexity", @@ -174,6 +218,12 @@ def filter_args(**kwargs): @staticmethod def add_class_args(parser, prefix=None): + """Adds model options to parser. + + Args: + parser: parser object. + prefix: prefix str to add to the argument names. + """ if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/np/transforms/transform_list.py b/hyperion/np/transforms/transform_list.py new file mode 100644 index 00000000..58da16eb --- /dev/null +++ b/hyperion/np/transforms/transform_list.py @@ -0,0 +1,111 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging + +import h5py +import numpy as np + +from ..np_model import NPModel +from .cent_whiten import CentWhiten +from .cent_whiten_up import CentWhitenUP +from .gaussianizer import Gaussianizer +from .lda import LDA +from .lnorm import LNorm +from .lnorm_up import LNormUP +from .mvn import MVN +from .nap import NAP +from .nda import NDA +from .pca import PCA + + +class TransformList(NPModel): + """Class to perform a sequence of transformations + + Attributes: + transforms: list of transformation objects. + """ + + def __init__(self, transforms, **kwargs): + super().__init__(**kwargs) + if not isinstance(transforms, list): + transforms = [transforms] + self.transforms = transforms + if transforms is not None: + self.update_names() + + def append(self, t): + """Appends a transformation to the list. + + Args: + t: transformation object. + """ + self.transforms.append(t) + if self.name is not None: + t.name = self.name + "/" + t.name + + def __call__(self, x): + """Applies the list of transformations to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Applies the list of transformations to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def predict(self, x): + """Applies the list of transformations to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. 
+ """ + for t in self.transforms: + x = t.predict(x) + return x + + def update_names(self): + if self.name is not None: + for t in self.transforms: + t.name = self.name + "/" + t.name + + def get_config(self): + config = super().get_config() + config_t = {} + for i in range(len(self.transforms)): + config_t[i] = self.transforms[i].get_config() + config["transforms"] = config_t + return config + + def save_params(self, f): + for t in self.transforms: + t.save_params(f) + + @classmethod + def load_params(cls, f, config): + config_ts = config["transforms"] + transforms = [] + for i in range(len(config_ts)): + config_t = config_ts[str(i)] + logging.debug(config_t) + class_t = globals()[config_t["class_name"]] + t = class_t.load_params(f, config_t) + transforms.append(t) + return cls(transforms, name=config["name"]) diff --git a/hyperion/pdfs/__init__.py b/hyperion/pdfs/__init__.py deleted file mode 100644 index 91af5497..00000000 --- a/hyperion/pdfs/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -from .core import * -from .mixtures import * -from .plda import * -from .jfa import * -from .hmm import * diff --git a/hyperion/pdfs/core/__init__.py b/hyperion/pdfs/core/__init__.py deleted file mode 100644 index 2defe6d4..00000000 --- a/hyperion/pdfs/core/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - - -from .pdf import PDF -from .exp_family import ExpFamily -from .normal_diag_cov import NormalDiagCov, DiagNormal -from .normal import Normal diff --git a/hyperion/pdfs/core/exp_family.py b/hyperion/pdfs/core/exp_family.py deleted file mode 100644 index 44fc172c..00000000 --- a/hyperion/pdfs/core/exp_family.py +++ /dev/null @@ -1,157 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np - -from abc import ABCMeta, abstractmethod -from .pdf import PDF - - -class ExpFamily(PDF): - __metaclass__ = ABCMeta - - def __init__(self, eta=None, **kwargs): - super(ExpFamily, self).__init__(**kwargs) - self.eta = eta - self.A = None - - @property - def is_init(self): - if not self._is_init: - self._compute_nat_std() - if self.eta is not None and self.A is not None: - self.validate() - self._is_init = True - return self._is_init - - def fit( - self, x, sample_weight=None, x_val=None, sample_weight_val=None, batch_size=None - ): - - N, u_x = self.Estep(x=x, sample_weight=sample_weight, batch_size=batch_size) - self.Mstep(N, u_x) - elbo = self.elbo(x, N=N, u_x=u_x) - elbo = [elbo, elbo / N] - - if x_val is not None: - N, u_x = self.Estep( - x=x_val, sample_weight=sample_weight_val, batch_size=batch_size - ) - elbo_val = self.elbo(x_val, N=N, u_x=u_x) - elbo += [elbo_val, elbo_val / N] - return elbo - - def log_h(self, x): - return 0 - - def accum_log_h(self, x, sample_weight=None): - if sample_weight is None: - return np.sum(self.log_h(x)) - return np.sum(sample_weight * self.log_h(x)) - - def compute_suff_stats(self, x): - return x - - def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None): - if u_x is not None or batch_size is None: - return self._accum_suff_stats_1batch(x, u_x, sample_weight) - else: - return self._accum_suff_stats_nbatches(x, sample_weight, batch_size) - - def _accum_suff_stats_1batch(self, x, u_x=None, 
sample_weight=None): - if u_x is None: - u_x = self.compute_suff_stats(x) - if sample_weight is None: - N = u_x.shape[0] - else: - u_x *= sample_weight[:, None] - N = np.sum(sample_weight) - acc_u_x = np.sum(u_x, axis=0) - return N, acc_u_x - - def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size): - sw_i = None - for i1 in range(0, x.shape[0], batch_size): - i2 = np.minimum(i1 + batch_size, x.shape[0]) - x_i = x[i1:i2, :] - if sample_weight is not None: - sw_i = sample_weight[i1:i2] - N_i, u_x_i = self._accum_suff_stats_1batch(x_i, sample_weight=sw_i) - if i1 == 0: - N = N_i - u_x = u_x_i - else: - N += N_i - u_x += u_x_i - return N, u_x - - def add_suff_stats(self, N, u_x): - assert len(N) == len(u_x) - acc_N = N[1] - acc_u_x = u_x[1] - for i in range(1, len(N)): - acc_N += N - acc_u_x += u[i] - return acc_N, acc_u_x - - def Estep(self, x, u_x=None, sample_weight=None, batch_size=None): - return self.accum_suff_stats(x, u_x, sample_weight, batch_size) - - @abstractmethod - def Mstep(self, stats): - pass - - def elbo(self, x, u_x=None, N=1, log_h=None, sample_weight=None, batch_size=None): - assert self.is_init - if u_x is None: - N, u_x = self.accum_suff_stats( - x, sample_weight=sample_weight, batch_size=batch_size - ) - if log_h is None: - log_h = self.accum_log_h(x, sample_weight=sample_weight) - return log_h + np.inner(u_x, self.eta) - N * self.A - - def log_prob(self, x, u_x=None, method="nat"): - if method == "nat": - return self.log_prob_nat(x, u_x) - else: - return self.log_prob_std(x) - - def log_prob_nat(self, x, u_x=None): - assert self.is_init - if u_x is None: - u_x = self.compute_suff_stats(x) - return self.log_h(x) + np.inner(u_x, self.eta) - self.A - - @staticmethod - def compute_A_nat(eta): - raise NotImplementedError() - - @staticmethod - def compute_A_std(params): - raise NotImplementedError() - - @staticmethod - def compute_eta(param): - raise NotImplementedError() - - @staticmethod - def compute_std(eta): - raise NotImplementedError() - - @abstractmethod - def _compute_nat_params(self): - pass - - @abstractmethod - def _compute_std_params(self): - pass - - def _compute_nat_std(self): - pass - - @abstractmethod - def validate(self): - pass diff --git a/hyperion/pdfs/core/normal.py b/hyperion/pdfs/core/normal.py deleted file mode 100644 index b1ff4224..00000000 --- a/hyperion/pdfs/core/normal.py +++ /dev/null @@ -1,322 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np -import h5py -import scipy.linalg as la -from scipy.special import erf - -from ...hyp_defs import float_cpu -from ...utils.plotting import ( - plot_gaussian_1D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D, - plot_gaussian_3D, -) -from ...utils.math import ( - invert_pdmat, - invert_trimat, - symmat2vec, - vec2symmat, - fullcov_varfloor, - logdet_pdmat, -) - -from .exp_family import ExpFamily - - -class Normal(ExpFamily): - def __init__( - self, - mu=None, - Lambda=None, - var_floor=1e-5, - update_mu=True, - update_Lambda=True, - **kwargs - ): - super(Normal, self).__init__(**kwargs) - self.mu = mu - self.Lambda = Lambda - self.var_floor = var_floor - self.update_mu = update_mu - self.update_Lambda = update_Lambda - - self._compute_nat_std() - - self._logLambda = None - self._cholLambda = None - self._Sigma = None - - def _compute_nat_std(self): - if self.mu is not None and self.Lambda is not None: - self._validate_mu() - self._validate_Lambda() - 
self._compute_nat_params() - elif self.eta is not None: - self._validate_eta() - self.A = self.compute_A_nat(self.eta) - self._compute_std_params() - - @property - def logLambda(self): - if self._logLambda is None: - assert self.is_init - f, L, logL = invert_pdmat(self.Lambda, return_logdet=True) - self._logLambda = logL - self._cholLambda = L.T - return self._logLambda - - @property - def cholLambda(self): - if self._cholLambda is None: - assert self.is_init - f, L, logL = invert_pdmat(self.Lambda, return_logdet=True) - self._logLambda = logL - self._cholLambda = L.T - return self._cholLambda - - @property - def Sigma(self): - if self._Sigma is None: - assert self.is_init - self._Sigma = invert_pdmat(self.Lambda, return_inv=True)[-1] - return self._Sigma - - def initialize(self): - self.validate() - self._compute_nat_std() - - def stack_suff_stats(self, F, S=None): - if S is None: - return F - return np.hstack((F, S)) - - def unstack_suff_stats(self, stats): - F = stats[: self.x_dim] - S = stats[self.x_dim :] - return F, S - - def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None): - if u_x is None: - if sample_weight is None: - N = x.shape[0] - F = np.sum(x, axis=0) - S = symmat2vec(np.dot(x.T, x)) - else: - N = np.sum(sample_weight) - wx = sample_weight[:, None] * x - F = np.sum(wx, axis=0) - S = symmat2vec(np.dot(wx.T, x)) - return N, self.stack_suff_stats(F, S) - else: - return self._accum_suff_stats_1batch(x, u_x, sample_weight) - - def norm_suff_stats(self, N, u_x, return_order2=False): - assert self.is_init - - F, S = self.unstack_suff_stats(u_x) - F_norm = np.dot(F - N * self.mu, self.cholLambda.T) - if return_order2: - SS = vec2symat(S) - Fmu = np.outer(self.F, self.mu) - SS = SS - Fmu - Fmu.T + N * np.outer(self.mu, self.mu) - SS = np.dot(self.cholLambda, np.dot(SS, self.cholLambda.T)) - S = symmat2vec(SS) - return N, self.stack_suff_stats(F_norm, S) - return N, F_norm - - def Mstep(self, N, u_x): - - F, S = self.unstack_suff_stats(u_x) - - if self.update_mu: - self.mu = F / N - - if self.update_Lambda: - S = vec2symmat(S / N) - S -= np.outer(self.mu, self.mu) - # S = fullcov_varfloor(S, self.var_floor) - self.Lambda = invert_pdmat(S, return_inv=True)[-1] - self._Sigma = None - self._logLambda = None - self._cholLambda = None - - self._compute_nat_params() - - def log_prob_std(self, x): - assert self.is_init - mah_dist2 = np.sum(np.dot(x - self.mu, self.cholLambda) ** 2, axis=1) - return ( - 0.5 * self.logLambda - - 0.5 * self.x_dim * np.log(2 * np.pi) - - 0.5 * mah_dist2 - ) - - # def eval_logcdf(self, x): - # delta = np.dot((x-self.mu), self.cholLambda) - # lk = 0.5*(1+erf(delta/np.sqrt(2))) - # print(x-self.mu) - # print(la.cholesky(self.Lambda,lower=True)) - # print(self.cholLambda) - # print(delta) - # print(lk) - # return np.sum(np.log(lk+1e-20), axis=-1) - - def sample(self, num_samples, rng=None, seed=1024): - assert self.is_init - - if rng is None: - rng = np.random.RandomState(seed) - return rng.multivariate_normal(self.mu, self.Sigma, size=(num_samples,)).astype( - float_cpu() - ) - # x=rng.normal(size=(num_samples, self.x_dim)) - # cholS=la.cholesky(self.Sigma, lower=False, overwrite_a=True) - # return self.mu+np.dot(x, cholS) - - def get_config(self): - config = { - "var_floor": self.var_floor, - "update_mu": self.update_mu, - "update_lambda": self.update_Lambda, - } - base_config = super(Normal, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - - assert self.is_init - - params = 
{"mu": self.mu, "Lambda": self.Lambda} - self._save_params_from_dict(f, params) - - @classmethod - def load_params(cls, f, config): - param_list = ["mu", "Lambda"] - params = self._load_params_to_dict(f, config["name"], param_list) - return cls( - x_dim=config["x_dim"], - mu=params["mu"], - Lambda=params["Lambda"], - var_floor=config["var_floor"], - update_mu=config["update_mu"], - update_Lambda=config["update_lambda"], - name=config["name"], - ) - - def _validate_mu(self): - assert self.mu.shape[0] == self.x_dim - - def _validate_Lambda(self): - assert self.Lambda.shape == (self.x_dim, self.x_dim) - - def _validate_eta(self): - assert self.eta.shape[0] == (self.x_dim ** 2 + 3 * self.x_dim) / 2 - - def validate(self): - if self.mu is not None and self.Lambda is not None: - self._validate_mu() - self._validate_Lambda() - - if self.eta is not None: - self._validate_eta() - - @staticmethod - def compute_eta(mu, Lambda): - Lmu = np.dot(mu, Lambda) - eta = np.hstack((Lmu, -symmat2vec(Lambda, diag_factor=0.5))) - return eta - - @staticmethod - def compute_x_dim_from_eta(eta): - x_dim = 0.5 * (-3 + np.sqrt(9 + 8 * eta.shape[-1])) - assert int(x_dim) == x_dim - return int(x_dim) - - @staticmethod - def compute_std(eta): - x_dim = Normal.compute_x_dim_from_eta(eta) - eta1 = eta[:x_dim] - eta2 = vec2symmat(eta[x_dim:], diag_factor=2) / 2 - Lambda = -2 * eta2 - f = invert_pdmat(-eta2, right_inv=True)[0] - mu = 0.5 * f(eta1) - return mu, Lambda - - @staticmethod - def compute_A_nat(eta): - x_dim = Normal.compute_x_dim_from_eta(eta) - eta1 = eta[:x_dim] - eta2 = vec2symmat(eta[x_dim:], diag_factor=2) / 2 - f, _, log_minus_eta2 = invert_pdmat(-eta2, right_inv=True, return_logdet=True) - r1 = 0.5 * x_dim * np.log(2 * np.pi) - r2 = 0.25 * np.inner(f(eta1), eta1) - r3 = -0.5 * x_dim * np.log(2) - 0.5 * log_minus_eta2 - return r1 + r2 + r3 - - @staticmethod - def compute_A_std(mu, Lambda): - x_dim = mu.shape[0] - r1 = 0.5 * x_dim * np.log(2 * np.pi) - r2 = -0.5 * logdet_pdmat(Lambda) - r3 = 0.5 * np.inner(np.dot(mu, Lambda), mu) - return r1 + r2 + r3 - - def _compute_nat_params(self): - self.eta = self.compute_eta(self.mu, self.Lambda) - self.A = self.compute_A_std(self.mu, self.Lambda) - # self.A = self.compute_A_nat(self.eta) - # Lmu = np.dot(self.Lambda, self.mu[:, None]) - # muLmu = np.dot(self.mu, Lmu) - # lnr = 0.5*self.lnLambda - 0.5*self.x_dim*np.log(2*np.pi)-0.5*muLmu - # Lambda=np.copy(self.Lambda) - # Lambda[np.diag_indices(self.x_dim)] /= 2 - # self.eta=np.vstack((lnr, Lmu, symmat2vec(Lambda)[:, None])) - - def _compute_std_params(self): - self.mu, self.Lambda = self.compute_std(self.eta) - self._cholLambda = None - self._logLambda = None - self._Sigma = None - - @staticmethod - def compute_suff_stats(x): - d = x.shape[1] - u = np.zeros((x.shape[0], int(d + d * (d + 1) / 2)), dtype=float_cpu()) - u[:, :d] = x - k = d - for i in range(d): - for j in range(i, d): - u[:, k] = x[:, i] * x[:, j] - k += 1 - return u - - def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs): - assert self.is_init - mu = self.mu[feat_idx] - C = invert_pdmat(self.Lambda, return_inv=True)[-1][feat_idx, feat_idx] - plot_gaussian_1D(mu, C, num_sigmas, num_pts, **kwargs) - - def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - assert self.is_init - mu = self.mu[feat_idx] - j, i = np.meshgrid(feat_idx, feat_idx) - C = invert_pdmat(self.Lambda, return_inv=True)[-1][i, j] - plot_gaussian_ellipsoid_2D(mu, C, num_sigmas, num_pts, **kwargs) - - def plot3D(self, feat_idx=[0, 1], num_sigmas=2, 
num_pts=100, **kwargs): - assert self.is_init - mu = self.mu[feat_idx] - j, i = np.meshgrid(feat_idx, feat_idx) - C = invert_pdmat(self.Lambda, return_inv=True)[-1][i, j] - plot_gaussian_3D(mu, C, num_sigmas, num_pts, **kwargs) - - def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs): - assert self.is_init - mu = self.mu[feat_idx] - j, i = np.meshgrid(feat_idx, feat_idx) - C = invert_pdmat(self.Lambda, return_inv=True)[-1][i, j] - plot_gaussian_ellipsoid_3D(mu, C, num_sigmas, num_pts, **kwargs) diff --git a/hyperion/pdfs/core/normal_diag_cov.py b/hyperion/pdfs/core/normal_diag_cov.py deleted file mode 100644 index 562d3899..00000000 --- a/hyperion/pdfs/core/normal_diag_cov.py +++ /dev/null @@ -1,264 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np -import h5py -from scipy.special import erf - -# import matplotlib.pyplot as plt -# import matplotlib.mlab as mlab - -from ...hyp_defs import float_cpu -from ...utils.plotting import ( - plot_gaussian_1D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D, - plot_gaussian_3D, -) - -from .exp_family import ExpFamily - - -class NormalDiagCov(ExpFamily): - def __init__( - self, - mu=None, - Lambda=None, - var_floor=1e-5, - update_mu=True, - update_Lambda=True, - **kwargs - ): - super(NormalDiagCov, self).__init__(**kwargs) - self.mu = mu - self.Lambda = Lambda - self.var_floor = var_floor - self.update_mu = update_mu - self.update_Lambda = update_Lambda - - self._compute_nat_std() - - self._logLambda = None - self._cholLambda = None - self._Sigma = None - - def _compute_nat_std(self): - if self.mu is not None and self.Lambda is not None: - self._validate_mu() - self._validate_Lambda() - self._compute_nat_params() - elif self.eta is not None: - self._validate_eta() - self.A = self.compute_A_nat(self.eta) - self._compute_std_params() - - @property - def logLambda(self): - if self._logLambda is None: - assert self.is_init - self._logLambda = np.sum(np.log(self.Lambda)) - return self._logLambda - - @property - def cholLambda(self): - if self._cholLambda is None: - assert self.is_init - self._cholLambda = np.sqrt(self.Lambda) - return self._cholLambda - - @property - def Sigma(self): - if self._Sigma is None: - assert self.is_init - self._Sigma = 1.0 / self.Lambda - return self._Sigma - - def initialize(self): - self.validate() - self._compute_nat_std() - assert self.is_init - - def stack_suff_stats(self, F, S=None): - if S is None: - return F - return np.hstack((F, S)) - - def unstack_suff_stats(self, stats): - F = stats[: self.x_dim] - S = stats[self.x_dim :] - return F, S - - def norm_suff_stats(self, N, u_x=None, return_order2=False): - assert self.is_init - F, S = self.unstack_suff_stats(u_x) - F_norm = self.cholLambda * (F - N * self.mu) - if return_order2: - S = S - 2 * self.mu * F + N * self.mu ** 2 - S *= self.Lambda - return N, self.stack_suff_stats(F_norm, S) - return N, F_norm - - def Mstep(self, N, u_x): - - F, S = self.unstack_suff_stats(u_x) - - if self.update_mu: - self.mu = F / N - - if self.update_Lambda: - S = S / N - self.mu ** 2 - S[S < self.var_floor] = self.var_floor - self.Lambda = 1 / S - self._Sigma = S - self._cholLambda = None - self._logLambda = None - - self._compute_nat_params() - - def log_prob_std(self, x): - assert self.is_init - mah_dist2 = np.sum(((x - self.mu) * self.cholLambda) ** 2, axis=1) - return ( - 0.5 * self.logLambda - - 0.5 * self.x_dim * np.log(2 * np.pi) 
- - 0.5 * mah_dist2 - ) - - def log_cdf(self, x): - assert self.is_init - delta = (x - self.mu) * self.cholLambda - lk = 0.5 * (1 + erf(delta / np.sqrt(2))) - return np.sum(np.log(lk + 1e-10), axis=-1) - - def sample(self, num_samples, rng=None, seed=1024): - assert self.is_init - if rng is None: - rng = np.random.RandomState(seed) - x = rng.normal(size=(num_samples, self.x_dim)).astype(float_cpu()) - return self.mu + 1.0 / self.cholLambda * x - - def get_config(self): - config = { - "var_floor": self.var_floor, - "update_mu": self.update_mu, - "update_lambda": self.update_Lambda, - } - base_config = super(NormalDiagCov, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - assert self.is_init - params = {"mu": self.mu, "Lambda": self.Lambda} - self._save_params_from_dict(f, params) - - @classmethod - def load_params(cls, f, config): - param_list = ["mu", "Lambda"] - params = self._load_params_to_dict(f, config["name"], param_list) - return cls( - x_dim=config["x_dim"], - mu=params["mu"], - Lambda=params["Lambda"], - var_floor=config["var_floor"], - update_mu=config["update_mu"], - update_Lambda=config["update_lambda"], - name=config["name"], - ) - - def _validate_mu(self): - assert self.mu.shape[0] == self.x_dim - - def _validate_Lambda(self): - assert self.Lambda.shape[0] == self.x_dim - assert np.all(self.Lambda > 0) - - def _validate_eta(self): - assert self.eta.shape[0] == self.x_dim * 2 - - def validate(self): - if self.mu is not None and self.Lambda is not None: - self._validate_mu() - self._validate_Lambda() - - if self.eta is not None: - self._validate_eta() - - @staticmethod - def compute_eta(mu, Lambda): - Lmu = Lambda * mu - eta = np.hstack((Lmu, -0.5 * Lambda)) - return eta - - @staticmethod - def compute_std(eta): - x_dim = int(eta.shape[0] / 2) - eta1 = eta[:x_dim] - eta2 = eta[x_dim:] - mu = -0.5 * eta1 / eta2 - Lambda = -2 * eta2 - return mu, Lambda - - @staticmethod - def compute_A_nat(eta): - x_dim = int(eta.shape[0] / 2) - eta1 = eta[:x_dim] - eta2 = eta[x_dim:] - r1 = 0.5 * x_dim * np.log(2 * np.pi) - r2 = -1 / 4 * np.sum(eta1 * eta1 / eta2) - r3 = -1 / 2 * np.sum(np.log(-2 * eta2)) - return r1 + r2 + r3 - - @staticmethod - def compute_A_std(mu, Lambda): - x_dim = mu.shape[0] - r1 = 0.5 * x_dim * np.log(2 * np.pi) - r2 = -0.5 * np.sum(np.log(Lambda)) - r3 = 0.5 * np.sum(mu * mu * Lambda) - return r1 + r2 + r3 - - def _compute_nat_params(self): - self.eta = self.compute_eta(self.mu, self.Lambda) - self.A = self.compute_A_nat(self.eta) - # Lmu = self.Lambda*self.mu - # muLmu = np.sum(self.mu*Lmu) - # lnr = 0.5*self.lnLambda - 0.5*self.x_dim*np.log(2*np.pi)-0.5*muLmu - # self.eta=np.hstack((lnr, Lmu, -0.5*self.Lambda)).T - - def _compute_std_params(self): - self.mu, self.Lambda = self.compute_std(self.eta) - self._cholLambda = None - self._logLambda = None - self._Sigma = None - - @staticmethod - def compute_suff_stats(x): - d = x.shape[1] - u = np.zeros((x.shape[0], 2 * d), dtype=float_cpu()) - u[:, :d] = x - u[:, d:] = x * x - return u - - def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs): - mu = self.mu[feat_idx] - C = 1 / self.Lambda[feat_idx] - plot_gaussian_1D(mu, C, num_sigmas, num_pts, **kwargs) - - def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - mu = self.mu[feat_idx] - C = np.diag(1.0 / self.Lambda[feat_idx]) - plot_gaussian_ellipsoid_2D(mu, C, num_sigmas, num_pts, **kwargs) - - def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - mu = 
self.mu[feat_idx] - C = np.diag(1.0 / self.Lambda[feat_idx]) - plot_gaussian_3D(mu, C, num_sigmas, num_pts, **kwargs) - - def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs): - mu = self.mu[feat_idx] - C = np.diag(1.0 / self.Lambda[feat_idx]) - plot_gaussian_ellipsoid_3D(mu, C, num_sigmas, num_pts, **kwargs) - - -DiagNormal = NormalDiagCov diff --git a/hyperion/pdfs/core/pdf.py b/hyperion/pdfs/core/pdf.py deleted file mode 100644 index 2764780c..00000000 --- a/hyperion/pdfs/core/pdf.py +++ /dev/null @@ -1,36 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np - -from abc import ABCMeta, abstractmethod -from ...hyp_model import HypModel - - -class PDF(HypModel): - __metaclass__ = ABCMeta - - def __init__(self, x_dim=1, **kwargs): - super(PDF, self).__init__(**kwargs) - self.x_dim = x_dim - - def get_config(self): - config = {"x_dim": self.x_dim} - base_config = super(PDF, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - @abstractmethod - def log_prob(self, x): - pass - - def eval_llk(self, x): - return self.log_prob(x) - - @abstractmethod - def sample(self, num_samples): - pass - - def generate(self, num_samples, **kwargs): - return self.sample(num_samples, **kwargs) diff --git a/hyperion/pdfs/jfa/jfa_total.py b/hyperion/pdfs/jfa/jfa_total.py deleted file mode 100644 index 74fe0f95..00000000 --- a/hyperion/pdfs/jfa/jfa_total.py +++ /dev/null @@ -1,258 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np -from scipy import linalg as sla - -from ...hyp_defs import float_cpu -from ...utils.math import ( - invert_pdmat, - invert_trimat, - logdet_pdmat, - vec2symmat, - symmat2vec, -) -from ..core.pdf import PDF - - -class JFATotal(PDF): - def __init__(self, K, y_dim=None, T=None, **kwargs): - super(JFATotal, self).__init__(**kwargs) - if T is not None: - y_dim = T.shape[0] - - self.K = K - self.y_dim = y_dim - self.T = T - - # aux - self._TT = None - self.__upptr = None - - def reset_aux(self): - self._TT = None - - @property - def is_init(self): - if self._is_init: - return True - if self.T is not None: - self._is_init = True - return self._is_init - - def initialize(self, N, F): - assert N.shape[1] == self.K - - self.T = np.random.randn(self.y_dim, F.shape[1]).astype(float_cpu(), copy=False) - - def compute_py_g_x( - self, N, F, G=None, return_cov=False, return_elbo=False, return_acc=False - ): - assert self.is_init - x_dim = int(F.shape[1] / self.K) - M = F.shape[0] - y_dim = self.y_dim - - compute_inv = return_cov or return_acc - return_tuple = compute_inv or return_elbo - - TF = np.dot(F, self.T.T) - L = self.compute_L(self.TT, N, self._upptr) - y = np.zeros((M, y_dim), dtype=float_cpu()) - - if return_cov: - Sy = np.zeros((M, y_dim * (y_dim + 1) // 2), dtype=float_cpu()) - else: - Sy = None - - if return_elbo: - elbo = np.zeros((M,), dtype=float_cpu()) - - if return_acc: - Py = np.zeros((y_dim, y_dim), dtype=float_cpu()) - Ry = np.zeros((self.K, y_dim * (y_dim + 1) // 2), dtype=float_cpu()) - - Li = np.zeros((self.y_dim, self.y_dim), dtype=float_cpu()) - for i in range(N.shape[0]): - Li[self._upptr] = L[i] - r = invert_pdmat( - Li, right_inv=True, return_logdet=return_elbo, return_inv=compute_inv - ) - mult_iL = r[0] - if return_elbo: - elbo[i] = -r[2] / 2 - if compute_inv: - iL = r[-1] - - y[i] = mult_iL(TF[i]) - - if return_cov: - Sy[i] = iL[self._upptr] - - if return_acc: - iL += np.outer(y[i], y[i]) - Py += iL - Ry += iL[self._upptr] * N[i][:, None] - - if not return_tuple: - return y - - r = [y] - - if return_cov: - r += [Sy] - - if return_elbo: - if G is not None: - elbo += G - elbo += 0.5 * np.sum(TF * y, axis=-1) - r += [elbo] - - if return_acc: - r += [Ry, Py] - - return tuple(r) - - def Estep(self, N, F, G=None): - - y, elbo, Ry, Py = self.compute_py_g_x( - N, F, G, return_elbo=True, return_acc=True - ) - - M = y.shape[0] - y_acc = np.sum(y, axis=0) - Cy = np.dot(F.T, y) - - elbo = np.sum(elbo) - - stats = (elbo, M, y_acc, Ry, Cy, Py) - return stats - - def MstepML(self, stats): - _, M, y_acc, Ry, Cy, _ = stats - T = np.zeros_like(self.T) - Ryk = np.zeros((self.y_dim, self.y_dim), dtype=float_cpu()) - x_dim = T.shape[1] // self.K - for k in range(self.K): - idx = k * x_dim - Ryk[self._upptr] = Ry[k] - iRyk_mult = invert_pdmat(Ryk, right_inv=False)[0] - T[:, idx : idx + x_dim] = iRyk_mult(Cy[idx : idx + x_dim].T) - - self.T = T - self.reset_aux() - - def MstepMD(self, stats): - _, M, y_acc, Ry, Cy, Py = stats - mu_y = y_acc / M - Cy = Py / M - np.outer(mu_y, mu_y) - chol_Cy = sla.cholesky(Cy, lower=False, overwrite_a=True) - self.T = np.dot(chol_Cy, self.T) - - self.reset_aux() - - def fit( - self, - N, - F, - G=None, - N_val=None, - F_val=None, - epochs=20, - ml_md="ml+md", - md_epochs=None, - ): - - use_ml = False if ml_md == "md" else True - use_md = False if ml_md == "ml" else True - - if not self.is_init: - self.initialize(N, F) - - elbo = np.zeros((epochs,), dtype=float_cpu()) - elbo_val = np.zeros((epochs,), dtype=float_cpu()) - for epoch in range(epochs): - - stats = self.Estep(N, F, G) - elbo[epoch] = stats[0] - if N_val is not None and F_val is not None: - _, elbo_val_e = self.compute_py_g_x(N_val, F_val, return_elbo=True) - elbo_val[epoch] = np.sum(elbo_val_e) - - if use_ml: - self.MstepML(stats) - if use_md and (md_epochs is None or epoch in md_epochs): - self.MstepMD(stats) - - elbo_norm = elbo / np.sum(N) - if N_val is None: - return elbo, elbo_norm - else: - elbo_val_norm = elbo_val / np.sum(N_val) - return elbo, elbo_norm, elbo_val, elbo_val_norm - - @property - def TT(self): - if self._TT is None: - self._TT = self.compute_TT(self.T, self.K, self._upptr) - return self._TT - - @property - def _upptr(self): - if self.__upptr is None: - self.__upptr = np.triu(np.ones((self.y_dim, self.y_dim), dtype=bool)) - return self.__upptr - - @staticmethod - def compute_TT(T, K, upptr): - x_dim = int(T.shape[1] / K) - y_dim = T.shape[0] - TT = np.zeros((K, y_dim * (y_dim + 1) // 2), dtype=float_cpu()) - for k in range(K): - idx = k * x_dim - T_k = T[:, idx : idx + x_dim] - TT_k = np.dot(T_k, T_k.T) - TT[k] = TT_k[upptr] - - return TT - - @staticmethod - def compute_L(TT, N, upptr): - y_dim = upptr.shape[0] - I = np.eye(y_dim, dtype=float_cpu())[upptr] - return I + np.dot(N, TT) - - @staticmethod - def normalize_T(T, chol_prec): - Tnorm = np.zeros_like(T) - K = chol_prec.shape[0] - x_dim = int(T.shape[1] / K) - for k in range(K): - idx = k * x_dim - Tnorm[:, idx : idx + x_dim] = np.dot( - T[:, idx : idx + x_dim], chol_prec[k].T - ) - - return Tnorm - - def get_config(self): - config = {"K": self.K} - base_config = super(JFATotal, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - params = {"T": self.T} - self._save_params_from_dict(f, params) - - @classmethod - def load_params(cls, f, config): -
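For orientation, a minimal sketch of driving the total-variability EM training above (all sizes and arrays here are hypothetical; `N` holds per-utterance zeroth-order stats and `F` the centered first-order stats laid out component-major):

```python
import numpy as np

K, x_dim, y_dim, M = 256, 40, 100, 500    # hypothetical sizes
N = np.random.rand(M, K)                  # zeroth-order (occupancy) stats
F = np.random.randn(M, K * x_dim)         # centered first-order stats

tv = JFATotal(K=K, y_dim=y_dim)
elbo, elbo_norm = tv.fit(N, F, epochs=20, ml_md="ml+md")
y = tv.compute_py_g_x(N, F)               # posterior means, i.e. i-vectors
```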
param_list = ["T"] - params = cls._load_params_to_dict(f, config["name"], param_list) - kwargs = dict(list(config.items()) + list(params.items())) - return cls(**kwargs) - - def sample(self, num_samples): - pass diff --git a/hyperion/pdfs/mixtures/__init__.py b/hyperion/pdfs/mixtures/__init__.py deleted file mode 100644 index f9168905..00000000 --- a/hyperion/pdfs/mixtures/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - - -from .exp_family_mixture import ExpFamilyMixture -from .gmm_diag_cov import GMMDiagCov, DiagGMM -from .gmm_tied_diag_cov import GMMTiedDiagCov, DiagGMMTiedCov -from .gmm import GMM diff --git a/hyperion/pdfs/mixtures/exp_family_mixture.py b/hyperion/pdfs/mixtures/exp_family_mixture.py deleted file mode 100644 index 113bb8fc..00000000 --- a/hyperion/pdfs/mixtures/exp_family_mixture.py +++ /dev/null @@ -1,538 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import numpy as np - -import logging -from abc import ABCMeta, abstractmethod - -from ...hyp_defs import float_cpu -from ...utils.math import softmax, logsumexp -from ...utils.queues import GeneratorQueue -from ..core import PDF - - -class ExpFamilyMixture(PDF): - __metaclass__ = ABCMeta - - def __init__( - self, num_comp=1, pi=None, eta=None, min_N=0, update_pi=True, **kwargs - ): - super().__init__(**kwargs) - if pi is not None: - num_comp = len(pi) - self.num_comp = num_comp - self.pi = pi - self.eta = eta - self.min_N = min_N - self.A = None - self._log_pi = None - self.update_pi = update_pi - - @property - def is_init(self): - if not self._is_init: - if self.eta is not None and self.A is not None and self.pi is not None: - self.validate() - self._is_init = True - return self._is_init - - @property - def log_pi(self): - if self._log_pi is None: - self._log_pi = np.log(self.pi + 1e-15) - return self._log_pi - - def _validate_pi(self): - assert len(self.pi) == self.num_comp - - def fit( - self, - x, - sample_weight=None, - x_val=None, - sample_weight_val=None, - epochs=10, - batch_size=None, - ): - - if not self.is_init: - self.initialize(x) - - log_h = self.accum_log_h(x, sample_weight) - if x_val is not None: - log_h_val = self.accum_log_h(x_val, sample_weight_val) - - elbo = np.zeros((epochs,), dtype=float_cpu()) - elbo_val = np.zeros((epochs,), dtype=float_cpu()) - for epoch in range(epochs): - N, u_x = self.Estep(x=x, sample_weight=sample_weight, batch_size=batch_size) - elbo[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h) - self.Mstep(N, u_x) - - if x_val is not None: - N, u_x = self.Estep( - x=x_val, sample_weight=sample_weight_val, batch_size=batch_size - ) - elbo_val[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h_val) - - if x_val is None: - return elbo, elbo / x.shape[0] - else: - return elbo, elbo / x.shape[0], elbo_val, elbo_val / x.shape[0] - - def fit_generator( - self, - generator, - train_steps, - epochs=10, - val_data=None, - val_steps=0, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - ): - - do_validation = bool(validation_data) - val_gen = ( - hasattr(validation_data, "next") - or hasattr(validation_data, "__next__") - or isinstance(validation_data, Sequence) - ) - if val_gen and not validation_steps: - raise ValueError( - "When using a generator for validation data, " - "you must specify a value for " - "`validation_steps`." 
- ) - - if do_validation and not val_gen: - x, u_x_val, sample_weight_val = self.tuple2data(val_data) - log_h_val = self.accum_log_h(x, sample_weight_val) - - elbo = np.zeros((epochs,), dtype=float_cpu()) - elbo_val = np.zeros((epochs,), dtype=float_cpu()) - for epoch in range(epochs): - N, u_x, log_h = self.Estep_generator( - generator, - train_steps, - return_log_h=True, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - ) - - self.Mstep(N, u_x) - elbo[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h) - - if val_data is not None: - if val_gen: - N, u_x, log_h_val = self.Estep_generator( - generator, - train_steps, - return_log_h=True, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - ) - else: - N, u_x = self.Estep(x_val, u_x_val, sample_weight_val) - elbo_val[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h_val) - - if x_val is None: - return elbo, elbo / x.shape[0] - else: - return elbo, elbo / x.shape[0], elbo_val, elbo_val / x.shape[0] - - def log_h(self, x): - return 0 - - def accum_log_h(self, x, sample_weight=None): - if sample_weight is None: - return np.sum(self.log_h(x)) - return np.sum(sample_weight * self.log_h(x)) - - def compute_log_pz(self, x, u_x=None, mode="nat"): - if u_x is None: - u_x = self.compute_suff_stats(x) - return np.dot(u_x, self.eta.T) - self.A + self.log_pi - - def compute_pz(self, x, u_x=None, mode="nat"): - if mode == "nat": - return self.compute_pz_nat(x, u_x) - else: - return self.compute_pz_std(x) - - def compute_pz_nat(self, x, u_x=None): - if u_x is None: - u_x = self.compute_suff_stats(x) - logr = np.dot(u_x, self.eta.T) - self.A + self.log_pi - return softmax(logr) - - def compute_pz_std(self, x): - return self.compute_pz_nat(x) - - def compute_suff_stats(self, x): - return x - - def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None): - if u_x is not None or batch_size is None: - return self._accum_suff_stats_1batch(x, u_x, sample_weight) - else: - return self._accum_suff_stats_nbatches(x, sample_weight, batch_size) - - def _accum_suff_stats_1batch(self, x, u_x=None, sample_weight=None): - if u_x is None: - u_x = self.compute_suff_stats(x) - z = self.compute_pz_nat(x, u_x) - if sample_weight is not None: - z *= sample_weight[:, None] - - N = np.sum(z, axis=0) - acc_u_x = np.dot(z.T, u_x) - # L_z=gmm.ElnP_z_w(N,gmm.lnw)-gmm.Elnq_z(z); - return N, acc_u_x - - def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size): - sw_i = None - for i1 in range(0, x.shape[0], batch_size): - i2 = np.minimum(i1 + batch_size, x.shape[0]) - x_i = x[i1:i2, :] - if sample_weight is not None: - sw_i = sample_weight[i1:i2] - N_i, u_x_i = self._accum_suff_stats_1batch(x_i, sample_weight=sw_i) - if i1 == 0: - N = N_i - u_x = u_x_i - else: - N += N_i - u_x += u_x_i - return N, u_x - - def accum_suff_stats_segments( - self, x, segments, u_x=None, sample_weight=None, batch_size=None - ): - K = self.num_comp - num_segments = len(segments) - N = np.zeros((num_segments, K), dtype=float_cpu()) - acc_u_x = np.zeros((num_segments, K, self.eta.shape[1]), dtype=float_cpu()) - u_x_i = None - sw_i = None - for i in range(num_segments): - start = int(segments[i][0]) - end = int(segments[i][1]) + 1 - x_i = x[start:end] - if u_x is not None: - u_x_i = u_x[start:end] - if sample_weight is not None: - sw_i = sample_weight[start:end] - N_i, acc_u_x_i = self.accum_suff_stats( - x_i, u_x=u_x_i, sample_weight=sw_i, batch_size=batch_size - ) - N[i] = N_i - acc_u_x[i] = 
acc_u_x_i - - return N, acc_u_x - - def accum_suff_stats_segments_prob( - self, x, prob, u_x=None, sample_weight=None, batch_size=None - ): - if u_x is not None or batch_size is None: - return self._accum_suff_stats_segments_prob_1batch( - x, prob, u_x, sample_weight - ) - else: - return self._accum_suff_stats_segments_prob_nbatches( - x, prob, sample_weight, batch_size - ) - - def _accum_suff_stats_segments_prob_1batch( - self, x, prob, u_x=None, sample_weight=None - ): - if u_x is None: - u_x = self.compute_suff_stats(x) - z = self.compute_pz_nat(x, u_x) - if sample_weight is not None: - z *= sample_weight[:, None] - - K = len(self.pi) - num_segments = prob.shape[1] - N = np.zeros((num_segments, K), float_cpu()) - acc_u_x = np.zeros((num_segments, K, self.eta.shape[1]), float_cpu()) - - for i in range(num_segments): - z_i = z * prob[:, i][:, None] - N[i] = np.sum(z_i, axis=0) - acc_u_x[i] = np.dot(z_i.T, u_x) - - return N, acc_u_x - - def _accum_suff_stats_segments_prob_nbatches( - self, x, prob, sample_weight, batch_size - ): - - sw_i = None - for i1 in range(0, x.shape[0], batch_size): - i2 = np.minimum(i1 + batch_size, x.shape[0]) - x_i = x[i1:i2, :] - prob_i = prob[i1:i2, :] - if sample_weight is not None: - sw_i = sample_weight[i1:i2] - N_i, u_x_i = self._accum_suff_stats_segments_prob_1batch( - x_i, prob_i, sample_weight=sw_i - ) - if i1 == 0: - N = N_i - u_x = u_x_i - else: - N += N_i - u_x += u_x_i - return N, u_x - - def accum_suff_stats_sorttime( - self, - x, - frame_length, - frame_shift, - u_x=None, - sample_weight=None, - batch_size=None, - ): - if u_x is not None or batch_size is None: - return self._accum_suff_stats_sorttime_1batch( - x, frame_length, frame_shift, u_x, sample_weight - ) - else: - return self._accum_suff_stats_sorttime_nbatches( - x, frame_length, frame_shift, sample_weight, batch_size - ) - - def _accum_suff_stats_sorttime_1batch( - self, x, frame_length, frame_shift, u_x=None, sample_weight=None - ): - - K = len(self.pi) - num_frames = x.shape[0] - num_segments = int(np.floor((num_frames - frame_length) / frame_shift + 1)) - if num_segments == 1: - return self._accum_suff_stats_1batch(self, x, u_x, sample_weight) - - if u_x is None: - u_x = self.compute_suff_stats(x) - z = self.compute_pz_nat(x, u_x) - if sample_weight is not None: - z *= sample_weight[:, None] - - N = np.zeros((num_segments, K), float_cpu()) - acc_u_x = np.zeros((num_segments, K, self.eta.shape[1]), float_cpu()) - - start1 = int(frame_shift - 1) - end1 = int((num_segments - 1) * frame_shift) - start2 = int(start1 + frame_length) - end2 = int(end1 + frame_length) - cum_N = np.cumsum(z, axis=0) - N[0] = cum_N[frame_length - 1] - N[1:] = cum_N[start2:end2:frame_shift] - cum_N[start1:end1:frame_shift] - - for k in range(K): - cum_u_x_k = np.cumsum(z[:, k][:, None] * u_x, axis=0) - acc_u_x[0, k] = cum_u_x_k[frame_length - 1] - acc_u_x[1:, k] = ( - cum_u_x_k[start2:end2:frame_shift] - cum_u_x_k[start1:end1:frame_shift] - ) - - return N, acc_u_x - - def _accum_suff_stats_sorttime_nbatches( - self, x, frame_length, frame_shift, sample_weight, batch_size - ): - - K = len(self.pi) - num_frames = x.shape[0] - num_segments = int(np.floor((num_frames - frame_length) / frame_shift + 1)) - if num_segments == 1: - return self._accum_suff_stats_1batch(self, x, u_x, sample_weight) - - num_segments_per_batch = np.floor((num_frames - frame_length) / frame_shift + 1) - batch_size = int((num_segments_per_batch - 1) * frame_shift + frame_length) - batch_shift = int(num_segments_per_batch * frame_shift) - - N = 
np.zeros((num_segments, K), float_cpu()) - acc_u_x = np.zeros((num_segments, K, self.eta.shape[1]), float_cpu()) - - sw_i = None - cur_segment = 0 - for i1 in range(0, x.shape[0], batch_shift): - i2 = np.minimum(i1 + batch_size, x.shape[0]) - x_i = x[i1:i2, :] - if sample_weight is not None: - sw_i = sample_weight[i1:i2] - N_i, u_x_i = self._accum_suff_stats_sorttime_1batch( - x_i, frame_length, frame_shift, sample_weight=sw_i - ) - num_segments_i = N_i.shape[0] - N[cur_segment : cur_segment + num_segments_i] = N_i - acc_u_x[cur_segment : cur_segment + num_segments_i] = u_x_i - cur_segment += num_segments_i - return N, acc_u_x - - def Estep(self, x, u_x=None, sample_weight=None, batch_size=None): - return self.accum_suff_stats(x, u_x, sample_weight, batch_size) - - def Estep_generator( - self, - generator, - num_steps, - return_log_h, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - ): - wait_time = 0.01 # in secs - queue = None - N = None - acc_u_x = None - log_h = 0 - try: - queue = GeneratorQueue( - generator, use_multiprocessing=use_multiprocessing, wait_time=wait_time - ) - queue.start(workers=workers, max_queue_size=max_queue_size) - queue_generator = queue.get() - - cur_step = 0 - for cur_step in range(num_steps): - data = next(queue_generator) - x, u_x, sample_weight = self.tuple2data(data) - N_i, u_x_i = self.Estep(x, u_x, sample_weight) - if return_log_h: - log_h += self.accum_log_h(x) - if cur_step == 0: - N = N_i - acc_u_x = u_x_i - else: - N += N_i - acc_u_x += u_x_i - finally: - if queue is not None: - queue.stop() - - if return_log_h: - return N, acc_u_x, log_h - else: - return N, acc_u_x - - def sum_suff_stats(self, N, u_x): - assert len(N) == len(u_x) - acc_N = N[0] - acc_u_x = u_x[0] - for i in range(1, len(N)): - acc_N += N[i] - acc_u_x += u_x[i] - return acc_N, acc_u_x - - @abstractmethod - def Mstep(self, stats): - pass - - def elbo(self, x, u_x=None, N=1, log_h=None, sample_weight=None, batch_size=None): - if u_x is None: - N, u_x = self.accum_suff_stats( - x, sample_weight=sample_weight, batch_size=batch_size - ) - if log_h is None: - log_h = self.accum_log_h(x, sample_weight=sample_weight) - return log_h + np.sum(u_x * self.eta) + np.inner(N, self.log_pi - self.A) - - def log_prob(self, x, u_x=None, mode="nat"): - if mode == "nat": - return self.log_prob_nat(x, u_x) - else: - return self.log_prob_std(x) - - def log_prob_nat(self, x, u_x=None): - if u_x is None: - u_x = self.compute_suff_stats(x) - llk_k = np.dot(u_x, self.eta.T) - self.A + self.log_pi - llk = logsumexp(llk_k, axis=-1) - return self.log_h(x) + llk - - @abstractmethod - def log_prob_std(self, x): - pass - - def log_prob_nbest(self, x, u_x=None, mode="nat", nbest_mode="master", nbest=1): - if mode == "nat": - return self.log_prob_nbest_nat(x, u_x, nbest_mode=nbest_mode, nbest=nbest) - else: - return self.log_prob_nbest_std(x, nbest_mode=nbest_mode, nbest=nbest) - - def log_prob_nbest_nat(self, x, u_x=None, nbest_mode="master", nbest=1): - - if u_x is None: - u_x = self.compute_suff_stats(x) - if nbest_mode == "master": - assert isinstance(nbest, int) - llk_k = np.dot(u_x, self.eta.T) - self.A + self.log_pi - nbest = np.argsort(llk_k)[: -(nbest + 1) : -1] - llk_k = llk_k[nbest] - else: - llk_k = np.dot(u_x, self.eta[nbest, :].T) - self.A + self.log_pi - llk = logsumexp(llk_k) - return self.log_h(x) + llk - - @abstractmethod - def log_prob_nbest_std(self, x, nbest_mode="master", nbest=1): - pass - - def get_config(self): - config = {"min_n": self.min_N, "update_pi": self.update_pi} - base_config =
super(ExpFamilyMixture, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - @staticmethod - def tuple2data(data): - if isinstance(data, tuple): - if len(data) == 2: - x, u_x = data - if u_x.ndim == 2: - sample_weight = None - elif u_x.ndim == 1: - sample_weight = u_x - u_x = None - else: - raise ValueError("Generator output: " + str(data)) - elif len(data) == 3: - x, u_x, sample_weight = data - else: - raise ValueError("Generator output: " + str(data)) - else: - x = data - u_x = None - sample_weight = None - return x, u_x, sample_weight - - @staticmethod - def compute_A_nat(eta): - raise NotImplementedError() - - @staticmethod - def compute_A_std(params): - raise NotImplementedError() - - @staticmethod - def compute_eta(param): - raise NotImplementedError() - - @staticmethod - def compute_std(eta): - raise NotImplementedError() - - @abstractmethod - def _compute_nat_params(self): - pass - - @abstractmethod - def _compute_std_params(self): - pass diff --git a/hyperion/pdfs/mixtures/gmm.py b/hyperion/pdfs/mixtures/gmm.py deleted file mode 100644 index b71f0a61..00000000 --- a/hyperion/pdfs/mixtures/gmm.py +++ /dev/null @@ -1,433 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import numpy as np -import h5py -import scipy.linalg as la -from scipy.special import erf - - -from ...hyp_defs import float_cpu -from ...utils.math import ( - softmax, - logsumexp, - invert_pdmat, - invert_trimat, - symmat2vec, - vec2symmat, - fullcov_varfloor, - logdet_pdmat, -) -from ...utils.plotting import ( - plot_gaussian_1D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D, - plot_gaussian_3D, -) -from ...clustering import KMeans - -from ..core import Normal -from .exp_family_mixture import ExpFamilyMixture - - -class GMM(ExpFamilyMixture): - def __init__( - self, - mu=None, - Lambda=None, - var_floor=1e-3, - update_mu=True, - update_Lambda=True, - **kwargs - ): - super().__init__(**kwargs) - self.mu = mu - self.Lambda = Lambda - self.var_floor = var_floor - self.update_mu = update_mu - self.update_Lambda = update_Lambda - - self._compute_gmm_nat_std() - - self._logLambda = None - self._cholLambda = None - self._Sigma = None - - def _compute_gmm_nat_std(self): - if self.mu is not None and self.Lambda is not None: - self._validate_mu() - self._validate_Lambda() - self._compute_nat_params() - elif self.eta is not None: - self._validate_eta() - self.A = self.compute_A_nat(self.eta) - self._compute_std_params() - - def compute_Lambda_aux(self): - self._logLambda = np.zeros((self.num_comp,), dtype=float_cpu()) - self._cholLambda = np.zeros( - (self.num_comp, self.x_dim, self.x_dim), dtype=float_cpu() - ) - for i, L in enumerate(self.Lambda): - f, L, logL = invert_pdmat(L, return_logdet=True) - self._logLambda[i] = logL - self._cholLambda[i] = L.T - - @property - def logLambda(self): - if self._logLambda is None: - self.compute_Lambda_aux() - return self._logLambda - - @property - def cholLambda(self): - if self._cholLambda is None: - self.compute_Lambda_aux() - return self._cholLambda - - @property - def Sigma(self): - if self._Sigma is None: - self._Sigma = np.zeros( - (self.num_comp, self.x_dim, self.x_dim), dtype=float_cpu() - ) - for k in range(self.num_comp): - self._Sigma[k] = invert_pdmat(self.Lambda[k], return_inv=True)[-1] - return self._Sigma - - def initialize(self, x=None): - if x is None and self.mu is None and self.eta is None: - assert self.num_comp == 1 - 
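As a rough sketch of the EM interface that `ExpFamilyMixture` defines and the full-covariance `GMM` in this file implements (hypothetical data; `split_comp` grows the mixture by binary splitting as defined above):

```python
import numpy as np

x = np.random.randn(10000, 40).astype("float32")  # hypothetical features

gmm = GMM(num_comp=4, x_dim=40)
gmm.initialize(x)                                  # k-means initialization
elbo, elbo_per_sample = gmm.fit(x, epochs=10)

gmm8 = gmm.split_comp(2)                           # grow to 8 components
gmm8.fit(x, epochs=10)
```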
self._initialize_stdnormal() - if x is not None: - self._initialize_kmeans(self.num_comp, x) - self.validate() - self._compute_gmm_nat_std() - - def _initialize_stdnormal(self): - self.pi = np.array([1], dtype=float_cpu()) - self.mu = np.zeros((1, self.x_dim), dtype=float_cpu()) - self.Lambda = np.zeros((1, self.x_dim, self.x_dim), dtype=float_cpu()) - self.Lambda[0] = np.eye(self.x_dim, dtype=float_cpu()) - - def _initialize_kmeans(self, num_comp, x): - if num_comp == 1: - self.pi = np.array([1], dtype=float_cpu()) - self.mu = np.mean(x, axis=0, keepdims=True) - self.Lambda = np.zeros((1, self.x_dim, self.x_dim), dtype=float_cpu()) - delta = x - self.mu - S = np.dot(delta.T, delta) / x.shape[0] - self.Lambda[0] = invert_pdmat(S, return_inv=True)[-1] - return - - kmeans = KMeans(num_clusters=num_comp) - loss, cluster_index = kmeans.fit(x, epochs=100) - - self.mu = kmeans.mu - self.pi = np.zeros((self.num_comp,), dtype=float_cpu()) - self.Lambda = np.zeros( - (self.num_comp, self.x_dim, self.x_dim), dtype=float_cpu() - ) - - for k in range(num_comp): - r = cluster_index == k - self.pi[k] = np.sum(r) / x.shape[0] - delta = x[r] - self.mu[k] - S = np.dot(delta.T, delta) / np.sum(r) - self.Lambda[k] = invert_pdmat(S, return_inv=True)[-1] - - def stack_suff_stats(self, F, S=None): - if S is None: - return F - return np.hstack((F, S)) - - def unstack_suff_stats(self, stats): - F = stats[:, : self.x_dim] - S = stats[:, self.x_dim :] - return F, S - - def norm_suff_stats(self, N, u_x, return_order2=False): - F, S = self.unstack_suff_stats(u_x) - F_norm = F - N[:, None] * self.mu - for k in range(self.num_comp): - F_norm[k] = np.dot(F_norm[k], self.cholLambda[k].T) - if return_order2: - SS = vec2symmat(S[k]) - Fmu = np.outer(F[k], self.mu[k]) - SS = SS - Fmu - Fmu.T + N[k] * np.outer(self.mu[k], self.mu[k]) - SS = np.dot(self.cholLambda[k], np.dot(SS, self.cholLambda[k].T)) - S[k] = symmat2vec(SS) - if return_order2: - return N, self.stack_suff_stats(F_norm, S) - return N, F_norm - - def Mstep(self, N, u_x): - - F, S = self.unstack_suff_stats(u_x) - - if self.update_mu: - self.mu = F / N[:, None] - - if self.update_Lambda: - C = np.zeros((self.num_comp, self.x_dim, self.x_dim), dtype=float_cpu()) - for k in range(self.num_comp): - C[k] = vec2symmat(S[k] / N[k]) - C[k] -= np.outer(self.mu[k], self.mu[k]) - Sfloor = self.var_floor * np.mean(C, axis=0) - cholfloor = la.cholesky(Sfloor, overwrite_a=True) - for k in range(self.num_comp): - C[k] = fullcov_varfloor(C[k], cholfloor, F_is_chol=True) - self.Lambda[k] = invert_pdmat(C[k], return_inv=True)[-1] - self._Sigma = None - self._logLambda = None - self._cholLambda = None - - if self.update_pi: - N0 = N < self.min_N - if np.any(N0): - N[N0] = 0 - self.mu[N0] = 0 - S[N0] = 1 - self.pi = N / np.sum(N) - self._log_pi = None - - self._compute_nat_params() - - def split_comp(self, K=2): - - num_comp = self.num_comp * K - pi = np.repeat(self.pi, K) / K - Lambda = np.repeat(self.Lambda, K, axis=0) * (K ** 2) - mu = np.repeat(self.mu, K, axis=0) - - for g in range(self.num_comp): - w, v = la.eigh(self.Sigma[g]) - v *= np.sqrt(w) - if K == 2: - std_dev = np.sum(v, axis=1) - mu[2 * g] += std_dev - mu[2 * g + 1] -= std_dev - else: - for k in range(K): - factor = 2 * (np.random.uniform(size=(v.shape[1],)) > 0.5) - 1 - std_dev = np.sum(v * factor, axis=1) - mu[K * g + k] += std_dev - - config = self.get_config() - return GMM(pi=pi, mu=mu, Lambda=Lambda, **config) - - def log_prob_std(self, x): - r0 = self.log_pi + 0.5 * self.logLambda - 0.5 * self.x_dim * np.log(2 * np.pi) - llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu()) - for k in range(self.num_comp): - mah_dist2 = np.sum(np.dot(x - self.mu[k], self.cholLambda[k]) ** 2, axis=1) - llk_k[:, k] = r0[k] - 0.5 * mah_dist2 - - return logsumexp(llk_k, axis=-1) - - def sample(self, num_samples, rng=None, seed=1024): - if rng is None: - rng = np.random.RandomState(seed) - - r = rng.multinomial(1, self.pi, size=(num_samples,)) - x = np.zeros((num_samples, self.x_dim), dtype=float_cpu()) - for k in range(self.num_comp): - index = r[:, k] == 1 - n_k = np.sum(index) - if n_k == 0: - continue - x[index] = rng.multivariate_normal( - self.mu[k], self.Sigma[k], size=(n_k,) - ).astype(float_cpu()) - - return x - - def get_config(self): - config = { - "var_floor": self.var_floor, - "update_mu": self.update_mu, - "update_lambda": self.update_Lambda, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - params = {"pi": self.pi, "mu": self.mu, "Lambda": self.Lambda} - self._save_params_from_dict(f, params) - - @classmethod - def load_params(cls, f, config): - param_list = ["pi", "mu", "Lambda"] - params = cls._load_params_to_dict(f, config["name"], param_list) - return cls( - x_dim=config["x_dim"], - pi=params["pi"], - mu=params["mu"], - Lambda=params["Lambda"], - var_floor=config["var_floor"], - min_N=config["min_n"], - update_pi=config["update_pi"], - update_mu=config["update_mu"], - update_Lambda=config["update_lambda"], - name=config["name"], - ) - - @classmethod - def load_from_kaldi(cls, file_path): - pi = None - eta1 = None - eta2 = None - num_comp = 0 - x_dim = 0 - success = False - with open(file_path, "r") as f: - while True: - line = f.readline() - if not line: - break - fields = line.rstrip().split() - if fields[0] == "<WEIGHTS>": - pi = np.array([float(v) for v in fields[2:-1]], dtype=float_cpu()) - num_comp = len(pi) - elif fields[0] == "<MEANS_INVCOVARS>": - for k in range(num_comp): - line = f.readline() - fields = line.split() - if x_dim == 0: - x_dim = len(fields) - eta1 = np.zeros((num_comp, x_dim), dtype=float_cpu()) - eta2 = np.zeros( - (num_comp, int((x_dim ** 2 + x_dim) / 2)), - dtype=float_cpu(), - ) - - assert len(fields) == x_dim or len(fields) == x_dim + 1 - eta1[k] = [float(v) for v in fields[:x_dim]] - elif fields[0] == "<INV_COVARS>": - L = np.zeros((x_dim, x_dim), dtype=float_cpu()) - for k in range(num_comp): - L[:, :] = 0 - for j in range(x_dim): - line = f.readline() - fields = line.split() - if j < x_dim - 1: - assert len(fields) == j + 1 - else: - assert len(fields) == x_dim + 1 - L[j, : j + 1] = [float(v) for v in fields[: j + 1]] - eta2[k] = -symmat2vec(L.T, diag_factor=0.5) - if k == num_comp - 1: - success = True - assert success - eta = np.hstack((eta1, eta2)) - return cls(x_dim=x_dim, pi=pi, eta=eta) - - def _validate_mu(self): - assert self.mu.shape[0] == self.num_comp - assert self.mu.shape[1] == self.x_dim - - def _validate_Lambda(self): - assert self.Lambda.shape[0] == self.num_comp - assert self.Lambda.shape[1] == self.x_dim - assert self.Lambda.shape[2] == self.x_dim - - def _validate_eta(self): - assert self.eta.shape[0] == self.num_comp - assert self.eta.shape[1] == (self.x_dim ** 2 + 3 * self.x_dim) / 2 - - def validate(self): - if self.pi is not None: - self._validate_pi() - - if self.mu is not None and self.Lambda is not None: - self._validate_mu() - self._validate_Lambda() - - if self.eta is not None: - self._validate_eta() - - @staticmethod - def compute_eta(mu, Lambda): - x_dim = mu.shape[-1] -
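For reference, `compute_eta` and `compute_A_std` above implement the standard exponential-family form of the Gaussian; with precision $\Lambda$ and `symmat2vec` packing the upper triangle of a symmetric matrix into a vector:

$$\log \mathcal{N}(x;\mu,\Lambda^{-1}) = u(x)^\top \eta - A(\eta), \qquad \eta = \big(\Lambda\mu,\ -\tfrac{1}{2}\Lambda\big)\ \text{(packed)},$$

$$A = \tfrac{d}{2}\log 2\pi - \tfrac{1}{2}\log|\Lambda| + \tfrac{1}{2}\mu^\top\Lambda\mu,$$

so the per-component log-likelihoods in `log_prob_nat` reduce to `u_x @ eta.T - A + log_pi`.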
eta_dim = int((x_dim ** 2 + 3 * x_dim) / 2) - eta = np.zeros((mu.shape[0], eta_dim), dtype=float_cpu()) - for k in range(mu.shape[0]): - eta[k] = Normal.compute_eta(mu[k], Lambda[k]) - - return eta - - @staticmethod - def compute_std(eta): - x_dim = Normal.compute_x_dim_from_eta(eta) - mu = np.zeros((eta.shape[0], x_dim), dtype=float_cpu()) - Lambda = np.zeros((eta.shape[0], x_dim, x_dim), dtype="float32") - for k in range(eta.shape[0]): - mu[k], Lambda[k] = Normal.compute_std(eta[k]) - - return mu, Lambda - - @staticmethod - def compute_A_nat(eta): - A = np.zeros((eta.shape[0],), dtype=float_cpu()) - for k in range(eta.shape[0]): - A[k] = Normal.compute_A_nat(eta[k]) - - return A - - @staticmethod - def compute_A_std(mu, Lambda): - A = np.zeros((mu.shape[0],), dtype=float_cpu()) - for k in range(mu.shape[0]): - A[k] = Normal.compute_A_std(mu[k], Lambda[k]) - - return A - - def _compute_nat_params(self): - self.eta = self.compute_eta(self.mu, self.Lambda) - self.A = self.compute_A_nat(self.eta) - - def _compute_std_params(self): - self.mu, self.Lambda = self.compute_std(self.eta) - self._cholLambda = None - self._logLambda = None - self._Sigma = None - - @staticmethod - def compute_suff_stats(x): - d = x.shape[1] - u = np.zeros((x.shape[0], int(d + d * (d + 1) / 2)), dtype=float_cpu()) - u[:, :d] = x - k = d - for i in range(d): - for j in range(i, d): - u[:, k] = x[:, i] * x[:, j] - k += 1 - return u - - def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs): - mu = self.mu[:, feat_idx] - for k in range(mu.shape[0]): - C = invert_pdmat(self.Lambda[k], return_inv=True)[-1][feat_idx, feat_idx] - plot_gaussian_1D(mu[k], C, num_sigmas, num_pts, **kwargs) - - def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - mu = self.mu[:, feat_idx] - j, i = np.meshgrid(feat_idx, feat_idx) - for k in range(mu.shape[0]): - C_k = invert_pdmat(self.Lambda[k], return_inv=True)[-1][i, j] - plot_gaussian_ellipsoid_2D(mu[k], C_k, num_sigmas, num_pts, **kwargs) - - def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - mu = self.mu[:, feat_idx] - j, i = np.meshgrid(feat_idx, feat_idx) - for k in range(mu.shape[0]): - C_k = invert_pdmat(self.Lambda[k], return_inv=True)[-1][i, j] - plot_gaussian_3D(mu[k], C_k, num_sigmas, num_pts, **kwargs) - - def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs): - mu = self.mu[:, feat_idx] - j, i = np.meshgrid(feat_idx, feat_idx) - for k in range(mu.shape[0]): - C_k = invert_pdmat(self.Lambda[k], return_inv=True)[-1][i, j] - plot_gaussian_ellipsoid_3D(mu[k], C_k, num_sigmas, num_pts, **kwargs) diff --git a/hyperion/pdfs/mixtures/gmm_diag_cov.py b/hyperion/pdfs/mixtures/gmm_diag_cov.py deleted file mode 100644 index b586a900..00000000 --- a/hyperion/pdfs/mixtures/gmm_diag_cov.py +++ /dev/null @@ -1,376 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np -import h5py -from scipy.special import erf - -from ...hyp_defs import float_cpu -from ...utils.math import softmax, logsumexp -from ...utils.plotting import ( - plot_gaussian_1D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D, - plot_gaussian_3D, -) -from ...clustering import KMeans - -from .exp_family_mixture import ExpFamilyMixture - - -class GMMDiagCov(ExpFamilyMixture): - def __init__( - self, - mu=None, - Lambda=None, - var_floor=1e-3, - update_mu=True, - update_Lambda=True, - **kwargs - ): - super().__init__(**kwargs) - 
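A quick sample-and-score round trip (sketch, assuming a fitted `gmm` as in the earlier sketch):

```python
x = gmm.sample(num_samples=1000, seed=1234)  # draw from the mixture
llk = gmm.log_prob(x, mode="std")            # per-sample log-likelihood
print(llk.mean())
```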
self.mu = mu - self.Lambda = Lambda - self.var_floor = var_floor - self.update_mu = update_mu - self.update_Lambda = update_Lambda - - self._compute_gmm_nat_std() - - self._logLambda = None - self._cholLambda = None - self._Sigma = None - - def _compute_gmm_nat_std(self): - if self.mu is not None and self.Lambda is not None: - self._validate_mu() - self._validate_Lambda() - self._compute_nat_params() - elif self.eta is not None: - self._validate_eta() - self.A = self.compute_A_nat(self.eta) - self._compute_std_params() - - @property - def logLambda(self): - if self._logLambda is None: - self._logLambda = np.sum(np.log(self.Lambda), axis=-1) - return self._logLambda - - @property - def cholLambda(self): - if self._cholLambda is None: - self._cholLambda = np.sqrt(self.Lambda) - return self._cholLambda - - @property - def Sigma(self): - if self._Sigma is None: - self._Sigma = 1.0 / self.Lambda - return self._Sigma - - def initialize(self, x=None): - if x is None and self.mu is None and self.eta is None: - assert self.num_comp == 1 - self._initialize_stdnormal() - if x is not None: - self._initialize_kmeans(self.num_comp, x) - self.validate() - self._compute_gmm_nat_std() - - def _initialize_stdnormal(self): - self.pi = np.array([1], dtype=float_cpu()) - self.mu = np.zeros((1, self.x_dim), dtype=float_cpu()) - self.Lambda = np.ones((1, self.x_dim), dtype=float_cpu()) - - def _initialize_kmeans(self, num_comp, x): - if num_comp == 1: - self.pi = np.array([1], dtype=float_cpu()) - self.mu = np.mean(x, axis=0, keepdims=True) - self.Lambda = 1 / np.std(x, axis=0, keepdims=True) ** 2 - return - - kmeans = KMeans(num_clusters=num_comp) - loss, cluster_index = kmeans.fit(x, epochs=100) - - self.mu = kmeans.mu - self.pi = np.zeros((self.num_comp,), dtype=float_cpu()) - self.Lambda = np.zeros((self.num_comp, x.shape[-1]), dtype=float_cpu()) - for k in range(num_comp): - r = cluster_index == k - self.pi[k] = np.sum(r) / x.shape[0] - self.Lambda[k] = 1 / np.std(x[r], axis=0) ** 2 - - def stack_suff_stats(self, F, S=None): - if S is None: - return F - return np.hstack((F, S)) - - def unstack_suff_stats(self, stats): - F = stats[:, : self.x_dim] - S = stats[:, self.x_dim :] - return F, S - - def norm_suff_stats(self, N, u_x, return_order2=False): - F, S = self.unstack_suff_stats(acc_u_x) - F_norm = self.cholLambda * (F - N[:, None] * self.mu) - if return_order2: - S = S - 2 * self.mu * F + N * self.mu ** 2 - S *= self.Lambda - return N, self.stack_suff_stats(F_norm, S) - - return N, F_norm - - def Mstep(self, N, u_x): - - F, S = self.unstack_suff_stats(u_x) - - if self.update_mu: - self.mu = F / N[:, None] - - if self.update_Lambda: - S = S / N[:, None] - self.mu ** 2 - S_floor = self.var_floor * np.mean(S[N > self.min_N], axis=0) - S = np.maximum(S, S_floor) - self.Lambda = 1 / S - self._Sigma = S - self._cholLambda = None - self._logLambda = None - - if self.update_pi: - N0 = N < self.min_N - if np.any(N0): - N[N0] = 0 - mu[N0] = 0 - S[N0] = 1 - self.pi = N / np.sum(N) - self._log_pi = None - - self._compute_nat_params() - - def split_comp(self, K=2): - - std_dev = 1 / self.cholLambda - - num_comp = self.num_comp * K - pi = np.repeat(self.pi, K) / K - Lambda = np.repeat(self.Lambda, K, axis=0) * (K ** 2) - mu = np.repeat(self.mu, K, axis=0) - - if K == 2: - mu[::2] += std_dev - mu[1::2] -= std_dev - else: - for k in range(K): - factor = 2 * (np.random.uniform(size=std_dev.shape) > 0.5) - 1 - mu[k::K] += factor * std_dev - - config = self.get_config() - return GMMDiagCov(pi=pi, mu=mu, Lambda=Lambda, 
**config) - - def log_prob_std(self, x): - r0 = self.log_pi + 0.5 * self.logLambda - 0.5 * self.x_dim * np.log(2 * np.pi) - llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu()) - for k in range(self.num_comp): - mah_dist2 = np.sum(((x - self.mu[k]) * self.cholLambda[k]) ** 2, axis=-1) - llk_k[:, k] = r0[k] - 0.5 * mah_dist2 - return logsumexp(llk_k, axis=-1) - - def log_cdf(self, x): - llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu()) - for k in range(self.num_comp): - delta = (x - self.mu[k]) * self.cholLambda[k] - lk = 0.5 * (1 + erf(delta / np.sqrt(2))) - llk_k[:, k] = self.log_pi[k] + np.sum(np.log(lk + 1e-20), axis=-1) - - return logsumexp(llk_k, axis=-1) - - def sample(self, num_samples, rng=None, seed=1024): - if rng is None: - rng = np.random.RandomState(seed) - - r = rng.multinomial(1, self.pi, size=(num_samples,)) - x = rng.normal(size=(num_samples, self.x_dim)).astype(float_cpu()) - - for k in range(self.num_comp): - index = r[:, k] == 1 - x[index] = 1.0 / self.cholLambda[k] * x[index] + self.mu[k] - - return x - - def get_config(self): - config = { - "var_floor": self.var_floor, - "update_mu": self.update_mu, - "update_lambda": self.update_Lambda, - } - base_config = super(GMMDiagCov, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - params = {"pi": self.pi, "mu": self.mu, "Lambda": self.Lambda} - self._save_params_from_dict(f, params) - - @classmethod - def load_params(cls, f, config): - param_list = ["pi", "mu", "Lambda"] - params = cls._load_params_to_dict(f, config["name"], param_list) - return cls( - x_dim=config["x_dim"], - pi=params["pi"], - mu=params["mu"], - Lambda=params["Lambda"], - var_floor=config["var_floor"], - min_N=config["min_n"], - update_pi=config["update_pi"], - update_mu=config["update_mu"], - update_Lambda=config["update_lambda"], - name=config["name"], - ) - - @classmethod - def load_from_kaldi(cls, file_path): - pi = None - eta1 = None - eta2 = None - num_comp = 0 - x_dim = 0 - success = False - with open(file_path, "r") as f: - while True: - line = f.readline() - if not line: - break - fields = line.rstrip().split() - if fields[0] == "<WEIGHTS>": - pi = np.array([float(v) for v in fields[2:-1]], dtype=float_cpu()) - num_comp = len(pi) - elif fields[0] == "<MEANS_INVVARS>": - for k in range(num_comp): - line = f.readline() - fields = line.split() - if x_dim == 0: - x_dim = len(fields) - eta1 = np.zeros((num_comp, x_dim), dtype=float_cpu()) - eta2 = np.zeros((num_comp, x_dim), dtype=float_cpu()) - - assert len(fields) == x_dim or len(fields) == x_dim + 1 - eta1[k] = [float(v) for v in fields[:x_dim]] - elif fields[0] == "<INV_VARS>": - for k in range(num_comp): - line = f.readline() - fields = line.split() - assert len(fields) == x_dim or len(fields) == x_dim + 1 - eta2[k] = [-0.5 * float(v) for v in fields[:x_dim]] - if k == num_comp - 1: - success = True - assert success - eta = np.hstack((eta1, eta2)) - return cls(x_dim=x_dim, pi=pi, eta=eta) - - def _validate_mu(self): - assert self.mu.shape[0] == self.num_comp - assert self.mu.shape[1] == self.x_dim - - def _validate_Lambda(self): - assert self.Lambda.shape[0] == self.num_comp - assert self.Lambda.shape[1] == self.x_dim - assert np.all(self.Lambda > 0) - - def _validate_eta(self): - assert self.eta.shape[0] == self.num_comp - assert self.eta.shape[1] == self.x_dim * 2 - - def validate(self): - if self.pi is not None: - self._validate_pi() - - if self.mu is not None and self.Lambda is not None: - self._validate_mu() - self._validate_Lambda() - - if
self.eta is not None: - self._validate_eta() - - @staticmethod - def compute_eta(mu, Lambda): - Lmu = Lambda * mu - eta = np.hstack((Lmu, -0.5 * Lambda)) - return eta - - @staticmethod - def compute_std(eta): - x_dim = int(eta.shape[-1] / 2) - eta1 = eta[:, :x_dim] - eta2 = eta[:, x_dim:] - mu = -0.5 * eta1 / eta2 - Lambda = -2 * eta2 - return mu, Lambda - - @staticmethod - def compute_A_nat(eta): - x_dim = int(eta.shape[-1] / 2) - eta1 = eta[:, :x_dim] - eta2 = eta[:, x_dim:] - r1 = 0.5 * x_dim * np.log(2 * np.pi) - r2 = -1 / 4 * np.sum(eta1 * eta1 / eta2, axis=-1) - r3 = -1 / 2 * np.sum(np.log(-2 * eta2), axis=-1) - return r1 + r2 + r3 - - @staticmethod - def compute_A_std(mu, Lambda): - x_dim = mu.shape[1] - r1 = 0.5 * x_dim * np.log(2 * np.pi) - r2 = -0.5 * np.sum(np.log(Lambda), axis=-1) - r3 = 0.5 * np.sum(mu * mu * Lambda, axis=-1) - return r1 + r2 + r3 - - def _compute_nat_params(self): - self.eta = self.compute_eta(self.mu, self.Lambda) - self.A = self.compute_A_nat(self.eta) - - def _compute_std_params(self): - self.mu, self.Lambda = self.compute_std(self.eta) - self._cholLambda = None - self._logLambda = None - self._Sigma = None - - @staticmethod - def compute_suff_stats(x): - d = x.shape[-1] - u = np.zeros((x.shape[0], 2 * d), dtype=float_cpu()) - u[:, :d] = x - u[:, d:] = x * x - return u - - def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs): - mu = self.mu[:, feat_idx] - C = 1 / self.Lambda[:, feat_idx] - for k in range(mu.shape[0]): - plot_gaussian_1D(mu[k], C[k], num_sigmas, num_pts, **kwargs) - - def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - mu = self.mu[:, feat_idx] - C = 1 / self.Lambda[:, feat_idx] - for k in range(mu.shape[0]): - C_k = np.diag(C[k]) - plot_gaussian_ellipsoid_2D(mu[k], C_k, num_sigmas, num_pts, **kwargs) - - def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - mu = self.mu[:, feat_idx] - C = 1 / self.Lambda[:, feat_idx] - for k in range(mu.shape[0]): - C_k = np.diag(C[k]) - plot_gaussian_3D(mu[k], C_k, num_sigmas, num_pts, **kwargs) - - def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs): - mu = self.mu[:, feat_idx] - C = 1 / self.Lambda[:, feat_idx] - for k in range(mu.shape[0]): - C_k = np.diag(C[k]) - plot_gaussian_ellipsoid_3D(mu[k], C_k, num_sigmas, num_pts, **kwargs) - - -DiagGMM = GMMDiagCov diff --git a/hyperion/pdfs/mixtures/gmm_tied_diag_cov.py b/hyperion/pdfs/mixtures/gmm_tied_diag_cov.py deleted file mode 100644 index a3e7f93e..00000000 --- a/hyperion/pdfs/mixtures/gmm_tied_diag_cov.py +++ /dev/null @@ -1,198 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import numpy as np -import h5py -from scipy.special import erf - -from ...hyp_defs import float_cpu -from ...utils.math import softmax, logsumexp -from ...utils.plotting import ( - plot_gaussian_1D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D, - plot_gaussian_3D, -) -from ...clustering import KMeans - -from .gmm_diag_cov import GMMDiagCov - - -class GMMTiedDiagCov(GMMDiagCov): - def __init__( - self, - mu=None, - Lambda=None, - var_floor=1e-3, - update_mu=True, - update_Lambda=True, - **kwargs - ): - super().__init__( - mu=mu, - Lambda=Lambda, - var_floor=var_floor, - update_mu=update_mu, - update_Lambda=update_Lambda, - **kwargs - ) - - def _compute_gmm_nat_std(self): - if self.mu is not None and self.Lambda is not None: - self._validate_mu() - self._validate_Lambda() - 
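A small numeric check (sketch) of the diagonal-covariance identities implemented by the static methods above, for a single hypothetical component:

```python
import numpy as np

d = 4
mu = np.random.randn(1, d)
Lambda = np.random.rand(1, d) + 1.0        # diagonal precisions

eta = GMMDiagCov.compute_eta(mu, Lambda)   # [Lambda*mu, -0.5*Lambda]
A = GMMDiagCov.compute_A_nat(eta)
x = np.random.randn(1, d)
u = GMMDiagCov.compute_suff_stats(x)       # [x, x**2]

# log N(x; mu, diag(1/Lambda)) should equal u(x).eta - A(eta)
log_N = (-0.5 * d * np.log(2 * np.pi) + 0.5 * np.log(Lambda).sum()
         - 0.5 * (Lambda * (x - mu) ** 2).sum())
assert np.isclose(log_N, (u @ eta.T - A).item())
```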
self._compute_nat_params() - elif self.eta is not None: - self._validate_eta() - self.A = self.compute_A_nat(self.eta) - self._compute_std_params() - - def _initialize_stdnormal(self): - self.pi = np.array([1], dtype=float_cpu()) - self.mu = np.zeros((1, self.x_dim), dtype=float_cpu()) - self.Lambda = np.ones((self.x_dim,), dtype=float_cpu()) - - def _initialize_kmeans(self, num_comp, x): - if num_comp == 1: - self.pi = np.array([1], dtype=float_cpu()) - self.mu = np.mean(x, axis=0, keepdims=True) - self.Lambda = 1 / np.std(x, axis=0, keepdims=True) ** 2 - return - - kmeans = KMeans(num_clusters=num_comp) - loss, cluster_index = kmeans.fit(x, epochs=100) - - self.mu = kmeans.mu - self.pi = np.zeros((self.num_comp,), dtype=float_cpu()) - C = np.zeros((x.shape[-1],), dtype=float_cpu()) - for k in range(num_comp): - r = cluster_index == k - self.pi[k] = np.sum(r) / x.shape[0] - delta = x[r] - self.mu[k] - C += np.sum(delta ** 2, axis=0) - - self.Lambda = x.shape[0] / C - - def Mstep(self, N, u_x): - - F, S = self.unstack_suff_stats(u_x) - - if self.update_mu: - self.mu = F / N[:, None] - - if self.update_Lambda: - S = S / N[:, None] - self.mu ** 2 - S_floor = self.var_floor * np.mean(S[N > self.min_N], axis=0) - S = np.maximum(S, S_floor) - Spool = np.sum(N[:, None] * S, axis=0) / np.sum(N) - self.Lambda = 1 / Spool - self._Sigma = Spool - self._cholLambda = None - self._logLambda = None - - if self.update_pi: - N0 = N < self.min_N - if np.any(N0): - N[N0] = 0 - mu[N0] = 0 - S[N0] = 1 - self.pi = N / np.sum(N) - self._log_pi = None - - self._compute_nat_params() - - def split_comp(self, K=2): - - std_dev = 1 / self.cholLambda - - num_comp = self.num_comp * K - pi = np.repeat(self.pi, K) / K - mu = np.repeat(self.mu, K, axis=0) - - if K == 2: - mu[::2] += std_dev - mu[1::2] -= std_dev - else: - for k in range(K): - factor = 2 * (np.random.uniform(size=std_dev.shape) > 0.5) - 1 - mu[k::K] += factor * std_dev - - config = self.get_config() - return DiagGMMTiedCov(pi=pi, mu=mu, Lambda=self.Lambda, **config) - - def log_prob_std(self, x): - r0 = self.log_pi + 0.5 * self.logLambda - 0.5 * self.x_dim * np.log(2 * np.pi) - llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu()) - for k in range(self.num_comp): - mah_dist2 = np.sum(((x - self.mu[k]) * self.cholLambda) ** 2, axis=-1) - llk_k[:, k] = r0[k] - 0.5 * mah_dist2 - return logsumexp(llk_k, axis=-1) - - def log_cdf(self, x): - llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu()) - for k in range(self.num_comp): - delta = (x - self.mu[k]) * self.cholLambda - lk = 0.5 * (1 + erf(delta / np.sqrt(2))) - llk_k[:, k] = self.log_pi[k] + np.sum(np.log(lk + 1e-20), axis=-1) - - return logsumexp(llk_k) - - def sample(self, num_samples, rng=None, seed=1024): - if rng is None: - rng = np.random.RandomState(seed) - - r = rng.multinomial(1, self.pi, size=(num_samples,)) - x = rng.normal(size=(num_samples, self.x_dim)).astype(float_cpu()) - - for k in range(self.num_comp): - index = r[:, k] == 1 - x[index] = 1.0 / self.cholLambda * x[index] + self.mu[k] - - return x - - def _validate_Lambda(self): - assert self.Lambda.shape[0] == self.x_dim - assert np.all(self.Lambda > 0) - - @staticmethod - def compute_eta(mu, Lambda): - Lmu = Lambda * mu - eta = np.hstack((Lmu, -0.5 * np.tile(Lambda, (mu.shape[0], 1)))) - return eta - - @staticmethod - def compute_std(eta): - x_dim = int(eta.shape[-1] / 2) - eta1 = eta[:, :x_dim] - eta2 = eta[:, x_dim:] - mu = -0.5 * eta1 / eta2 - Lambda = -2 * eta2[0] - return mu, Lambda - - def plot1D(self, 
feat_idx=0, num_sigmas=2, num_pts=100, **kwargs): - mu = self.mu[:, feat_idx] - C = 1 / self.Lambda[feat_idx] - for k in range(mu.shape[0]): - plot_gaussian_1D(mu[k], C, num_sigmas, num_pts, **kwargs) - - def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - mu = self.mu[:, feat_idx] - C = np.diag(1 / self.Lambda[feat_idx]) - for k in range(mu.shape[0]): - plot_gaussian_ellipsoid_2D(mu[k], C, num_sigmas, num_pts, **kwargs) - - def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - mu = self.mu[:, feat_idx] - C = np.diag(1 / self.Lambda[feat_idx]) - for k in range(mu.shape[0]): - plot_gaussian_3D(mu[k], C, num_sigmas, num_pts, **kwargs) - - def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs): - mu = self.mu[:, feat_idx] - C = np.diag(1 / self.Lambda[feat_idx]) - for k in range(mu.shape[0]): - plot_gaussian_ellipsoid_3D(mu[k], C, num_sigmas, num_pts, **kwargs) - - -DiagGMMTiedCov = GMMTiedDiagCov diff --git a/hyperion/pdfs/plda/__init__.py b/hyperion/pdfs/plda/__init__.py deleted file mode 100644 index 9d11ad38..00000000 --- a/hyperion/pdfs/plda/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - - -from .plda_base import PLDABase -from .frplda import FRPLDA -from .splda import SPLDA -from .plda import PLDA diff --git a/hyperion/pdfs/plda/frplda.py b/hyperion/pdfs/plda/frplda.py deleted file mode 100644 index 5ea628fe..00000000 --- a/hyperion/pdfs/plda/frplda.py +++ /dev/null @@ -1,410 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np -from scipy import linalg as sla - -from ...hyp_defs import float_cpu -from ...utils.math import invert_pdmat, invert_trimat, logdet_pdmat -from .plda_base import PLDABase - - -class FRPLDA(PLDABase): - def __init__( - self, - mu=None, - B=None, - W=None, - fullcov_W=True, - update_mu=True, - update_B=True, - update_W=True, - **kwargs - ): - super(FRPLDA, self).__init__(mu=mu, update_mu=update_mu, **kwargs) - if mu is not None: - self.y_dim = mu.shape[0] - self.B = B - self.W = W - self.fullcov_W = fullcov_W - self.update_B = update_B - self.update_W = update_W - - def validate(self): - assert self.mu.shape[0] == self.B.shape[0] - assert self.mu.shape[0] == self.B.shape[1] - assert self.mu.shape[0] == self.W.shape[0] - assert self.mu.shape[0] == self.W.shape[1] - - @property - def is_init(self): - if self._is_init: - return True - if self.mu is not None and self.B is not None and self.W is not None: - self.validate() - self._is_init = True - return self._is_init - - def initialize(self, D): - N, F, S = D - self.x_dim = F.shape[1] - self.y_dim = F.shape[1] - M = F.shape[0] - N_tot = np.sum(N) - - y = F / N[:, None] - Fy = np.dot(F.T, y) - C = S - Fy - Fy.T - for i in range(M): - yy = np.outer(y[i, :], y[i, :]) - C += N[i] * yy - - C = (C + C.T) / 2 - mu = np.mean(y, axis=0) - iB = np.dot(y.T, y) / M - np.outer(mu, mu) - iW = C / N_tot - - B = invert_pdmat(iB, return_inv=True)[-1] - W = invert_pdmat(iW, return_inv=True)[-1] - - self.mu = mu - self.B = B - self.W = W - self._is_init = True - - def compute_py_g_x( - self, D, return_cov=False, return_logpy_0=False, return_acc=False - ): - - assert self.is_init - - N, F, S = D - - M = F.shape[0] - y_dim = self.y_dim - assert y_dim == F.shape[1] - - compute_inv = return_cov or return_acc - return_tuple = 
compute_inv or return_logpy_0 - - N_is_int = False - if np.all(np.ceil(N) == N): - N_is_int = True - - gamma = np.dot(F, self.W) + np.dot(self.mu, self.B) - if N_is_int: - iterator = np.unique(N) - else: - iterator = range(M) - - y = np.zeros_like(F) - if return_cov: - Sigma_y = np.zeros((M, y_dim, y_dim), dtype=float_cpu()) - else: - Sigma_y = None - - if return_logpy_0: - logpy = -0.5 * y_dim * np.log(2 * np.pi) * np.ones((M,), dtype=float_cpu()) - - if return_acc: - Py = np.zeros((y_dim, y_dim), dtype=float_cpu()) - Ry = np.zeros((y_dim, y_dim), dtype=float_cpu()) - - for k in iterator: - if N_is_int: - i = (N == k).nonzero()[0] - N_i = k - M_i = len(i) - else: - i = k - N_i = N[k] - M_i = 1 - - L_i = self.B + N_i * self.W - - r = invert_pdmat( - L_i, - right_inv=True, - return_logdet=return_logpy_0, - return_inv=compute_inv, - ) - mult_iL = r[0] - if return_logpy_0: - logL = r[2] - if compute_inv: - iL = r[-1] - - y[i, :] = mult_iL(gamma[i, :]) - - if return_cov: - Sigma_y[i, :, :] = iL - - if return_logpy_0: - logpy[i] += 0.5 * (logL - np.sum(y[i, :] * gamma[i, :], axis=-1)) - - if return_acc: - Py += M_i * iL - - if not return_tuple: - return y - - r = [y] - if return_cov: - r += [Sigma_y] - if return_logpy_0: - r += [logpy] - if return_acc: - r += [Ry, Py] - return r - - def Estep(self, D): - N, F, S = D - y, logpy, Ry, Py = self.compute_py_g_x(D, return_logpy_0=True, return_acc=True) - - M = F.shape[0] - N_tot = np.sum(N) - - y_acc = np.sum(y, axis=0) - Cy = np.dot(F.T, y) - - Niy = y * N[:, None] - Ry += np.dot(Niy.T, y) - Py += np.dot(y.T, y) - - logpy_acc = np.sum(logpy) - - stats = (N_tot, M, S, logpy_acc, y_acc, Ry, Cy, Py) - return stats - - def elbo(self, stats): - N, M, S, logpy_x = stats[:4] - - logW = logdet_pdmat(self.W) - logB = logdet_pdmat(self.B) - - logpx_y = 0.5 * ( - -N * self.x_dim * np.log(2 * np.pi) - + N * logW - - np.inner(self.W.ravel(), S.ravel()) - ) - logpy = ( - 0.5 - * M - * ( - -self.y_dim * np.log(2 * np.pi) - + logB - - np.inner(np.dot(self.mu, self.B), self.mu) - ) - ) - - elbo = logpx_y + logpy - logpy_x - return elbo - # N, M, sumy, yy, _, _, CW, logL = stats - # ymu = np.outer(sumy, mu) - # CB = yy - ymu -ymu.T + M*np.outer(self.mu, self.mu.T) - - # logW = logdet_pdmat(self.W) - # logB = logdet_pdmat(self.B) - - # elbo = 0.5*(-logL - N*self.x_dim*np.log(2*np.pi) - # +N*logW - np.inner(self.W.ravel(), CW.ravel()) - # +M*logB - np.inner(self.B.ravel(), CB.ravel())) - # return elbo - - def MstepML(self, stats): - N, M, S, _, y_acc, Ry, Cy, Py = stats - ybar = y_acc / M - if self.update_mu: - self.mu = ybar - if self.update_B: - if self.update_mu: - iB = Py / M - np.outer(self.mu, self.mu) - else: - muybar = np.outer(self.mu, ybar) - iB = Py / M - muybar - muybar + np.outer(self.mu, self.mu) - self.B = invert_pdmat(iB, return_inv=True)[-1] - if self.update_W: - iW = (S - Cy - Cy.T + Ry) / N - if self.fullcov_W: - self.W = invert_pdmat(iW, return_inv=True)[-1] - else: - self.W = np.diag(1 / np.diag(iW)) - - def MstepMD(self, stats): - pass - - def get_config(self): - config = { - "update_W": self.update_W, - "update_B": self.update_B, - "fullcov_W": self.fullcov_W, - } - base_config = super(FRPLDA, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - params = {"mu": self.mu, "B": self.B, "W": self.W} - self._save_params_from_dict(f, params) - - @classmethod - def load_params(cls, f, config): - param_list = ["mu", "B", "W"] - params = cls._load_params_to_dict(f, config["name"], param_list) - 
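For context, a minimal end-to-end sketch of training and scoring this two-covariance PLDA (all arrays hypothetical; trials are scored as log-likelihood ratios):

```python
import numpy as np

train_x = np.random.randn(5000, 200)              # hypothetical embeddings
class_ids = np.random.randint(0, 100, size=5000)  # speaker labels

plda = FRPLDA()
elbo, elbo_norm = plda.fit(train_x, class_ids=class_ids, epochs=20)

enroll_x = np.random.randn(10, 200)
test_x = np.random.randn(30, 200)
scores = plda.llr_1vs1(enroll_x, test_x)          # (10, 30) LLR matrix
```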
kwargs = dict(list(config.items()) + list(params.items())) - return cls(**kwargs) - - def llr_1vs1(self, x1, x2): - - assert self.is_init - - Lnon = self.B + self.W - mult_icholLnon, logcholLnon = invert_trimat( - sla.cholesky(Lnon, lower=False, overwrite_a=True), - right_inv=True, - return_logdet=True, - )[:2] - logLnon = 2 * logcholLnon - - Ltar = self.B + 2 * self.W - mult_icholLtar, logcholLtar = invert_trimat( - sla.cholesky(Ltar, lower=False, overwrite_a=True), - right_inv=True, - return_logdet=True, - )[:2] - logLtar = 2 * logcholLtar - - WF1 = np.dot(x1, self.W) - WF2 = np.dot(x2, self.W) - Bmu = np.dot(self.mu, self.B) - - gamma_non_1 = mult_icholLnon(WF1 + Bmu) - gamma_non_2 = mult_icholLnon(WF2 + Bmu) - - Qnon_1 = np.sum(gamma_non_1 * gamma_non_1, axis=1)[:, None] - Qnon_2 = np.sum(gamma_non_2 * gamma_non_2, axis=1) - - gamma_tar_1 = mult_icholLtar(WF1 + 0.5 * Bmu) - gamma_tar_2 = mult_icholLtar(WF2 + 0.5 * Bmu) - - Qtar_1 = np.sum(gamma_tar_1 * gamma_tar_1, axis=1)[:, None] - Qtar_2 = np.sum(gamma_tar_2 * gamma_tar_2, axis=1) - - scores = 2 * np.dot(gamma_tar_1, gamma_tar_2.T) - scores += Qtar_1 - Qnon_1 + Qtar_2 - Qnon_2 - scores += ( - 2 * logLnon - - logLtar - - logdet_pdmat(self.B) - + np.inner(np.dot(self.mu, self.B), self.mu) - ) - scores *= 0.5 - return scores - - def llr_NvsM_book(self, D1, D2): - - assert self.is_init - - N1, F1, _ = D1 - N2, F2, _ = D2 - - Bmu = np.dot(self.mu, self.B) - - scores = np.zeros((len(N1), len(N2)), dtype=float_cpu()) - for N1_i in np.unique(N1): - for N2_j in np.unique(N2): - i = np.where(N1 == N1_i)[0] - j = np.where(N2 == N2_j)[0] - - L1 = self.B + N1_i * self.W - mult_icholL1, logcholL1 = invert_trimat( - sla.cholesky(L1, lower=False, overwrite_a=True), - right_inv=True, - return_logdet=True, - )[:2] - logL1 = 2 * logcholL1 - - L2 = self.B + N2_j * self.W - mult_icholL2, logcholL2 = invert_trimat( - sla.cholesky(L2, lower=False, overwrite_a=True), - right_inv=True, - return_logdet=True, - )[:2] - logL2 = 2 * logcholL2 - - Ltar = self.B + (N1_i + N2_j) * self.W - mult_icholLtar, logcholLtar = invert_trimat( - sla.cholesky(Ltar, lower=False, overwrite_a=True), - right_inv=True, - return_logdet=True, - )[:2] - logLtar = 2 * logcholLtar - - WF1 = np.dot(F1[i, :], self.W) - WF2 = np.dot(F2[j, :], self.W) - - gamma_non_1 = mult_icholL1(WF1 + Bmu) - gamma_non_2 = mult_icholL2(WF2 + Bmu) - - Qnon_1 = np.sum(gamma_non_1 * gamma_non_1, axis=1)[:, None] - Qnon_2 = np.sum(gamma_non_2 * gamma_non_2, axis=1) - - gamma_tar_1 = mult_icholLtar(WF1 + 0.5 * Bmu) - gamma_tar_2 = mult_icholLtar(WF2 + 0.5 * Bmu) - - Qtar_1 = np.sum(gamma_tar_1 * gamma_tar_1, axis=1)[:, None] - Qtar_2 = np.sum(gamma_tar_2 * gamma_tar_2, axis=1) - - scores_ij = 2 * np.dot(gamma_tar_1, gamma_tar_2.T) - scores_ij += Qtar_1 - Qnon_1 + Qtar_2 - Qnon_2 - scores_ij += logL1 + logL2 - logLtar - scores[np.ix_(i, j)] = scores_ij - - scores += -logdet_pdmat(self.B) + np.inner(np.dot(self.mu, self.B), self.mu) - scores *= 0.5 - return scores - - def sample( - self, num_classes, num_samples_per_class, rng=None, seed=1024, return_y=False - ): - - assert self.is_init - - if rng is None: - rng = np.random.RandomState(seed=seed) - - Sb = invert_pdmat(self.B, return_inv=True)[-1] - chol_Sb = sla.cholesky(Sb, lower=False) - Sw = invert_pdmat(self.W, return_inv=True)[-1] - chol_Sw = sla.cholesky(Sw, lower=False) - - x_dim = self.mu.shape[0] - z = rng.normal(size=(num_classes * num_samples_per_class, x_dim)).astype( - dtype=float_cpu(), copy=False - ) - z = np.dot(z, chol_Sw) - y = 
rng.normal(size=(num_classes, x_dim)).astype(dtype=float_cpu(), copy=False) - y = np.dot(y, chol_Sb) + self.mu - y = np.repeat(y, num_samples_per_class, axis=0) - - if return_y: - return y + z, y - - return y + z - - def weighted_avg_params(self, mu, B, W, w_mu, w_B, w_W): - super(FRPLDA, self).weigthed_avg_params(mu, w_mu) - if w_B > 0: - Sb0 = invert_pdmat(self.B, return_inv=True)[-1] - Sb = invert_pdmat(B, return_inv=True)[-1] - Sb = w_B * Sb + (1 - w_B) * Sb0 - self.B = invert_pdmat(Sb, return_inv=True)[-1] - if w_W > 0: - Sw0 = invert_pdmat(self.W, return_inv=True)[-1] - Sw = invert_pdmat(W, return_inv=True)[-1] - Sw = w_W * Sw + (1 - w_W) * Sw0 - self.W = invert_pdmat(Sw, return_inv=True)[-1] - - def weighted_avg_model(self, plda, w_mu, w_B, w_W): - self.weighted_avg_params(plda.mu, plda.B, plda.W, w_mu, w_B, w_W) diff --git a/hyperion/pdfs/plda/plda_base.py b/hyperion/pdfs/plda/plda_base.py deleted file mode 100644 index 8a83543d..00000000 --- a/hyperion/pdfs/plda/plda_base.py +++ /dev/null @@ -1,367 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np - -from abc import ABCMeta, abstractmethod - -from ...hyp_defs import float_cpu -from ..core.pdf import PDF -from ...transforms import LNorm - - -class PLDABase(PDF): - __metaclass__ = ABCMeta - - def __init__(self, y_dim=None, mu=None, update_mu=True, **kwargs): - super(PLDABase, self).__init__(**kwargs) - self.mu = mu - self.y_dim = y_dim - self.update_mu = update_mu - if mu is not None: - self.x_dim = mu.shape[0] - - @abstractmethod - def initialize(self, D): - pass - - @abstractmethod - def compute_py_g_x(self, D): - pass - - def fit( - self, - x, - class_ids=None, - ptheta=None, - sample_weight=None, - x_val=None, - class_ids_val=None, - ptheta_val=None, - sample_weight_val=None, - epochs=20, - ml_md="ml+md", - md_epochs=None, - ): - - use_ml = False if ml_md == "md" else True - use_md = False if ml_md == "ml" else True - - assert not (class_ids is None and ptheta is None) - if class_ids is None: - D = self.compute_stats_soft(x, ptheta) - else: - D = self.compute_stats_hard(x, class_ids) - - if x_val is not None: - assert not (class_ids_val is None and ptheta_val is None) - if class_ids_val is None: - D_val = self.compute_stats_soft(x_val, ptheta_val) - else: - D_val = self.compute_stats_hard(x_val, class_ids_val) - - if not self.is_init: - self.initialize(D) - - elbo = np.zeros((epochs,), dtype=float_cpu()) - elbo_val = np.zeros((epochs,), dtype=float_cpu()) - for epoch in range(epochs): - - stats = self.Estep(D) - elbo[epoch] = self.elbo(stats) - if x_val is not None: - stats_val = self.Estep(D_val) - elbo_val[epoch] = self.elbo(stats_val) - - if use_ml: - self.MstepML(stats) - if use_md and (md_epochs is None or epoch in md_epochs): - self.MstepMD(stats) - - elbo_norm = elbo / np.sum(D[0]) - if x_val is None: - return elbo, elbo_norm - else: - elbo_val_norm = elbo_val / np.sum(D_val[0]) - return elbo, elbo_norm, elbo_val, elbo_val_norm - - @abstractmethod - def Estep(self, x): - pass - - @abstractmethod - def MstepML(self, x): - pass - - @abstractmethod - def MstepMD(self, x): - pass - - @abstractmethod - def llr_1vs1(self, x1, x2): - pass - - @abstractmethod - def llr_NvsM_book(self, D1, D2): - pass - - def fit_adapt_weighted_avg_model( - self, - x, - class_ids=None, - ptheta=None, - sample_weight=None, - x_val=None, - class_ids_val=None, - ptheta_val=None, - sample_weight_val=None, - epochs=20, - ml_md="ml+md", - 
md_epochs=None, - plda0=None, - w_mu=1, - w_B=0.5, - w_W=0.5, - ): - - assert self.is_init - use_ml = False if ml_md == "md" else True - use_md = False if ml_md == "ml" else True - - assert not (class_ids is None and ptheta is None) - if class_ids is None: - D = self.compute_stats_soft(x, ptheta) - else: - D = self.compute_stats_hard(x, class_ids) - - if x_val is not None: - assert not (class_ids_val is None and ptheta_val is None) - if class_ids_val is None: - D_val = self.compute_stats_soft(x_val, ptheta_val) - else: - D_val = self.compute_stats_hard(x_val, class_ids_val) - - elbo = np.zeros((epochs,), dtype=float_cpu()) - elbo_val = np.zeros((epochs,), dtype=float_cpu()) - for epoch in range(epochs): - - stats = self.Estep(D) - elbo[epoch] = self.elbo(stats) - if x_val is not None: - stats_val = self.Estep(D_val) - elbo_val[epoch] = self.elbo(stats_val) - - if use_ml: - self.MstepML(stats) - if use_md and (md_epochs is None or epoch in md_epochs): - self.MstepMD(stats) - - self.weighted_avg_model(plda0, w_mu, w_B, w_W) - - elbo_norm = elbo / np.sum(D[0]) - if x_val is None: - return elbo, elbo_norm - else: - elbo_val_norm = elbo_val / np.sum(D_val[0]) - return elbo, elbo_norm, elbo_val, elbo_val_norm - - def fit_adapt( - self, - x, - class_ids=None, - ptheta=None, - sample_weight=None, - x0=None, - class_ids0=None, - ptheta0=None, - sample_weight0=None, - x_val=None, - class_ids_val=None, - ptheta_val=None, - sample_weight_val=None, - epochs=20, - ml_md="ml+md", - md_epochs=None, - ): - - assert self.is_init - use_ml = False if ml_md == "md" else True - use_md = False if ml_md == "ml" else True - - assert not (class_ids is None and ptheta is None) - if class_ids is None: - D = self.compute_stats_soft(x, ptheta) - else: - D = self.compute_stats_hard(x, class_ids) - - if x0 is not None: - assert not (class_ids0 is None and ptheta0 is None) - if class_ids0 is None: - D0 = self.compute_stats_soft(x0, ptheta0) - else: - D0 = self.compute_stats_hard(x0, class_ids0) - - if x_val is not None: - assert not (class_ids_val is None and ptheta_val is None) - if class_ids_val is None: - D_val = self.compute_stats_soft(x_val, ptheta_val) - else: - D_val = self.compute_stats_hard(x_val, class_ids_val) - - elbo = np.zeros((epochs,), dtype=float_cpu()) - elbo_val = np.zeros((epochs,), dtype=float_cpu()) - for epoch in range(epochs): - - stats = self.Estep(D) - stats0 = self.Estep(D0) - elbo[epoch] = self.elbo(stats) - if x_val is not None: - stats_val = self.Estep(D_val) - elbo_val[epoch] = self.elbo(stats_val) - - if use_ml: - self.MstepML(stats) - if use_md and (md_epochs is None or epoch in md_epochs): - self.MstepMD(stats) - - elbo_norm = elbo / np.sum(D[0]) - if x_val is None: - return elbo, elbo_norm - else: - elbo_val_norm = elbo_val / np.sum(D_val[0]) - return elbo, elbo_norm, elbo_val, elbo_val_norm - - @staticmethod - def compute_stats_soft(x, p_theta, sample_weight=None, scal_factor=None): - if sample_weight is not None: - p_theta = sample_weight[:, None] * p_theta - if scal_factor is not None: - p_theta *= scal_factor - N = np.sum(p_theta, axis=0) - F = np.dot(p_theta.T, x) - wx = np.sum(p_theta, axis=1, keepdims=True) * x - S = np.dot(x.T, wx) - return N, F, S - - @staticmethod - def compute_stats_hard(x, class_ids, sample_weight=None, scale_factor=None): - x_dim = x.shape[1] - num_classes = np.max(class_ids) + 1 - N = np.zeros((num_classes,), dtype=float_cpu()) - F = np.zeros((num_classes, x_dim), dtype=float_cpu()) - if sample_weight is not None: - wx = sample_weight[:, None] * x - else: - 
wx = x - - for i in range(num_classes): - idx = class_ids == i - if sample_weight is None: - N[i] = np.sum(idx).astype(float_cpu()) - F[i] = np.sum(x[idx], axis=0) - else: - N[i] = np.sum(sample_weight[idx]) - F[i] = np.sum(wx[idx], axis=0) - - S = np.dot(x.T, wx) - if scale_factor is not None: - N *= scale_factor - F *= scale_factor - S *= scale_factor - - return N, F, S - - @staticmethod - def compute_stats_hard_v0(x, class_ids, sample_weight=None, scal_factor=None): - x_dim = x.shape[1] - num_classes = np.max(class_ids) + 1 - p_theta = np.zeros((x.shape[0], num_classes), dtype=float_cpu()) - p_theta[np.arange(x.shape[0]), class_ids] = 1 - return PLDABase.compute_stats_soft(x, p_theta, sample_weight, scal_factor) - - @staticmethod - def center_stats(D, mu): - N, F, S = D - Fc = F - np.outer(N, mu) - Fmu = np.outer(np.sum(F, axis=0), mu) - Sc = S - Fmu - Fmu.T + np.sum(N) * np.outer(mu, mu) - return N, Fc, Sc - - def llr_NvsM(self, x1, x2, ids1=None, ids2=None, method="vavg-lnorm"): - if method == "savg": - return self.llr_NvsM_savg(x1, ids1, x2, ids2) - - D1 = x1 if ids1 is None else self.compute_stats_hard(x1, class_ids=ids1) - D2 = x2 if ids2 is None else self.compute_stats_hard(x2, class_ids=ids2) - - if method == "book": - return self.llr_NvsM_book(D1, D2) - if method == "vavg": - return self.llr_NvsM_vavg(D1, D2, do_lnorm=False) - if method == "vavg-lnorm": - return self.llr_NvsM_vavg(D1, D2, do_lnorm=True) - - def llr_NvsM_vavg(self, D1, D2, do_lnorm=True): - x1 = D1[1] / np.expand_dims(D1[0], axis=-1) - x2 = D2[1] / np.expand_dims(D2[0], axis=-1) - if do_lnorm: - lnorm = LNorm() - x1 = lnorm.predict(x1) - x2 = lnorm.predict(x2) - - return self.llr_1vs1(x1, x2) - - def llr_NvsM_savg(self, x1, ids1, x2, ids2): - scores_1vs1 = self.llr_1vs1(x1, x2) - N, F, _ = self.compute_stats_hard(scores_1vs1, ids1) - scores_Nvs1 = F / N[:, None] - N, F, _ = self.compute_stats_hard(scores_Nvs1.T, ids2) - scores = F.T / N - return scores - - def llr_Nvs1(self, x1, x2, ids1=None, method="vavg-lnorm"): - if method == "savg": - return self.llr_Nvs1_savg(x1, ids1, x2) - - D1 = x1 if ids1 is None else self.compute_stats_hard(x1, class_ids=ids1) - - if method == "book": - D2 = self.compute_stats_hard(x2, np.arange(x2.shape[0])) - return self.llr_NvsM_book(D1, D2) - if method == "vavg": - return self.llr_Nvs1_vavg(D1, x2, do_lnorm=False) - if method == "vavg-lnorm": - return self.llr_Nvs1_vavg(D1, x2, do_lnorm=True) - - def llr_Nvs1_vavg(self, D1, x2, do_lnorm=True): - x1 = D1[1] / np.expand_dims(D1[0], axis=-1) - if do_lnorm: - lnorm = LNorm() - x1 = lnorm.predict(x1) - x2 = lnorm.predict(x2) - - return self.llr_1vs1(x1, x2) - - def llr_Nvs1_savg(self, x1, ids1, x2): - scores_1vs1 = self.llr_1vs1(x1, x2) - N, F, _ = self.compute_stats_hard(scores_1vs1, ids1) - scores = F / N[:, None] - return scores - - @abstractmethod - def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): - pass - - def get_config(self): - config = {"y_dim": self.y_dim, "update_mu": self.update_mu} - base_config = super(PLDABase, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def weigthed_avg_params(self, mu, w_mu): - self.mu = w_mu * mu + (1 - w_mu) * self.mu - - @abstractmethod - def weigthed_avg_model(self, plda): - pass diff --git a/hyperion/pdfs/plda/splda.py b/hyperion/pdfs/plda/splda.py deleted file mode 100644 index 1ffaaa1c..00000000 --- a/hyperion/pdfs/plda/splda.py +++ /dev/null @@ -1,431 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus 
Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import numpy as np -from scipy import linalg as sla - -from ...hyp_defs import float_cpu -from ...utils.math import invert_pdmat, invert_trimat, logdet_pdmat -from .plda_base import PLDABase - - -class SPLDA(PLDABase): - def __init__( - self, - y_dim=None, - mu=None, - V=None, - W=None, - fullcov_W=True, - update_mu=True, - update_V=True, - update_W=True, - **kwargs - ): - super().__init__(y_dim=y_dim, mu=mu, update_mu=update_mu, **kwargs) - if V is not None: - self.y_dim = V.shape[0] - self.V = V - self.W = W - self.fullcov_W = fullcov_W - self.update_V = update_V - self.update_W = update_W - - def validate(self): - assert self.mu.shape[0] >= self.V.shape[0] - assert self.mu.shape[0] == self.V.shape[1] - assert self.mu.shape[0] == self.W.shape[0] - assert self.mu.shape[0] == self.W.shape[1] - - @property - def is_init(self): - if self._is_init: - return True - if self.mu is not None and self.V is not None and self.W is not None: - self.validate() - self._is_init = True - return self._is_init - - def initialize(self, D): - N, F, S = D - self.x_dim = F.shape[1] - M = F.shape[0] - N_tot = np.sum(N) - - Vytilde = F / N[:, None] - mu = np.mean(Vytilde, axis=0) - - Vy = Vytilde - mu - U, s, Vt = sla.svd(Vy, full_matrices=False, overwrite_a=True) - V = s[: self.y_dim, None] * Vt[: self.y_dim, :] - NVytilde = N[:, None] * Vytilde - C = (S - np.dot(NVytilde.T, Vytilde)) / N_tot - if self.fullcov_W: - W = invert_pdmat(C, return_inv=True)[-1] - else: - W = 1 / np.diag(C) - - self.mu = mu - self.V = V - self.W = W - - def compute_py_g_x( - self, D, return_cov=False, return_logpy_0=False, return_acc=False - ): - N, F, S = D - Fc = F - self.mu - - M = F.shape[0] - y_dim = self.y_dim - - WV = np.dot(self.W, self.V.T) - VV = np.dot(self.V, WV) - - compute_inv = return_cov or return_acc - return_tuple = compute_inv or return_logpy_0 - - N_is_int = False - if np.all(np.ceil(N) == N): - N_is_int = True - - I = np.eye(y_dim, dtype=float_cpu()) - gamma = np.dot(Fc, WV) - if N_is_int: - iterator = np.unique(N) - else: - iterator = range(M) - - y = np.zeros((M, y_dim), dtype=float_cpu()) - if return_cov: - Sigma_y = np.zeros((M, y_dim, y_dim), dtype=float_cpu()) - else: - Sigma_y = None - - if return_logpy_0: - logpy = -0.5 * y_dim * np.log(2 * np.pi) * np.ones((M,), dtype=float_cpu()) - - if return_acc: - Py = np.zeros((y_dim, y_dim), dtype=float_cpu()) - Ry = np.zeros((y_dim, y_dim), dtype=float_cpu()) - - for k in iterator: - if N_is_int: - i = (N == k).nonzero()[0] - N_i = k - M_i = len(i) - else: - i = k - N_i = N[k] - M_i = 1 - - L_i = I + N_i * VV - r = invert_pdmat( - L_i, - right_inv=True, - return_logdet=return_logpy_0, - return_inv=compute_inv, - ) - - mult_iL = r[0] - if return_logpy_0: - logL = r[2] - if compute_inv: - iL = r[-1] - - y[i, :] = mult_iL(gamma[i, :]) - - if return_cov: - Sigma_y[i, :, :] = iL - - if return_logpy_0: - logpy[i] += 0.5 * (logL - np.sum(y[i, :] * gamma[i, :], axis=-1)) - - if return_acc: - Py += M_i * iL - Ry += N_i * M_i * iL - - if not return_tuple: - return y - - r = [y] - if return_cov: - r += [Sigma_y] - if return_logpy_0: - r += [logpy] - if return_acc: - r += [Ry, Py] - return tuple(r) - - def Estep(self, D): - N, F, S = D - y, logpy, Ry, Py = self.compute_py_g_x(D, return_logpy_0=True, return_acc=True) - - M = F.shape[0] - N_tot = np.sum(N) - F_tot = np.sum(F, axis=0) - - y_acc = np.sum(y, axis=0) - Cy = np.dot(F.T, y) - - Niy = y * N[:, None] - Ry1 = np.sum(Niy, axis=0) - Ry += 
np.dot(Niy.T, y) - Py += np.dot(y.T, y) - - logpy_acc = np.sum(logpy) - - stats = (N_tot, M, F_tot, S, logpy_acc, y_acc, Ry1, Ry, Cy, Py) - return stats - - def elbo(self, stats): - N, M, F, S, logpy_x = stats[:5] - - logW = logdet_pdmat(self.W) - Fmu = np.outer(F, self.mu) - Shat = S - Fmu - Fmu.T + N * np.outer(self.mu, self.mu) - - logpx_y = 0.5 * ( - -N * self.x_dim * np.log(2 * np.pi) - + N * logW - - np.inner(self.W.ravel(), Shat.ravel()) - ) - logpy = -0.5 * M * self.y_dim * np.log(2 * np.pi) - - elbo = logpx_y + logpy - logpy_x - return elbo - - def MstepML(self, stats): - N, M, F, S, _, y_acc, Ry1, Ry, Cy, Py = stats - - a = np.hstack((Ry, Ry1[:, None])) - b = np.hstack((Ry1, N)) - Rytilde = np.vstack((a, b)) - - Cytilde = np.hstack((Cy, F[:, None])) - - if self.update_mu and not self.update_V: - self.mu = (F - np.dot(Ry1, self.V)) / N - - if not self.update_mu and self.update_V: - iRy_mult = invert_pdmat(Ry, right_inv=False)[0] - self.V = iRy_mult(Cy.T - np.outer(Ry1, self.mu)) - - if self.update_mu and self.update_V: - iRytilde_mult = invert_pdmat(Rytilde, right_inv=False)[0] - Vtilde = iRytilde_mult(Cytilde.T) - self.V = Vtilde[:-1, :] - self.mu = Vtilde[-1, :] - - if self.update_W: - if self.update_mu and self.update_V: - iW = (S - np.dot(Cy, self.V) - np.outer(F, self.mu)) / N - else: - Vtilde = np.vstack((self.V, self.mu)) - CVt = np.dot(Cytilde, Vtilde) - iW = (S - CVt - CVt.T + np.dot(np.dot(Vtilde.T, Rytilde), Vtilde)) / N - if self.fullcov_W: - self.W = invert_pdmat(iW, return_inv=True)[-1] - else: - self.W = np.diag(1 / np.diag(iW)) - - def MstepMD(self, stats): - N, M, F, S, _, y_acc, Ry1, Ry, Cy, Py = stats - mu_y = y_acc / M - - if self.update_mu: - self.mu += np.dot(mu_y, self.V) - - if self.update_V: - Cov_y = Py / M - np.outer(mu_y, mu_y) - chol_Cov_y = sla.cholesky(Cov_y, lower=False, overwrite_a=True) - self.V = np.dot(chol_Cov_y, self.V) - - def get_config(self): - config = { - "update_W": self.update_W, - "update_V": self.update_V, - "fullcov_W": self.fullcov_W, - } - base_config = super(SPLDA, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - params = {"mu": self.mu, "V": self.V, "W": self.W} - self._save_params_from_dict(f, params) - - @classmethod - def load_params(cls, f, config): - param_list = ["mu", "V", "W"] - params = cls._load_params_to_dict(f, config["name"], param_list) - kwargs = dict(list(config.items()) + list(params.items())) - return cls(**kwargs) - - def log_probx_g_y(self, x, y): - logW = logdet_pdmat(self.W) - delta = x - self.mu - np.dot(y, self.V) - logp = ( - -x.shape[-1] * np.log(2 * np.pi) - + logW - - np.sum(np.dot(delta, self.W) * delta, axis=-1) - ) - logp /= 2 - return logp - - def llr_1vs1(self, x1, x2): - - WV = np.dot(self.W, self.V.T) - VV = np.dot(self.V, WV) - I = np.eye(self.y_dim, dtype=float_cpu()) - - Lnon = I + VV - mult_icholLnon, logcholLnon = invert_trimat( - sla.cholesky(Lnon, lower=False, overwrite_a=True), - right_inv=True, - return_logdet=True, - )[:2] - logLnon = 2 * logcholLnon - - Ltar = I + 2 * VV - mult_icholLtar, logcholLtar = invert_trimat( - sla.cholesky(Ltar, lower=False, overwrite_a=True), - right_inv=True, - return_logdet=True, - )[:2] - logLtar = 2 * logcholLtar - - VWF1 = np.dot(x1 - self.mu, WV) - VWF2 = np.dot(x2 - self.mu, WV) - - gamma_non_1 = mult_icholLnon(VWF1) - gamma_non_2 = mult_icholLnon(VWF2) - - Qnon_1 = np.sum(gamma_non_1 * gamma_non_1, axis=1)[:, None] - Qnon_2 = np.sum(gamma_non_2 * gamma_non_2, axis=1) - - gamma_tar_1 = 
mult_icholLtar(VWF1) - gamma_tar_2 = mult_icholLtar(VWF2) - - Qtar_1 = np.sum(gamma_tar_1 * gamma_tar_1, axis=1)[:, None] - Qtar_2 = np.sum(gamma_tar_2 * gamma_tar_2, axis=1) - - scores = 2 * np.dot(gamma_tar_1, gamma_tar_2.T) - scores += Qtar_1 - Qnon_1 + Qtar_2 - Qnon_2 - scores += 2 * logLnon - logLtar - scores *= 0.5 - return scores - - def llr_NvsM_book(self, D1, D2): - N1, F1, _ = D1 - N2, F2, _ = D2 - - WV = np.dot(self.W, self.V.T) - VV = np.dot(self.V, WV) - I = np.eye(self.y_dim, dtype=float_cpu()) - - F1 -= N1[:, None] * self.mu - F2 -= N2[:, None] * self.mu - - scores = np.zeros((len(N1), len(N2)), dtype=float_cpu()) - for N1_i in np.unique(N1): - for N2_j in np.unique(N2): - i = np.where(N1 == N1_i)[0] - j = np.where(N2 == N2_j)[0] - L1 = I + N1_i * VV - mult_icholL1, logcholL1 = invert_trimat( - sla.cholesky(L1, lower=False, overwrite_a=True), - right_inv=True, - return_logdet=True, - )[:2] - logL1 = 2 * logcholL1 - - L2 = I + N2_j * VV - mult_icholL2, logcholL2 = invert_trimat( - sla.cholesky(L2, lower=False, overwrite_a=True), - right_inv=True, - return_logdet=True, - )[:2] - logL2 = 2 * logcholL2 - - Ltar = I + (N1_i + N2_j) * VV - mult_icholLtar, logcholLtar = invert_trimat( - sla.cholesky(Ltar, lower=False, overwrite_a=True), - right_inv=True, - return_logdet=True, - )[:2] - logLtar = 2 * logcholLtar - - VWF1 = np.dot(F1[i, :], WV) - VWF2 = np.dot(F2[j, :], WV) - - gamma_non_1 = mult_icholL1(VWF1) - gamma_non_2 = mult_icholL2(VWF2) - - Qnon_1 = np.sum(gamma_non_1 * gamma_non_1, axis=1)[:, None] - Qnon_2 = np.sum(gamma_non_2 * gamma_non_2, axis=1) - - gamma_tar_1 = mult_icholLtar(VWF1) - gamma_tar_2 = mult_icholLtar(VWF2) - - Qtar_1 = np.sum(gamma_tar_1 * gamma_tar_1, axis=1)[:, None] - Qtar_2 = np.sum(gamma_tar_2 * gamma_tar_2, axis=1) - - scores_ij = 2 * np.dot(gamma_tar_1, gamma_tar_2.T) - scores_ij += Qtar_1 - Qnon_1 + Qtar_2 - Qnon_2 - scores_ij += logL1 + logL2 - logLtar - scores[np.ix_(i, j)] = scores_ij - - scores *= 0.5 - return scores - - def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): - if rng is None: - rng = np.random.RandomState(seed=seed) - - Sw = invert_pdmat(self.W, return_inv=True)[-1] - chol_Sw = sla.cholesky(Sw, lower=False) - - x_dim = self.mu.shape[0] - z = rng.normal(size=(num_classes * num_samples_per_class, x_dim)).astype( - dtype=float_cpu(), copy=False - ) - z = np.dot(z, chol_Sw) - y = rng.normal(size=(num_classes, self.y_dim)).astype( - dtype=float_cpu(), copy=False - ) - y = np.dot(y, self.V) + self.mu - y = np.repeat(y, num_samples_per_class, axis=0) - - return y + z - - def weighted_avg_params(self, mu, V, W, w_mu, w_B, w_W): - super(SPLDA, self).weigthed_avg_params(mu, w_mu) - if w_B > 0: - Sb0 = np.dot(self.V.T, self.V) - Sb = np.dot(V.T, V) - Sb = w_B * Sb + (1 - w_B) * Sb0 - w, V = sla.eigh(Sb, overwrite_a=True) - w = w[-self.y_dim :] - V = np.sqrt(w) * V[:, -self.y_dim :] - self.V = V.T - - if w_W > 0: - Sw0 = invert_pdmat(self.W, return_inv=True)[-1] - Sw = invert_pdmat(W, return_inv=True)[-1] - Sw = w_W * Sw + (1 - w_W) * Sw0 - self.W = invert_pdmat(Sw, return_inv=True)[-1] - - def weighted_avg_model(self, plda, w_mu, w_B, w_W): - self.weighted_avg_params(plda.mu, plda.V, plda.W, w_mu, w_B, w_W) - - def project(self, T, delta_mu=None): - mu = self.mu - if mu is not None: - mu -= delta_mu - mu = np.dot(mu, T) - V = np.dot(self.V, T) - Sw = invert_pdmat(self.W, return_inv=True)[-1] - Sw = np.dot(T.T, np.dot(Sw, T)) - W = invert_pdmat(Sw, return_inv=True)[-1] - - return SPLDA(mu=mu, V=V, W=W, fullcov_W=True) 
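The `FRPLDA`/`SPLDA` classes deleted above implement by-the-book PLDA log-likelihood-ratio scoring; like `hyperion.metrics` earlier in this diff, they presumably move under the `hyperion.np` namespace rather than being dropped outright. For readers tracking the refactor, here is a minimal NumPy/SciPy sketch of the two-covariance scoring rule that the removed `FRPLDA.llr_1vs1` computed; the standalone function and its name are illustrative, not part of the toolkit API.

```python
import numpy as np
from scipy import linalg as sla


def two_cov_plda_llr_1vs1(x1, x2, mu, B, W):
    """LLR scores for 1-vs-1 trials under the two-covariance PLDA model.

    x1: (n1, d) enrollment vectors, x2: (n2, d) test vectors,
    mu: (d,) global mean, B / W: (d, d) between/within-class precisions.
    Returns an (n1, n2) matrix of log-likelihood ratios.
    """

    def logdet(L):
        # log|L| = 2 * sum(log(diag(R))) with L = R^T R, R the upper Cholesky factor
        return 2.0 * np.sum(np.log(np.diag(sla.cholesky(L))))

    def whiten(F, L):
        # right-multiply the rows of F by R^{-1}; each resulting row g
        # then satisfies g g^T = f L^{-1} f^T
        R = sla.cholesky(L)  # upper triangular, L = R^T R
        return sla.solve_triangular(R, F.T, trans="T").T

    Lnon = B + W  # posterior precision given a single observation
    Ltar = B + 2 * W  # posterior precision when x1 and x2 share a class

    WF1, WF2, Bmu = x1 @ W, x2 @ W, B @ mu

    g_non1, g_non2 = whiten(WF1 + Bmu, Lnon), whiten(WF2 + Bmu, Lnon)
    g_tar1, g_tar2 = whiten(WF1 + 0.5 * Bmu, Ltar), whiten(WF2 + 0.5 * Bmu, Ltar)

    Q_non1 = np.sum(g_non1**2, axis=1)[:, None]
    Q_non2 = np.sum(g_non2**2, axis=1)
    Q_tar1 = np.sum(g_tar1**2, axis=1)[:, None]
    Q_tar2 = np.sum(g_tar2**2, axis=1)

    scores = 2.0 * g_tar1 @ g_tar2.T + Q_tar1 - Q_non1 + Q_tar2 - Q_non2
    scores += 2.0 * logdet(Lnon) - logdet(Ltar) - logdet(B) + mu @ B @ mu
    return 0.5 * scores
```

The multi-session `llr_NvsM_book` variant deleted alongside it applies the same identity to per-class sufficient statistics, replacing the posterior precisions by B + N_i * W for N_i enrollment or test sessions.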
diff --git a/hyperion/pipeline/pipeline.py b/hyperion/pipeline/pipeline.py deleted file mode 100644 index 6b8076f5..00000000 --- a/hyperion/pipeline/pipeline.py +++ /dev/null @@ -1,63 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import logging -import numpy as np -import h5py - -from ..hyp_model import HypModel - -from ..transforms import * - - -class Pipeline(HypModel): - """Class to process a series of models.""" - - def __init__(self, transforms, **kwargs): - super(Pipeline, self).__init__(**kwargs) - if not isinstance(transforms, list): - transforms = [transforms] - self.transforms = transforms - if transforms is not None: - self.update_names() - - def append(self, t): - self.transforms.append(t) - if self.name is not None: - t.name = self.name + "/" + t.name - - def predict(self, x): - for t in self.transforms: - x = t.predict(x) - return x - - def update_names(self): - if self.name is not None: - for t in self.transforms: - t.name = self.name + "/" + t.name - - def get_config(self): - config = super(Pipeline, self).get_config() - config_t = {} - for i in range(len(self.transforms)): - config_t[i] = self.transforms[i].get_config() - config["transforms"] = config_t - return config - - def save_params(self, f): - for t in self.transforms: - t.save_params(f) - - @classmethod - def load_params(cls, f, config): - config_ts = config["transforms"] - transforms = [] - for i in range(len(config_ts)): - config_t = config_ts[str(i)] - logging.debug(config_t) - class_t = globals()[config_t["class_name"]] - t = class_t.load_params(f, config_t) - transforms.append(t) - return cls(transforms, name=config["name"]) diff --git a/hyperion/score_norm/__init__.py b/hyperion/score_norm/__init__.py deleted file mode 100644 index b0eb8000..00000000 --- a/hyperion/score_norm/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - - -from .t_norm import TNorm -from .z_norm import ZNorm -from .zt_norm import ZTNorm -from .tz_norm import TZNorm -from .s_norm import SNorm -from .adapt_s_norm import AdaptSNorm diff --git a/hyperion/score_norm/adapt_s_norm.py b/hyperion/score_norm/adapt_s_norm.py deleted file mode 100644 index 3f1a47c7..00000000 --- a/hyperion/score_norm/adapt_s_norm.py +++ /dev/null @@ -1,88 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - - -import numpy as np -import h5py - -from .score_norm import ScoreNorm - - -class AdaptSNorm(ScoreNorm): - """Class for adaptive S-Norm""" - - def __init__(self, nbest=100, nbest_discard=0, **kwargs): - super(AdaptSNorm, self).__init__(*kwargs) - self.nbest = nbest - self.nbest_discard = nbest_discard - - def predict( - self, - scores, - scores_coh_test, - scores_enr_coh, - mask_coh_test=None, - mask_enr_coh=None, - ): - - assert scores_enr_coh.shape[1] == scores_coh_test.shape[0] - assert self.nbest_discard < scores_enr_coh.shape[1] - if self.nbest > scores_enr_coh.shape[1] - self.nbest_discard: - nbest = scores_enr_coh.shape[1] - self.nbest_discard - else: - nbest = self.nbest - - if mask_coh_test is not None: - scores_coh_test[mask_coh_test == False] = 0 - if mask_enr_coh is not None: - scores_enr_coh[mask_enr_coh == False] = 0 - - best_idx = np.flipud(np.argsort(scores_coh_test, axis=0))[ - self.nbest_discard : self.nbest_discard + nbest - ] 
- scores_z_norm = np.zeros_like(scores) - for i in range(scores.shape[1]): - best_idx_i = best_idx[:, i] - - mu_z = np.mean(scores_enr_coh[:, best_idx_i], axis=1, keepdims=True) - - if mask_enr_coh is None: - s_z = np.std(scores_enr_coh[:, best_idx_i], axis=1, keepdims=True) - else: - norm = np.mean(mask_enr_coh[:, best_idx_i], axis=1, keepdims=True) - mu_z /= norm - s_z = np.sqrt( - np.mean(scores_enr_coh[:, best_idx_i] ** 2, axis=1, keepdims=True) - / norm - - mu_z ** 2 - ) - - s_z = np.clip(s_z, a_min=1e-5, a_max=None) - scores_z_norm[:, i] = (scores[:, i] - mu_z.T) / s_z.T - - best_idx = np.fliplr(np.argsort(scores_enr_coh, axis=1))[ - :, self.nbest_discard : self.nbest_discard + nbest - ] - scores_t_norm = np.zeros_like(scores) - for i in range(scores.shape[0]): - best_idx_i = best_idx[i] - - mu_z = np.mean(scores_coh_test[best_idx_i, :], axis=0, keepdims=True) - - if mask_coh_test is None: - s_z = np.std(scores_coh_test[best_idx_i, :], axis=0, keepdims=True) - else: - norm = np.mean(mask_coh_test[best_idx_i, :], axis=0, keepdims=True) - mu_z /= norm - s_z = np.sqrt( - np.mean(scores_coh_test[best_idx_i, :] ** 2, axis=0, keepdims=True) - / norm - - mu_z ** 2 - ) - - s_z = np.clip(s_z, a_min=1e-5, a_max=None) - scores_t_norm[i, :] = (scores[i, :] - mu_z) / s_z - - return (scores_z_norm + scores_t_norm) / np.sqrt(2) diff --git a/hyperion/score_norm/s_norm.py b/hyperion/score_norm/s_norm.py deleted file mode 100644 index ee00a7e8..00000000 --- a/hyperion/score_norm/s_norm.py +++ /dev/null @@ -1,34 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np -import h5py - -from .score_norm import ScoreNorm -from .t_norm import TNorm -from .z_norm import ZNorm - - -class SNorm(ScoreNorm): - """Class for S-Norm, symmetric score normalization.""" - - def __init__(self, **kwargs): - super(SNorm, self).__init__(*kwargs) - self.t_norm = TNorm(**kwargs) - self.z_norm = ZNorm(**kwargs) - - def predict( - self, - scores, - scores_coh_test, - scores_enr_coh, - mask_coh_test=None, - mask_enr_coh=None, - ): - - scores_z_norm = self.z_norm.predict(scores, scores_enr_coh, mask_enr_coh) - scores_t_norm = self.t_norm.predict(scores, scores_coh_test, mask_coh_test) - - return (scores_z_norm + scores_t_norm) / np.sqrt(2) diff --git a/hyperion/score_norm/score_norm.py b/hyperion/score_norm/score_norm.py deleted file mode 100644 index f20a0b98..00000000 --- a/hyperion/score_norm/score_norm.py +++ /dev/null @@ -1,18 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np - -from ..hyp_model import HypModel - - -class ScoreNorm(HypModel): - """ - Base class for score normalization - """ - - def __init__(self, std_floor=1e-5, **kwargs): - super(ScoreNorm, self).__init__(*kwargs) - self.std_floor = std_floor diff --git a/hyperion/score_norm/t_norm.py b/hyperion/score_norm/t_norm.py deleted file mode 100644 index 3fb92548..00000000 --- a/hyperion/score_norm/t_norm.py +++ /dev/null @@ -1,31 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np -import h5py - -from .score_norm import ScoreNorm - - -class TNorm(ScoreNorm): - """Class for T-Norm score normalization.""" - - def predict(self, scores, scores_coh_test, mask=None): - - if mask is None: - mu_t = np.mean(scores_coh_test, axis=0, 
keepdims=True) - s_t = np.std(scores_coh_test, axis=0, keepdims=True) - else: - scores_coh_test[mask == False] = 0 - n_t = np.mean(mask, axis=0, keepdims=True) - mu_t = np.mean(scores_coh_test, axis=0, keepdims=True) / n_t - s_t = np.sqrt( - np.mean(scores_coh_test ** 2, axis=0, keepdims=True) / n_t - mu_t ** 2 - ) - - s_t[s_t < self.std_floor] = self.std_floor - - scores_norm = (scores - mu_t) / s_t - return scores_norm diff --git a/hyperion/score_norm/tz_norm.py b/hyperion/score_norm/tz_norm.py deleted file mode 100644 index d4bb1539..00000000 --- a/hyperion/score_norm/tz_norm.py +++ /dev/null @@ -1,40 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np - -from .score_norm import ScoreNorm -from .t_norm import TNorm -from .z_norm import ZNorm - - -class TZNorm(ScoreNorm): - """Class for TZ-Norm score normalization.""" - - def __init__(self, **kwargs): - super(SNorm, self).__init__(*kwargs) - self.t_norm = TNorm(**kwargs) - self.z_norm = ZNorm(**kwargs) - - def predict( - self, - scores, - scores_coh_test, - scores_enr_coh, - scores_coh_coh, - mask_coh_test=None, - mask_enr_coh=None, - mask_coh_coh=None, - ): - - scores_t_norm = self.t_norm.predict(scores, scores_coh_test, mask_coh_test) - scores_enr_coh_t_norm = self.t_norm.predict( - scores_enr_coh, scores_coh_coh, mask_coh_coh - ) - scores_tz_norm = self.z_norm.predict( - scores_t_norm, scores_enr_coh_t_norm, mask_enr_coh - ) - - return scores_tz_norm diff --git a/hyperion/score_norm/z_norm.py b/hyperion/score_norm/z_norm.py deleted file mode 100644 index f5350fb1..00000000 --- a/hyperion/score_norm/z_norm.py +++ /dev/null @@ -1,32 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np - -from .score_norm import ScoreNorm - - -class ZNorm(ScoreNorm): - """ - Class for Z-Norm score normalization. 
- """ - - def predict(self, scores, scores_enr_coh, mask=None): - - if mask is None: - mu_z = np.mean(scores_enr_coh, axis=1, keepdims=True) - s_z = np.std(scores_enr_coh, axis=1, keepdims=True) - else: - scores_enr_coh[mask == False] = 0 - n_z = np.mean(mask, axis=1, keepdims=True) - mu_z = np.mean(scores_enr_coh, axis=1, keepdims=True) / n_z - s_z = np.sqrt( - np.mean(scores_enr_coh ** 2, axis=1, keepdims=True) / n_z - mu_z ** 2 - ) - - s_z[s_z < self.std_floor] = self.std_floor - - scores_norm = (scores - mu_z) / s_z - return scores_norm diff --git a/hyperion/score_norm/zt_norm.py b/hyperion/score_norm/zt_norm.py deleted file mode 100644 index 4c5c8b5c..00000000 --- a/hyperion/score_norm/zt_norm.py +++ /dev/null @@ -1,41 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np -import h5py - -from .score_norm import ScoreNorm -from .t_norm import TNorm -from .z_norm import ZNorm - - -class ZTNorm(ScoreNorm): - """Class ZT-Norm score-normalization.""" - - def __init__(self, **kwargs): - super(SNorm, self).__init__(*kwargs) - self.t_norm = TNorm(**kwargs) - self.z_norm = ZNorm(**kwargs) - - def predict( - self, - scores, - scores_coh_test, - scores_enr_coh, - scores_coh_coh, - mask_coh_test=None, - mask_enr_coh=None, - mask_coh_coh=None, - ): - - scores_z_norm = self.z_norm.predict(scores, scores_enr_coh, mask_enr_coh) - scores_coh_test_z_norm = self.z_norm.predict( - scores_coh_test, scores_coh_coh, mask_enr_coh - ) - scores_zt_norm = self.t_norm.predict( - scores_z_norm, scores_coh_test_z_norm, mask_coh_test - ) - - return scores_zt_norm diff --git a/hyperion/torch/__init__.py b/hyperion/torch/__init__.py index 8fade929..41745d38 100644 --- a/hyperion/torch/__init__.py +++ b/hyperion/torch/__init__.py @@ -3,21 +3,6 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -# - -# from . import utils -# from . import loggers -# from . import metrics -# from . import lr_schedulers -# from . import data -# from . import layers -# from . import layer_blocks -# from . import narchs -# from . import trainers -# from . import transforms -# from . import adv_attacks -# from . import helpers -# from . 
import seq_embed from .torch_model import TorchModel from .torch_model_loader import TorchModelLoader diff --git a/hyperion/torch/adv_attacks/__init__.py b/hyperion/torch/adv_attacks/__init__.py index 906b8740..5fda4ac9 100644 --- a/hyperion/torch/adv_attacks/__init__.py +++ b/hyperion/torch/adv_attacks/__init__.py @@ -3,14 +3,13 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .fgsm_attack import FGSMAttack -from .snr_fgsm_attack import SNRFGSMAttack -from .rand_fgsm_attack import RandFGSMAttack -from .iter_fgsm_attack import IterFGSMAttack +from .attack_factory import AttackFactory +from .carlini_wagner_l0 import CarliniWagnerL0 from .carlini_wagner_l2 import CarliniWagnerL2 from .carlini_wagner_linf import CarliniWagnerLInf -from .carlini_wagner_l0 import CarliniWagnerL0 +from .fgsm_attack import FGSMAttack +from .iter_fgsm_attack import IterFGSMAttack from .pgd_attack import PGDAttack - -from .attack_factory import AttackFactory +from .rand_fgsm_attack import RandFGSMAttack from .random_attack_factory import RandomAttackFactory +from .snr_fgsm_attack import SNRFGSMAttack diff --git a/hyperion/torch/adv_attacks/art_attack_factory.py b/hyperion/torch/adv_attacks/art_attack_factory.py index e09c62ff..678470f5 100644 --- a/hyperion/torch/adv_attacks/art_attack_factory.py +++ b/hyperion/torch/adv_attacks/art_attack_factory.py @@ -3,14 +3,25 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser try: from art.attacks import evasion as attacks except ImportError: pass +from ...utils.misc import filter_func_args + + +def make_4d_hook(func): + def wrapper(x, *args, **kwargs): + x = x[None, None] + y = func(x, *args, **kwargs) + return y[0, 0] + + return wrapper + class ARTAttackFactory(object): @staticmethod @@ -28,11 +39,10 @@ def create( num_random_init=0, minimal=False, random_eps=False, - min_eps=None, + min_eps=1e-6, beta=0.001, theta=0.1, gamma=1.0, - etha=0.01, confidence=0.0, lr=1e-2, lr_decay=0.5, @@ -42,9 +52,12 @@ def create( max_iter=10, overshoot=1.1, num_grads=10, - c=1e-3, max_halving=5, max_doubling=5, + tau_decr_factor=0.9, + initial_c=1e-5, + largest_c=20.0, + c_incr_factor=2.0, decision_rule="EN", init_eval=100, max_eval=10000, @@ -53,31 +66,44 @@ def create( use_importance=False, abort_early=True, th=None, + es: int = 0, sigma=0.5, lambda_tv=0.3, - labmda_c=1.0, + lambda_c=1.0, lambda_s=0.5, reg=3000, kernel_size=5, eps_factor=1.1, eps_iter=10, + p_wassertein=2, conj_sinkhorn_iter=400, proj_sinkhorn_iter=400, + sub_dim: int = 10, + bin_search_tol: float = 0.1, + lambda_geoda: float = 0.6, + sigma_geoda: float = 0.0002, + lambda_fadv=0.0, + layers_fadv=[1], + thr_lowpro: float = 0.5, + lambda_lowpro: float = 1.5, + eta_lowpro: float = 0.2, + eta_lowpro_decay: float = 0.98, + eta_lowpro_min: float = 1e-7, + eta_newton: float = 0.01, targeted=False, num_samples=1, eps_scale=1, batch_size=1, ): - eps = eps * eps_scale - eps_step = eps_step * eps_scale - if min_eps is not None: + if attack_type not in ["feature-adv"]: + eps = eps * eps_scale + eps_step = eps_step * eps_scale min_eps = min_eps * eps_scale + delta = delta * eps_scale - attack_set = set( - ["fgm", "pgd", "auto-pgd", "boundary", "cw-linf", "wasserstein"] - ) - if attack_type in attack_set: + attack_l12 = set(["fgm", "pgd", "auto-pgd", "wasserstein"]) + if attack_type in attack_l12: if norm == 1: eps = eps * num_samples eps_step = eps_step * 
num_samples @@ -98,14 +124,14 @@ def create( epsilon=eps, step_adapt=step_adapt, max_iter=max_iter, - num_trials=num_trials, + num_trial=num_trial, sample_size=sample_size, init_size=init_size, min_epsilon=min_eps, ) - if attack_type == "hop-skin-jump": - return attacks.HopSkinJump( + if attack_type == "hop-skip-jump": + return attacks.HopSkipJump( model, targeted=targeted, norm=norm, @@ -132,7 +158,7 @@ def create( ) if attack_type == "deepfool": - attacks.DeepFool( + return attacks.DeepFool( model, max_iter=max_iter, epsilon=eps, @@ -141,7 +167,7 @@ def create( ) if attack_type == "elasticnet": - attacks.ElasticNet( + return attacks.ElasticNet( model, confidence=confidence, targeted=targeted, @@ -149,13 +175,25 @@ def create( binary_search_steps=binary_search_steps, max_iter=max_iter, beta=beta, - initial_const=c, + initial_const=initial_c, batch_size=batch_size, decision_rule=decision_rule, ) + if attack_type == "feature-adv": + return attacks.FeatureAdversariesPyTorch( + model, + delta=delta, + lambda_=lambda_fadv, + layer=tuple(layers_fadv), + max_iter=max_iter, + batch_size=batch_size, + step_size=eps_step, + random_start=num_random_init > 0, + ) + if attack_type == "threshold": - attacks.ThresholdAttack(model, th=th, es=es, targeted=targeted) + return attacks.ThresholdAttack(model, th=th, es=es, targeted=targeted) if attack_type == "fgm": return attacks.FastGradientMethod( @@ -193,15 +231,48 @@ def create( ) if attack_type == "auto-pgd": - return attacks.AutoProjectedGradientDescent( + if len(model.input_shape) == 1: + # autopgd only works with image kind shape + model._input_shape = (1, 1, model.input_shape[0]) + attack = attacks.AutoProjectedGradientDescent( model, norm=norm, eps=eps, eps_step=eps_step, max_iter=max_iter, targeted=targeted, - nb_random_init=num_random_init, - random_eps=random_eps, + nb_random_init=max(1, num_random_init), + batch_size=batch_size, + ) + attack.generate = make_4d_hook(attack.generate) + return attack + + if attack_type == "auto-cgd": + if len(model.input_shape) == 1: + # autopgd only works with image kind shape + model._input_shape = (1, 1, model.input_shape[0]) + attack = attacks.AutoConjugateGradient( + model, + norm=norm, + eps=eps, + eps_step=eps_step, + max_iter=max_iter, + targeted=targeted, + nb_random_init=max(1, num_random_init), + batch_size=batch_size, + ) + attack.generate = make_4d_hook(attack.generate) + return attack + + if attack_type == "geoda": + return attacks.GeoDA( + model, + norm=norm, + sub_dim=sub_dim, + max_iter=max_iter, + bin_search_tol=bin_search_tol, + lambda_param=lambda_geoda, + sigma=sigma_geoda, batch_size=batch_size, ) @@ -210,14 +281,21 @@ def create( model, theta=theta, gamma=gamma, batch_size=batch_size ) - if attack_type == "newtonfool": - return attacks.NewtonFool( - model, eta=eta, max_iter=max_iter, batch_size=batch_size + if attack_type == "low-pro-fool": + return attacks.LowProFool( + model, + n_steps=max_iter, + threshold=thr_lowpro, + lambd=lambda_lowpro, + eta=eta_lowpro, + eta_decay=eta_lowpro_decay, + eta_min=eta_lowpro_min, + norm=norm, ) - if attack_type == "threshold": + if attack_type == "newtonfool": return attacks.NewtonFool( - model, eta=eta, max_iter=max_iter, batch_size=batch_size + model, eta=eta_newton, max_iter=max_iter, batch_size=batch_size ) if attack_type == "cw-l2": @@ -227,8 +305,8 @@ def create( learning_rate=lr, binary_search_steps=binary_search_steps, max_iter=max_iter, - initial_const=c, targeted=targeted, + initial_const=initial_c, max_halving=max_halving, 
max_doubling=max_doubling, batch_size=batch_size, @@ -241,19 +319,20 @@ def create( learning_rate=lr, max_iter=max_iter, targeted=targeted, - max_halving=max_halving, - max_doubling=max_doubling, - eps=eps, + decrease_factor=tau_decr_factor, + initial_const=initial_c, + largest_const=largest_c, + const_factor=c_incr_factor, batch_size=batch_size, ) if attack_type == "zoo": - return attacks.ZooMethod( + return attacks.ZooAttack( model, confidence, learning_rate=lr, max_iter=max_iter, - initial_const=c, + initial_const=initial_c, targeted=targeted, binary_search_steps=binary_search_steps, abort_early=abort_early, @@ -265,22 +344,33 @@ def create( ) if attack_type == "shadow": - return attacks.ShadowAttack( + if len(model.input_shape) == 1: + # autopgd only works with image kind shape + model._input_shape = (1, 1, model.input_shape[0]) + + attack = attacks.ShadowAttack( model, sigma=sigma, - num_steps=num_iters, + nb_steps=max_iter, learning_rate=lr, lambda_tv=lambda_tv, lambda_c=lambda_c, lambda_s=lambda_s, - batch_norm=batch_norm, + batch_size=batch_size, targeted=targeted, ) + attack.generate = make_4d_hook(attack.generate) + return attack if attack_type == "wasserstein": - return attacks.Wasserstein( + if len(model.input_shape) == 1: + # autopgd only works with image kind shape + model._input_shape = (1, 1, model.input_shape[0]) + + attack = attacks.Wasserstein( model, targeted=targeted, + p=p_wassertein, regularization=reg, kernel_size=kernel_size, eps=eps, @@ -292,6 +382,8 @@ def create( projected_sinkhorn_max_iter=proj_sinkhorn_iter, batch_size=batch_size, ) + attack.generate = make_4d_hook(attack.generate) + return attack raise Exception("%s is not a valid attack type" % (attack_type)) @@ -307,59 +399,7 @@ def filter_args(**kwargs): else: kwargs["norm"] = int(kwargs["norm"]) - valid_args = ( - "attack_type", - "eps", - "delta", - "step_adapt", - "num_trial", - "sample_size", - "init_size", - "norm", - "eps_step", - "num_random_init", - "minimal", - "random_eps", - "min_eps", - "beta", - "theta", - "gamma", - "etha", - "confidence", - "decision_rule", - "lr", - "lr_decay", - "lr_num_decay", - "momentum", - "binary_search_steps", - "max_iter", - "init_eval", - "max_eval", - "overshoot", - "num_grads", - "c", - "max_halving", - "max_doubling", - "variable_h", - "abort_early", - "num_parallel", - "use_importance", - "th", - "sigma", - "lambda_tv", - "labmda_c", - "lambda_s", - "reg", - "kernel_size", - "eps_factor", - "eps_iter", - "conj_sinkhorn_iter", - "proj_sinkhorn_iter", - "targeted", - ) - - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - + args = filter_func_args(ARTAttackFactory.create, kwargs) return args @staticmethod @@ -371,7 +411,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--attack-type", type=str.lower, - default="fgsm", + default="fgm", choices=[ "boundary", "brendel", @@ -380,12 +420,15 @@ def add_class_args(parser, prefix=None): "bim", "pgd", "auto-pgd", + "auto-cgd", + "feature-adv", + "low-pro-fool", "jsma", "newtonfool", "cw-l2", "cw-linf", "elasticnet", - "hop-skin-jump", + "hop-skip-jump", "zoo", "threshold", "shadow", @@ -571,7 +614,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--min-eps", - default=None, + default=1e-6, type=float, help=("Stop attack if perturbation is smaller than min_eps."), ) @@ -614,12 +657,33 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - "--c", + "--initial-c", default=1e-2, type=float, help=("Initial weight of constraint function f in carlini-wagner 
attack"), ) + parser.add_argument( + "--largest-c", + default=20.0, + type=float, + help=("largest weight of constraint function f in carlini-wagner attack"), + ) + + parser.add_argument( + "--c-incr-factor", + default=2, + type=float, + help=("factor to increment c in carline-wagner-l0/inf attack"), + ) + + parser.add_argument( + "--tau-decr-factor", + default=0.9, + type=float, + help=("factor to reduce tau in carline-wagner-linf attack"), + ) + parser.add_argument( "--max-halving", default=5, @@ -635,10 +699,10 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - "--no-abort", - default=False, - action="store_true", - help=("do not abort early in optimizer iterations"), + "--abort-early", + default=True, + action=ActionYesNo, + help=("abort early in optimizer iterations"), ) parser.add_argument( @@ -670,6 +734,14 @@ def add_class_args(parser, prefix=None): "Threshold for threshold attack, None indicates finding and minimum threshold" ), ) + parser.add_argument( + "--es", + default=0, + type=int, + help=( + "Indicates whether the attack uses CMAES (0) or DE (1) as Evolutionary Strategy" + ), + ) parser.add_argument( "--sigma", @@ -704,6 +776,19 @@ def add_class_args(parser, prefix=None): "Scalar penalty weight for similarity of color channels in perturbation" ), ) + parser.add_argument( + "--lambda-fadv", + default=0.0, + type=float, + help=("Regularization parameter of the L-inf soft constraint"), + ) + parser.add_argument( + "--layers-fadv", + default=[1], + type=int, + nargs="+", + help=("indices of the representation layers"), + ) parser.add_argument( "--reg", @@ -730,6 +815,12 @@ def add_class_args(parser, prefix=None): type=int, help=("Number of iterations to increase the epsilon."), ) + parser.add_argument( + "--p-wassertein", + default=2, + type=int, + help=("lp distance for wassertein distance"), + ) parser.add_argument( "--conj-sinkhorn-iter", default=400, @@ -743,6 +834,65 @@ def add_class_args(parser, prefix=None): help=("maximum number of iterations for the projected sinkhorn optimizer"), ) + parser.add_argument( + "--thr-lowpro", + type=float, + default=0.5, + help="""Lowest prediction probability of a valid adversary for low-pro-fool""", + ) + parser.add_argument( + "--lambda-lowpro", + type=float, + default=1.5, + help="""Amount of lp-norm impact on objective function for low-pro-fool""", + ) + parser.add_argument( + "--eta-lowpro", + type=float, + default=0.2, + help="""Rate of updating the perturbation vectors for low-pro-fool""", + ) + parser.add_argument( + "--eta-lowpro-decay", + type=float, + default=0.98, + help="""Step-by-step decrease of eta for low-pro-fool""", + ) + parser.add_argument( + "--eta-lowpro-min", type=float, default=1e-7, help="""Minimal eta value""" + ) + parser.add_argument( + "--eta-newton", type=float, default=0.01, help="""eta for newtonfool""" + ) + # parser.add_argument( + # "--sub-dim", + # default=10, + # type=int, + # help="Dimensionality of 2D frequency space (DCT).", + # ) + + # parser.add_argument( + # "--bin-search-tol", + # default=0.1, + # type=float, + # help="""Maximum remaining L2 perturbation defining binary search + # convergence""", + # ) + # parser.add_argument( + # "--lambda-geoda", + # default=0.6, + # type=float, + # help="""The lambda of equation 19 with lambda_param=0 corresponding to a + # single iteration and lambda_param=1 to a uniform distribution of + # iterations per step.""", + # ) + # parser.add_argument( + # "--sigma-geoda", + # default=0.0002, + # type=float, + # help="""Variance of the Gaussian 
perturbation.""", + # ) + parser.add_argument( "--targeted", default=False, diff --git a/hyperion/torch/adv_attacks/attack_factory.py b/hyperion/torch/adv_attacks/attack_factory.py index 8ea952ad..ca89a794 100644 --- a/hyperion/torch/adv_attacks/attack_factory.py +++ b/hyperion/torch/adv_attacks/attack_factory.py @@ -2,16 +2,16 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionParser, ArgumentParser -from .fgsm_attack import FGSMAttack -from .snr_fgsm_attack import SNRFGSMAttack -from .rand_fgsm_attack import RandFGSMAttack -from .iter_fgsm_attack import IterFGSMAttack -from .carlini_wagner_l2 import CarliniWagnerL2 from .carlini_wagner_l0 import CarliniWagnerL0 +from .carlini_wagner_l2 import CarliniWagnerL2 from .carlini_wagner_linf import CarliniWagnerLInf +from .fgsm_attack import FGSMAttack +from .iter_fgsm_attack import IterFGSMAttack from .pgd_attack import PGDAttack +from .rand_fgsm_attack import RandFGSMAttack +from .snr_fgsm_attack import SNRFGSMAttack class AttackFactory(object): @@ -30,7 +30,7 @@ def create( binary_search_steps=9, max_iter=10, abort_early=True, - c=1e-3, + initial_c=1e-3, reduce_c=False, c_incr_factor=2, tau_decr_factor=0.9, @@ -47,6 +47,7 @@ def create( eps = eps * eps_scale alpha = alpha * eps_scale + norm = float(norm) if attack_type == "fgsm": return FGSMAttack( @@ -98,7 +99,7 @@ def create( binary_search_steps, max_iter, abort_early, - c, + initial_c, norm_time=norm_time, time_dim=time_dim, use_snr=use_snr, @@ -114,7 +115,7 @@ def create( lr, max_iter, abort_early, - c, + initial_c, reduce_c, c_incr_factor, indep_channels, @@ -130,7 +131,7 @@ def create( lr, max_iter, abort_early, - c, + initial_c, reduce_c, c_incr_factor, tau_decr_factor, @@ -219,9 +220,8 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--norm", - type=float, - default=float("inf"), - choices=[float("inf"), 1, 2], + default="inf", + choices=["inf", "1", "2"], help=("Attack perturbation norm"), ) @@ -284,7 +284,7 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - "--c", + "--initial-c", default=1e-2, type=float, help=( diff --git a/hyperion/torch/adv_attacks/carlini_wagner_l2.py b/hyperion/torch/adv_attacks/carlini_wagner_l2.py index 27cffe97..e8b545b5 100644 --- a/hyperion/torch/adv_attacks/carlini_wagner_l2.py +++ b/hyperion/torch/adv_attacks/carlini_wagner_l2.py @@ -2,8 +2,8 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import math import logging +import math import torch import torch.nn as nn diff --git a/hyperion/torch/adv_attacks/pgd_attack.py b/hyperion/torch/adv_attacks/pgd_attack.py index 879531ed..ca496e64 100644 --- a/hyperion/torch/adv_attacks/pgd_attack.py +++ b/hyperion/torch/adv_attacks/pgd_attack.py @@ -2,10 +2,11 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import math import logging +import math import torch + from .adv_attack import AdvAttack diff --git a/hyperion/torch/adv_attacks/random_attack_factory.py b/hyperion/torch/adv_attacks/random_attack_factory.py index e333b119..a91c99ac 100644 --- a/hyperion/torch/adv_attacks/random_attack_factory.py +++ b/hyperion/torch/adv_attacks/random_attack_factory.py @@ -4,9 +4,11 @@ """ import math -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse 
import ActionParser, ArgumentParser import torch + from .attack_factory import AttackFactory as AF @@ -126,7 +128,7 @@ def _sample_attack_args(self): ) attack_args["max_iter"] = self._randint(self.min_iter, self.max_iter) attack_args["abort_early"] = self.abort_early - attack_args["c"] = self._uniform(self.min_c, self.max_c) + attack_args["initial_c"] = self._uniform(self.min_c, self.max_c) attack_args["reduce_c"] = self.reduce_c attack_args["c_incr_factor"] = self.c_incr_factor attack_args["tau_decr_factor"] = self.tau_decr_factor @@ -218,10 +220,9 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--norms", - type=float, - default=[float("inf")], + default=["inf"], nargs="+", - choices=[float("inf"), 1, 2], + choices=["inf", "1", "2"], help=("Attack perturbation norms"), ) diff --git a/hyperion/torch/adv_defenses/wave_gan_white.py b/hyperion/torch/adv_defenses/wave_gan_white.py index ad7f985e..5d045f08 100644 --- a/hyperion/torch/adv_defenses/wave_gan_white.py +++ b/hyperion/torch/adv_defenses/wave_gan_white.py @@ -2,15 +2,15 @@ # Added wave_gan_model_ckpt to test using different model ckpts [Sonal 24Aug20] import logging +import math from pathlib import Path from typing import Tuple -import math import librosa import numpy as np +import yaml import torch -import yaml try: # import parallel_wavegan.models @@ -21,6 +21,7 @@ pass from sklearn.preprocessing import StandardScaler + from torch import nn diff --git a/hyperion/torch/data/__init__.py b/hyperion/torch/data/__init__.py index 4deb3f25..ee5a661d 100644 --- a/hyperion/torch/data/__init__.py +++ b/hyperion/torch/data/__init__.py @@ -3,11 +3,16 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from .audio_dataset import AudioDataset + +# samplers +from .bucketing_seg_sampler import BucketingSegSampler +from .dino_audio_dataset import DINOAudioDataset +from .embed_sampler_factory import EmbedSamplerFactory + # datasets from .feat_seq_dataset import FeatSeqDataset from .paired_feat_seq_dataset import PairedFeatSeqDataset -from .audio_dataset import AudioDataset - -# samplers -from .weighted_seq_sampler import ClassWeightedSeqSampler +# from .weighted_seq_sampler import ClassWeightedSeqSampler +from .seg_sampler_factory import SegSamplerFactory diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index f0ab811d..a8e45bda 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -4,42 +4,89 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser -import time import math +import time +from collections import OrderedDict +from typing import Dict, List, Optional import numpy as np import pandas as pd +# import k2 +try: + import k2 +except: + from ..utils import dummy_k2 as k2 + +import sentencepiece as spm import torch +import torch.distributed as dist +import torchaudio.transforms as tat +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from torch.utils.data import Dataset -from ..torch_defs import floatstr_torch from ...io import RandomAccessAudioReader as AR -from ...utils.utt2info import Utt2Info -from ...augment import SpeechAugment - -from torch.utils.data import Dataset -import torch.distributed as dist +from ...np.augment import SpeechAugment +from ...np.preprocessing import Resampler +from ...utils import ClassInfo, SegmentSet +from ...utils.misc import filter_func_args +from ...utils.text import read_text +from ..tokenizers import HypTokenizer +from ..torch_defs import floatstr_torch +from 
..utils import collate_seqs_1d, collate_seqs_nd, list_of_dicts_to_list class AudioDataset(Dataset): + """AudioDataset class + + Args: + recordings_file: recordings manifest file (kaldi .scp or pandas .csv) + segments_file: segments manifest file (kaldi .scp or pandas .csv) + class_names: list with the names of the types of classes in the datasets, e.g., speaker, language + class_files: list of class info files + tokenizer_mappings: list mapping the segment_set fields to the tokenizer name + that should be used with them, e.g., text->text-1, + this argument has to be in sync with tokenizer_files. + tokenizer_files: list of tokenizer configuration files, + this argument has to be in sync with tokenizer_mappings. + aug_cfgs: list of augmentation configuration files + num_augs: number of augmentations per segment and augmentation type + num_aug_mix: number of AugMix augmentations per segment + aug_mix_alpha: AugMix Dirichlet distribution parameter + return_segment_info: list of columns of the segment file which should be returned as supervisions + return_orig: when using augmentation, whether or not to return also the original audio + target_sample_freq: target sampling frequency, if not None all audios are converted to this sampling frequency + wav_scale: scale waveforms to be in [-wav_scale, wav_scale] + is_val: whether this is a validation dataset. + seed: random seed. + time_durs_file: (deprecated) segment-to-duration (secs) file, if durations are not in segments_file + text_file: (deprecated) text file with word labels for each utterance. + bpe_model: (deprecated) bpe model for the text labels. + """ + def __init__( self, - audio_path, - key_file, - class_file=None, - time_durs_file=None, - min_chunk_length=1, - max_chunk_length=None, - aug_cfg=None, - return_fullseqs=False, - return_class=True, - return_clean_aug_pair=False, - transpose_input=False, - wav_scale=2 ** 15 - 1, - is_val=False, + recordings_file: str, + segments_file: str, + class_names: Optional[List[str]] = None, + class_files: Optional[List[str]] = None, + tokenizer_mappings: Optional[List[str]] = None, + tokenizer_files: Optional[List[str]] = None, + aug_cfgs: Optional[List[str]] = None, + num_augs: int = 1, + num_aug_mix: int = 0, + aug_mix_alpha: float = 0, + return_segment_info: Optional[List[str]] = None, + return_orig: bool = False, + target_sample_freq: Optional[float] = None, + wav_scale: float = 1, + is_val: bool = False, + seed: int = 112358, + time_durs_file: Optional[str] = None, + text_file: Optional[str] = None, + bpe_model: Optional[str] = None, ): - + super().__init__() try: rank = dist.get_rank() world_size = dist.get_world_size() @@ -49,53 +96,147 @@ def __init__( self.rank = rank self.world_size = world_size - + self.epoch = 0 if rank == 0: - logging.info("opening dataset %s" % audio_path) - self.r = AR(audio_path, wav_scale=wav_scale) + logging.info("loading segments file %s", segments_file) + + self.seg_set = SegmentSet.load(segments_file) if rank == 0: - logging.info("loading utt2info file %s" % key_file) - self.u2c = Utt2Info.load(key_file, sep=" ") + logging.info("dataset contains %d seqs", len(self.seg_set)) + if rank == 0: - logging.info("dataset contains %d seqs" % self.num_seqs) + logging.info("opening audio reader %s", recordings_file) + + audio_seg_set = self.seg_set if self.seg_set.has_time_marks else None + self.r = AR(recordings_file, segments=audio_seg_set, wav_scale=wav_scale) self.is_val = is_val - self._read_time_durs_file(time_durs_file) + if time_durs_file is not None: +
self._load_legacy_durations(time_durs_file) + + assert "duration" in self.seg_set + + logging.info("loading class-info files") + self._load_class_infos(class_names, class_files, is_val) - # self._seq_lengths = self.r.read_time_duration(self.u2c.key) - self._prune_short_seqs(min_chunk_length) + logging.info("loading tokenizers") + self._load_tokenizers(tokenizer_mappings, tokenizer_files) - self.short_seq_exist = self._seq_shorter_than_max_length_exists( - max_chunk_length + if bpe_model is not None: + logging.info("loading bpe models") + self._load_bpe_model(bpe_model, is_val) + + if text_file is not None: + logging.info("loading text files") + self._load_text_infos(text_file, is_val) + + self.return_segment_info = ( + [] if return_segment_info is None else return_segment_info ) + self.return_orig = return_orig - self._prepare_class_info(class_file) + self.num_augs = num_augs + self.num_aug_mix = num_aug_mix + self.aug_mix_alpha = aug_mix_alpha + self.seed = seed + self.rng = np.random.default_rng(seed + 1000 * rank) + self._create_augmenters(aug_cfgs) - if max_chunk_length is None: - max_chunk_length = min_chunk_length - self._min_chunk_length = min_chunk_length - self._max_chunk_length = max_chunk_length + self.target_sample_freq = target_sample_freq + self.resamplers = {} + self.resampler = Resampler(target_sample_freq) - self.return_fullseqs = return_fullseqs - self.return_class = return_class - self.return_clean_aug_pair = return_clean_aug_pair + def _load_legacy_durations(self, time_durs_file): + if self.rank == 0: + logging.info("loading durations file %s", time_durs_file) - self.transpose_input = transpose_input + time_durs = SegmentSet.load(time_durs_file) + self.seg_set["duration"] = time_durs.loc[ + self.seg_set["id"] + ].class_id.values.astype(float, copy=False) - self.augmenter = None + def _load_bpe_model(self, bpe_model, is_val): + if self.rank == 0: + logging.info("loading bpe file %s", bpe_model) + self.sp = spm.SentencePieceProcessor() + self.sp.load(bpe_model) + blank_id = self.sp.piece_to_id("") + vocab_size = self.sp.get_piece_size() + + def _load_text_infos(self, text_file, is_val): + if text_file is None: + return + if self.rank == 0: + logging.info("loading text file %s", text_file) + + text = read_text(text_file) + self.seg_set["text"] = text.loc[self.seg_set["id"]].text + + def _load_class_infos(self, class_names, class_files, is_val): + self.class_info = OrderedDict() + if class_names is None: + assert class_files is None + return + + assert len(class_names) == len(class_files) + for name, file in zip(class_names, class_files): + assert ( + name in self.seg_set + ), f"class_name {name} not present in the segment set" + self.seg_set.convert_col_to_str( + name + ) # make sure that class ids are strings + if self.rank == 0: + logging.info("loading class-info file %s", file) + table = ClassInfo.load(file) + self.class_info[name] = table + if not is_val: + # check that all classes are present in the training segments + class_ids = table["id"] + segment_class_ids = self.seg_set[name].unique() + for c_id in class_ids: + if c_id not in segment_class_ids: + logging.warning( + "%s class: %s not present in dataset", name, c_id + ) + + def _load_tokenizers(self, tokenizer_mappings, tokenizer_files): + self.tokenizers = OrderedDict() + self.tokenizers_to_infos = OrderedDict() + if tokenizer_mappings is None: + assert tokenizer_files is None + return + + assert len(tokenizer_mappings) == len(tokenizer_files) + tokenizer_names = [] + for map in tokenizer_mappings: + info_name, 
tokenizer_name = map.split("->", maxsplit=1) + self.tokenizers_to_infos[tokenizer_name] = info_name + tokenizer_names.append(tokenizer_name) + + for name, file in zip(tokenizer_names, tokenizer_files): + assert name in self.seg_set, f"field {name} not present in the segment set" + if self.rank == 0: + logging.info("loading tokenizer file %s", file) + tokenizer = HypTokenizer.auto_load(file) + self.tokenizers[name] = tokenizer + + def _create_augmenters(self, aug_cfgs): + self.augmenters = [] self.reverb_context = 0 - if aug_cfg is not None: - self.augmenter = SpeechAugment.create( - aug_cfg, random_seed=112358 + 1000 * rank + if aug_cfgs is None: + return + + for aug_cfg in aug_cfgs: + logging.info(f"loading augmentation={aug_cfg}") + augmenter = SpeechAugment.create( + aug_cfg, random_seed=self.seed + 1000 * self.rank ) - self.reverb_context = self.augmenter.max_reverb_context + self.augmenters.append(augmenter) + self.reverb_context = max(augmenter.max_reverb_context, self.reverb_context) - def _read_time_durs_file(self, file_path): - if self.rank == 0: - logging.info("reading time_durs file %s" % file_path) - nf_df = pd.read_csv(file_path, header=None, sep=" ") - nf_df.index = nf_df[0] - self._seq_lengths = nf_df.loc[self.u2c.key, 1].values + def set_epoch(self, epoch): + self.epoch = epoch @property def wav_scale(self): @@ -103,31 +244,19 @@ def wav_scale(self): @property def num_seqs(self): - return len(self.u2c) + return len(self.seg_set) def __len__(self): return self.num_seqs @property def seq_lengths(self): - return self._seq_lengths + return self.seg_set["duration"] @property def total_length(self): return np.sum(self.seq_lengths) - @property - def min_chunk_length(self): - if self.return_fullseqs: - self._min_chunk_length = np.min(self.seq_lengths) - return self._min_chunk_length - - @property - def max_chunk_length(self): - if self._max_chunk_length is None: - self._max_chunk_length = np.max(self.seq_lengths) - return self._max_chunk_length - @property def min_seq_length(self): return np.min(self.seq_lengths) @@ -136,287 +265,378 @@ def min_seq_length(self): def max_seq_length(self): return np.max(self.seq_lengths) - def _prune_short_seqs(self, min_length): - if self.rank == 0: - logging.info("pruning short seqs") - keep_idx = self.seq_lengths >= min_length - self.u2c = self.u2c.filter_index(keep_idx) - self._seq_lengths = self.seq_lengths[keep_idx] - if self.rank == 0: - logging.info( - "pruned seqs with min_length < %f," - "keep %d/%d seqs" % (min_length, self.num_seqs, len(keep_idx)) - ) - - def _prepare_class_info(self, class_file): - class_weights = None - if class_file is None: - classes, class_idx = np.unique(self.u2c.info, return_inverse=True) - class2idx = {k: i for i, k in enumerate(classes)} - else: - if self.rank == 0: - logging.info("reading class-file %s" % (class_file)) - class_info = pd.read_csv(class_file, header=None, sep=" ") - class2idx = {str(k): i for i, k in enumerate(class_info[0])} - class_idx = np.array([class2idx[k] for k in self.u2c.info], dtype=int) - if class_info.shape[1] == 2: - class_weights = np.array(class_info[1]).astype( - floatstr_torch(), copy=False - ) - - self.num_classes = len(class2idx) - - class2utt_idx = {} - class2num_utt = np.zeros((self.num_classes,), dtype=int) - - for k in range(self.num_classes): - idx = (class_idx == k).nonzero()[0] - class2utt_idx[k] = idx - class2num_utt[k] = len(idx) - if class2num_utt[k] == 0: - if not self.is_val: - logging.warning("class %d doesn't have any samples" % (k)) - if class_weights is None: - 
class_weights = np.ones((self.num_classes,), dtype=floatstr_torch()) - class_weights[k] = 0 - - count_empty = np.sum(class2num_utt == 0) - if count_empty > 0: - logging.warning("%d classes have 0 samples" % (count_empty)) - - self.utt_idx2class = class_idx - self.class2utt_idx = class2utt_idx - self.class2num_utt = class2num_utt - if class_weights is not None: - class_weights /= np.sum(class_weights) - class_weights = torch.Tensor(class_weights) - self.class_weights = class_weights - - if self.short_seq_exist: - # if there are seq shorter than max_chunk_lenght we need some extra variables - # we will need class_weights to put to 0 classes that have all utts shorter than the batch chunk length - if self.class_weights is None: - self.class_weights = torch.ones((self.num_classes,)) - - # we need the max length of the utterances of each class - class2max_length = torch.zeros((self.num_classes,), dtype=torch.float) - for c in range(self.num_classes): - if class2num_utt[c] > 0: - class2max_length[c] = np.max( - self.seq_lengths[self.class2utt_idx[c]] - ) - - self.class2max_length = class2max_length - - def _seq_shorter_than_max_length_exists(self, max_length): - return np.any(self.seq_lengths < max_length) - @property - def var_chunk_length(self): - return self.min_chunk_length < self.max_chunk_length - - def get_random_chunk_length(self): - - if self.var_chunk_length: - return ( - torch.rand(size=(1,)).item() - * (self.max_chunk_length - self.min_chunk_length) - + self.min_chunk_length + def num_classes(self): + return {k: t.num_classes for k, t in self.class_info.items()} + + def _parse_segment_item(self, segment): + if isinstance(segment, (tuple, list)): + seg_id, start, duration = segment + assert duration <= self.seg_set.loc[seg_id].duration, ( + f"{seg_id} with start={start} duration " + f"({self.seg_set.loc[seg_id].duration}) < " + f"chunk duration ({duration})" ) - - return self.max_chunk_length - - def __getitem__(self, index): - # logging.info('{} {} {} get item {}'.format( - # self, os.getpid(), threading.get_ident(), index)) - if self.return_fullseqs: - return self._get_fullseq(index) else: - return self._get_random_chunk(index) + seg_id, start, duration = segment, 0, 0 - def _get_fullseq(self, index): - key = self.u2c.key[index] - x, fs = self.r.read([key]) - x = x[0].astype(floatstr_torch(), copy=False) - x_clean = x - if self.augmenter is not None: - x, aug_info = self.augmenter(x) + return seg_id, start, duration - if self.transpose_input: - x = x[None, :] - if self.return_clean_aug_pair: - x_clean = x_clean[None, :] + def _read_audio(self, seg_id, start, duration): + # how much extra audio we need to load to + # calculate the reverb of the first part of the audio + reverb_context = min(self.reverb_context, start) + start -= reverb_context + read_duration = duration + reverb_context - if self.return_clean_aug_pair: - r = x, x_clean + # read audio + x, fs = self.r.read([seg_id], time_offset=start, time_durs=read_duration) + return x[0].astype(floatstr_torch(), copy=False), fs[0] - if not self.return_class: - return r + def _apply_aug_mix(self, x, x_augs, aug_idx): + x_aug_mix = {} + alpha_d = (self.aug_mix_alpha,) * len(x_augs) + w = self.rng.dirichlet(alpha_d, self.num_aug_mix) + m = self.rng.beta(alpha_d, self.num_aug_mix) + for i in range(self.num_aug_mix): + x_mix = np.zeros_like(x) + for j, (_, x_aug_j) in enumerate(x_augs.items()): + x_mix += w[i, j] * x_aug_j - class_idx = self.utt_idx2class[index] - r = *r, class_idx - return r + x_aug_mix[f"x_aug_{aug_idx}_{i}"] = m[i] * 
x + (1 - m[i]) * x_mix - def _get_random_chunk(self, index): + return x_aug_mix - if len(index) == 2: - index, chunk_length = index + def _apply_augs(self, x, duration, fs): + if not self.augmenters: + return {"x": x} + + if duration == 0: + num_samples = len(x) else: - chunk_length = self.max_chunk_length + num_samples = int(duration * fs) + + reverb_context_samples = len(x) - num_samples + x_orig = x[reverb_context_samples:] + x_augs = {} + # for each type of augmentation + for i, augmenter in enumerate(self.augmenters): + # we do n_augs per augmentation type + x_augs_i = {} + for j in range(self.num_augs): + # augment x + x_aug, aug_info = augmenter(x) + # remove the extra left context used to compute the reverberation. + x_aug = x_aug[reverb_context_samples : len(x)] + x_aug = x_aug.astype(floatstr_torch(), copy=False) + x_augs_i[f"x_aug_{i}_{j}"] = x_aug + + if self.num_aug_mix > 0: + x_augs_i = self._apply_aug_mix(x_orig, x_augs_i, i) + + x_augs.update(x_augs_i) + + if self.return_orig: + x_augs["x"] = x_orig + elif len(x_augs) == 1: + # if we just have one aug and we don't return the clean version, + # we just call x to the aug version + x_augs["x"] = x_augs.pop("x_aug_0_0") + + return x_augs + + def _get_segment_info(self, seg_id): + seg_info = {} + # converts the class_ids to integers + for info_name in self.return_segment_info: + tokenizer_name = "" + if info_name in self.tokenizers_to_infos: + tokenizer_name = info_name + info_name = self.tokenizers_to_infos[tokenizer_name] + + seg_info_i = self.seg_set.loc[seg_id, info_name] + if info_name in self.class_info: + # if the type of information is a class-id + # we use the class information table to + # convert from id to integer + class_info = self.class_info[info_name] + seg_info_i = class_info.loc[seg_info_i, "class_idx"] + elif tokenizer_name in self.tokenizers: + seg_info_i = self.tokenizers[tokenizer_name].encode(seg_info_i) + elif info_name == "text": + seg_info_i = self.sp.encode(seg_info_i, out_type=int) + + seg_info[info_name] = seg_info_i + + return seg_info + + def _get_resampler(self, fs): + if fs in self.resamplers: + return self.resamplers[fs] + + resampler = tat.Resample( + int(fs), + int(self.target_sample_freq), + lowpass_filter_width=64, + rolloff=0.9475937167399596, + resampling_method="kaiser_window", + beta=14.769656459379492, + ) + resampler_f = lambda x: resampler(torch.from_numpy(x)).numpy() + self.resamplers[fs] = resampler_f + return resampler_f - key = self.u2c.key[index] + def _resample(self, x, fs): + if self.target_sample_freq is None: + return x, fs - full_seq_length = self.seq_lengths[index] + return self.resampler(x, fs) + + def __getitem__(self, segment): + seg_id, start, duration = self._parse_segment_item(segment) + x, fs = self._read_audio(seg_id, start, duration) assert ( - chunk_length <= full_seq_length - ), "chunk_length(%d) <= full_seq_length(%d)" % (chunk_length, full_seq_length) - - time_offset = torch.rand(size=(1,)).item() * (full_seq_length - chunk_length) - reverb_context = min(self.reverb_context, time_offset) - time_offset -= reverb_context - read_chunk_length = chunk_length + reverb_context - - # logging.info('get-random-chunk {} {} {} {} {}'.format(index, key, time_offset, chunk_length, full_seq_length )) - x, fs = self.r.read([key], time_offset=time_offset, time_durs=read_chunk_length) - - # try: - # x, fs = self.r.read([key], time_offset=time_offset, - # time_durs=read_chunk_length) - # except: - # # some files produce error in the fseek after reading the data, - # # this seems 
an issue from pysoundfile or soundfile lib itself - # # reading from a sligthly different starting position seems to solve the problem in most cases - # try: - # logging.info('error-1 reading at key={} totol_dur={} offset={} read_chunk_length={}, retrying...'.format( - # key, full_seq_length, time_offset, read_chunk_length)) - # time_offset = math.floor(time_offset) - # x, fs = self.r.read([key], time_offset=time_offset, - # time_durs=read_chunk_length) - # except: - # try: - # # if changing the value of time-offset doesn't solve the issue, we try to read from - # # from time-offset to the end of the file, and remove the extra frames later - # logging.info('error-2 reading at key={} totol_dur={} offset={} retrying reading until end-of-file ...'.format( - # key, full_seq_length, time_offset)) - # x, fs = self.r.read([key], time_offset=time_offset) - # x = [x[0][:int(read_chunk_length * fs[0])]] - # except: - # # try to read the full file - # logging.info('error-3 reading at key={} totol_dur={} retrying reading full file ...'.format( - # key, full_seq_length)) - # x, fs = self.r.read([key]) - # x = [x[0][:int(read_chunk_length * fs[0])]] - - x = x[0] - fs = fs[0] - - x_clean = x - logging.info("hola1") - if self.augmenter is not None: - logging.info("hola2") - chunk_length_samples = int(chunk_length * fs) - end_idx = len(x) - reverb_context_samples = end_idx - chunk_length_samples - assert reverb_context_samples >= 0, ( - "key={} time-offset={}, read-chunk={} " - "read-x-samples={}, chunk_samples={}, reverb_context_samples={}" - ).format( - key, - time_offset, - read_chunk_length, - end_idx, - chunk_length_samples, - reverb_context_samples, - ) - # end_idx = reverb_context_samples + chunk_length_samples - x, aug_info = self.augmenter(x) - x = x[reverb_context_samples:end_idx] - if self.return_clean_aug_pair: - x_clean = x_clean[reverb_context_samples:end_idx] - x_clean = x_clean.astype(floatstr_torch(), copy=False) - # x_clean = x_clean[reverb_context_samples:] - # logging.info('augmentation x-clean={}, x={}, aug_info={}'.format( - # x_clean.shape, x.shape, aug_info)) - # if len(x) != 64000: - # logging.info('x!=4s, {} {} {} {} {} {} {} {}'.format(len(x),reverb_context, reverb_context_samples, chunk_length, chunk_length_samples, end_idx, fs, read_chunk_length)) - - # if len(x) != 64000: - # logging.info('x!=4s-2, {} {} {} {}'.format(len(x), chunk_length, fs, read_chunk_length)) - - if self.transpose_input: - x = x[None, :] - if self.return_clean_aug_pair: - x_clean = x_clean[None, :] - - x = x.astype(floatstr_torch(), copy=False) - if self.return_clean_aug_pair: - r = x, x_clean - else: - r = (x,) + len(x) > 0 + ), f"read audio empty seg_id={seg_id}, start={start}, dur={duration}" + x, fs = self._resample(x, fs) + data = {"seg_id": seg_id, "sample_freq": fs} + x_augs = self._apply_augs(x, duration, fs) + data.update(x_augs) + seg_info = self._get_segment_info(seg_id) + data.update(seg_info) + return data - if not self.return_class: - return r + @staticmethod + def collate(self, batch): + + # sort batch by the length of x + audio_lengths = [] + for record in batch: + audio_lengths.append(record["x"].shape[0]) + audio_lengths = torch.as_tensor(audio_lengths) + if not torch.all(audio_lengths[:-1] >= audio_lengths[1:]): + sort_idx = torch.argsort(audio_lengths, descending=True) + batch = [batch[i] for i in sort_idx] + + del audio_lengths + + def _is_list_of_tensors(x): + return isinstance(x[0], (torch.Tensor, np.ndarray)) + + def _is_list_of_items(x): + return isinstance(x[0], (int, float)) + + 
def _is_list_of_strs(x): + return isinstance(x[0], str) + + def _is_list_of_strlists(x): + return isinstance(x[0], list) and isinstance(x[0][0], str) + + def _is_list_of_intlists(x): + return isinstance(x[0], list) and isinstance(x[0][0], int) + + output_batch = {} + batch_keys = batch[0].keys() + for key in batch_keys: + item_list = list_of_dicts_to_list(batch, key) + if key == "id": + # these are the segment ids + output_batch[key] = item_list + elif (key == "x" or key[:2] == "x_") and _is_list_of_tensors(item_list): + # these are input audios + data, data_lengths = collate_seqs_1d(item_list) + output_batch[key] = data + output_batch[f"{key}_lengths"] = data_lengths + elif _is_list_of_items(item_list): + # these should be things like class ids + output_batch[key] = torch.as_tensor(item_list) + elif _is_list_of_tensors(item_list): + # other tensor data + data, data_lengths = collate_seqs_nd(item_list) + output_batch[key] = data + output_batch[f"{key}_lengths"] = data_lengths + elif _is_list_of_intlists(item_list): + # we assume k2 ragged tensor for now + output_batch[key] = k2.RaggedTensor(item_list) + elif _is_list_of_strs(item_list): + # we just leave them as they are + output_batch[key] = item_list + else: + raise TypeError(f"we don't know how to collate this data={item_list}") + + return output_batch - class_idx = self.utt_idx2class[index] - r = *r, class_idx - return r + @staticmethod + def collate_old(self, batch): + from torch.nn.utils.rnn import pad_sequence + + audio = [] + audio_length = [] + target = [] + for record in batch: + audio_length.append(record["x"].shape[0]) + audio_length = torch.as_tensor(audio_length) + if not torch.all(audio_length[:-1] >= audio_length[1:]): + sort_idx = torch.argsort(audio_length, descending=True) + batch = [batch[i] for i in sort_idx] + + audio_length = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + + audio = pad_sequence(audio) + audio_length = torch.as_tensor(audio_length) + target = k2.RaggedTensor(target) + batch = { + "x": torch.transpose(audio, 0, 1), + "x_lengths": audio_length, + "text": target, + } + return batch + + def get_collator(self): + return lambda batch: AudioDataset.collate(self, batch) @staticmethod def filter_args(**kwargs): - - ar_args = AR.filter_args(**kwargs) - valid_args = ( - "path_prefix", - "class_file", - "time_durs_file", - "min_chunk_length", - "max_chunk_length", - "return_fullseqs", - "part_idx", - "num_parts", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - args.update(ar_args) + args = filter_func_args(AudioDataset.__init__, kwargs) return args @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") - # parser.add_argument('--path-prefix', - # default='', - # help=('path prefix for rspecifier scp file')) + if "recordings_file" not in skip: + parser.add_argument( + "--recordings-file", + required=True, + help="recordings manifest file (kaldi .scp or pandas .csv)", + ) + + if "segments_file" not in skip: + parser.add_argument( + "--segments-file", + required=True, + help="segments manifest file (kaldi .scp or pandas .csv)", + ) parser.add_argument( - "--class-file", + "--class-names", default=None, - help=("ordered list of classes keys, it can contain class weights"), + nargs="+", + help=( + "list with the names of the types of classes in the datasets, e.g., speaker, language" + ), ) parser.add_argument( - "--time-durs-file", default=None, help=("utt to duration in secs file") + "--class-files", + default=None, + nargs="+", + help="list of class info files", ) parser.add_argument( - "--min-chunk-length", - type=float, + "--tokenizer-mappings", default=None, - help=("minimum length of sequence chunks"), + nargs="+", + help="""list mapping the segment_set fields to the tokenizer name + that should be used with them, e.g., text->text-1, + this argument has to be in sync with tokenizer_files. + """, ) + parser.add_argument( - "--max-chunk-length", - type=float, + "--tokenizer-files", + default=None, + nargs="+", + help="""list of tokenizer configuration files, + this argument has to be in sync with tokenizer_mappings. + """, + ) + + parser.add_argument( + "--time-durs-file", default=None, - help=("maximum length of sequence chunks"), + help=( + "(deprecated) segment to duration in secs file, if durations are not in segments_file" + ), ) parser.add_argument( - "--return-fullseqs", + "--bpe-model", + default=None, + help="bpe model for the text label", + ) + + parser.add_argument( + "--text-file", + default=None, + help="text file with word labels for each utterance", + ) + + if "aug_cfgs" not in skip: + parser.add_argument( + "--aug-cfgs", + default=None, + nargs="+", + help="augmentation configuration files.", + ) + + parser.add_argument( + "--num-augs", + default=1, + type=int, + help="number of augmentations per segment and augmentation type", + ) + parser.add_argument( + "--num-aug-mix", + default=0, + type=int, + help="number of AugMix augmentations per segment", + ) + parser.add_argument( + "--aug-mix-alpha", + default=0.5, + type=float, + help="alpha parameter of the AugMix Dirichlet distribution", + ) + parser.add_argument( + "--return-segment-info", + default=None, + nargs="+", + help=( + "list of columns of the segment file which should be returned as supervisions" + ), + ) + parser.add_argument( + "--return-orig", default=False, - action="store_true", - help=("returns full sequences instead of chunks"), + action=ActionYesNo, + help=( + "when using augmentation, whether to also return the original audio" + ), + ) + + parser.add_argument( + "--target-sample-freq", + default=None, + type=int, + help=( + "target sampling frequency, if not None all audios are converted to this sample frequency" + ), + ) + + parser.add_argument( + "--seed", + default=11235811, + type=int, + help="random seed", ) AR.add_class_args(parser) diff --git a/hyperion/torch/data/bucketing_seg_sampler.py b/hyperion/torch/data/bucketing_seg_sampler.py new file mode 100644 index 00000000..64d2928c --- /dev/null +++ b/hyperion/torch/data/bucketing_seg_sampler.py @@ -0,0 +1,131 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import math +from typing import Optional, Type + +import numpy as np +import torch +import torch.distributed as dist + +from ...utils import SegmentSet +from .hyp_sampler import HypSampler +from .seg_sampler import SegSampler + + +class BucketingSegSampler(HypSampler): + + def __init__( + self, + seg_set: SegmentSet, + base_sampler: Type[HypSampler] = SegSampler, + num_buckets: int = 10, + length_column: str = "duration", + max_batches_per_epoch: Optional[int] = None, + seed: int = 1234, + **base_kwargs + ): + super().__init__( + max_batches_per_epoch=max_batches_per_epoch, shuffle=False, seed=seed + ) + self.seg_set = seg_set + self.base_sampler = 
base_sampler + self.base_kwargs = base_sampler.filter_args(**base_kwargs) + self.base_kwargs["seed"] = seed + self.num_buckets = num_buckets + self.length_column = length_column + self._create_bucket_samplers() + self._compute_len() + self.depleted_buckets = torch.zeros((num_buckets,), dtype=torch.bool) + + def create_buckets(self): + sort_idx = np.argsort(self.seg_set[self.length_column].values) + sorted_seg_set = self.seg_set.iloc[sort_idx] + cum_lengths = np.cumsum(sorted_seg_set[self.length_column].values, axis=0) + bucket_length = cum_lengths[-1] / self.num_buckets + buckets = [] + for i in range(self.num_buckets): + bucket_idx = (cum_lengths <= bucket_length) & (cum_lengths > 0) + bucket_i = sorted_seg_set.loc[bucket_idx] + buckets.append(bucket_i) + cum_lengths -= bucket_length + + return buckets + + def _create_bucket_samplers(self): + buckets = self.create_buckets() + bucket_samplers = [] + for i in range(self.num_buckets): + sampler_i = self.base_sampler(buckets[i], **self.base_kwargs) + bucket_samplers.append(sampler_i) + + self.bucket_samplers = bucket_samplers + + def __len__(self): + return self._len + + def _compute_len(self): + self._len = 0 + for i in range(self.num_buckets): + self._len += len(self.bucket_samplers[i]) + + if self.max_batches_per_epoch is not None: + self._len = min(self._len, self.max_batches_per_epoch) + + def set_epoch(self, epoch, batch=0): + for i in range(self.num_buckets): + self.bucket_samplers[i].set_epoch(epoch, batch) + + def __iter__(self): + super().__iter__() + self.depleted_buckets[:] = False + for i in range(self.num_buckets): + self.bucket_samplers[i].__iter__() + + return self + + def all_buckets_depleted(self): + return torch.all(self.depleted_buckets).item() + + def __next__(self): + + if self.batch == self._len or self.all_buckets_depleted(): + raise StopIteration + + while True: + bucket_idx = torch.randint( + low=0, high=self.num_buckets, size=(1,), generator=self.rng + ).item() + if self.depleted_buckets[bucket_idx]: + continue + + bucket = self.bucket_samplers[bucket_idx] + try: + batch = next(bucket) + break + except StopIteration: + self.depleted_buckets[bucket_idx] = True + if self.all_buckets_depleted(): + raise StopIteration() + + if self.batch == 0: + logging.info("batch 0 chunks=%s", str(batch[:10])) + + self.batch += 1 + return batch + + @property + def avg_batch_size(self): + avg_batch_size = 0 + for sampler in self.bucket_samplers: + avg_batch_size += sampler.avg_batch_size + + avg_batch_size /= self.num_buckets + return avg_batch_size + + @staticmethod + def filter_args(**kwargs): + return kwargs diff --git a/hyperion/torch/data/class_weighted_embed_sampler.py b/hyperion/torch/data/class_weighted_embed_sampler.py new file mode 100644 index 00000000..708e12ed --- /dev/null +++ b/hyperion/torch/data/class_weighted_embed_sampler.py @@ -0,0 +1,293 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import math +import time +from typing import Optional + +import numpy as np +import pandas as pd +import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...utils import ClassInfo +from ...utils.misc import filter_func_args +from .hyp_sampler import HypSampler + + +class ClassWeightedEmbedSampler(HypSampler): + def __init__( + self, + embed_set, + class_info: ClassInfo, + batch_size: int = 1, + num_embeds_per_class: int = 1, + weight_exponent: float = 1.0, + weight_mode: str = "custom", + 
num_hard_prototypes: int = 0, + affinity_matrix: Optional[torch.Tensor] = None, + class_name: str = "class_id", + max_batches_per_epoch: Optional[int] = None, + shuffle: bool = False, + seed: int = 1234, + ): + super().__init__( + max_batches_per_epoch=max_batches_per_epoch, shuffle=shuffle, seed=seed + ) + self.class_name = class_name + self.embed_set = embed_set + self.class_info = class_info + self.batch_size = batch_size + self.avg_batch_size = batch_size + + self.num_embeds_per_class = num_embeds_per_class + + self.weight_exponent = weight_exponent + self.weight_mode = weight_mode + + self.num_hard_prototypes = num_hard_prototypes + self.batch = 0 + + self._compute_len() + self._compute_num_classes_per_batch() + self._gather_class_info() + self._set_class_weights() + + self.set_hard_prototypes(affinity_matrix) + + logging.info( + ("sampler batches/epoch=%d batch-size=%d, " "classes/batch=%.2f "), + self._len, + self.batch_size, + self.num_classes_per_batch, + ) + + def _set_seed(self): + if self.shuffle: + self.rng.manual_seed(self.seed + 10 * self.epoch + 100 * self.rank) + else: + self.rng.manual_seed(self.seed + 100 * self.rank) + + def _compute_len(self): + self._len = int( + math.ceil(len(self.embed_set) / self.avg_batch_size / self.world_size) + ) + if self.max_batches_per_epoch is not None: + self._len = min(self._len, self.max_batches_per_epoch) + + def __len__(self): + return self._len + + def _gather_class_info(self): + # we get some extra info that we need for the classes. + # we need the mapping from class index to id + self.map_class_idx_to_ids = self.class_info[["class_idx", "id"]] + self.map_class_idx_to_ids.set_index("class_idx", inplace=True) + + # we need the list of embeddings from each class + # to speed up embedding sampling; + # searching them in each batch would be too slow + map_class_to_embeds = self.embed_set.df[["id", self.class_name]].set_index( + self.class_name + ) + self.map_class_to_embed_idx = {} + for class_id in self.class_info["id"].values: + if class_id in map_class_to_embeds.index: + embed_ids = map_class_to_embeds.loc[class_id, "id"] + if isinstance(embed_ids, str): + embed_ids = [embed_ids] + else: + embed_ids = embed_ids.values + + embed_idx = self.embed_set.get_loc(embed_ids) + else: + embed_idx = [] + self.class_info.loc[class_id, "weights"] = 0.0 + self.class_info.renorm_weights() + + self.map_class_to_embed_idx[class_id] = embed_idx + + def _set_class_weights(self): + if self.weight_mode == "uniform": + self.class_info.set_uniform_weights() + elif self.weight_mode == "data-prior": + weights = self.class_info["total_duration"].values + self.class_info.set_weights(weights) + + if self.weight_exponent != 1.0: + self.class_info.exp_weights(self.weight_exponent) + + @property + def hard_prototype_mining(self): + return self.num_hard_prototypes > 1 + + def set_hard_prototypes(self, affinity_matrix): + if affinity_matrix is None: + self.hard_prototypes = None + return + + # don't sample hard negs from classes with zero weight or absent classes + zero_w = self.class_info["weights"] == 0 + if np.any(zero_w): + zero_w_idx = self.class_info.loc[zero_w, "class_idx"].values + affinity_matrix[:, zero_w_idx] = -1000 + + for i in range(affinity_matrix.size(1)): + mask_i = self.class_info["class_idx"] == i + if np.all(mask_i == 0): + affinity_matrix[:, i] = -1000 + + # hard prototypes for a class are itself and k-1 closest to it. 
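+ # e.g., with num_hard_prototypes=2 and an affinity row [0.9, 0.2, 0.8], + # torch.topk picks class indices [0, 2]: the class itself plus its nearest + # neighbor (illustrative values, not taken from the source).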
+ self.hard_prototypes = torch.topk( + affinity_matrix, self.num_hard_prototypes, dim=-1 + ).indices + + def get_hard_prototypes(self, class_idx): + return self.hard_prototypes[class_idx].flatten().numpy() + + def _compute_num_classes_per_batch(self): + num_classes = self.batch_size / self.num_embeds_per_class + if self.hard_prototype_mining: + num_classes /= self.num_hard_prototypes + self.num_classes_per_batch = int(math.ceil(num_classes)) + + def _get_class_weights( + self, + ): + return torch.as_tensor(self.class_info["weights"].values) + + def _sample_classes(self): + weights = self._get_class_weights() + row_idx = torch.multinomial( + weights, + num_samples=self.num_classes_per_batch, + replacement=True, + generator=self.rng, + ).numpy() + + class_ids = self.class_info.iloc[row_idx].id.values + if self.hard_prototype_mining: + # map class ids to class indexes + class_idx = self.class_info.loc[class_ids, "class_idx"].values + class_idx = self.get_hard_prototypes(class_idx) + # map back to class ids + class_ids = self.map_class_idx_to_ids.loc[class_idx, "id"].values + + return class_ids + + def _sample_embeds(self, class_ids): + + id_col_idx = self.embed_set.get_col_idx("id") + embed_ids = [] + for c in class_ids: + # get embeds belonging to c + embed_idx_c = self.map_class_to_embed_idx[c] + # sample num_embeds_per_class randomly + if len(embed_idx_c) == 0: + logging.error("no embeddings found with class=%s", c) + + sel_idx = torch.randint( + low=0, + high=len(embed_idx_c), + size=(self.num_embeds_per_class,), + generator=self.rng, + ).numpy() + + sel_embed_idx_c = embed_idx_c[sel_idx] + sel_embed_ids_c = list(self.embed_set.iloc[sel_embed_idx_c, id_col_idx]) + embed_ids.extend(sel_embed_ids_c) + + return embed_ids + + def __next__(self): + + if self.batch == self._len: + raise StopIteration + + class_ids = self._sample_classes() + embed_ids = self._sample_embeds(class_ids) + if self.batch == 0: + logging.info("batch 0 uttidx=%s", str(embed_ids[:10])) + + self.batch += 1 + return embed_ids + + @staticmethod + def filter_args(**kwargs): + return filter_func_args(ClassWeightedEmbedSampler.__init__, kwargs) + # valid_args = ( + # "batch_size", + # "num_embeds_per_class", + # "weight_exponent", + # "weight_mode", + # "num_hard_prototypes", + # "class_name", + # "shuffle", + # "seed", + # ) + + # return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--batch-size", + type=int, + default=1, + help=("batch size per gpu"), + ) + + parser.add_argument( + "--num-embeds-per-class", + type=int, + default=1, + help=("number of embeds per class in batch"), + ) + parser.add_argument( + "--weight-exponent", + default=1.0, + type=float, + help=("exponent for class weights"), + ) + parser.add_argument( + "--weight-mode", + default="custom", + choices=["custom", "uniform", "data-prior"], + help=("method to get the class weights"), + ) + + parser.add_argument( + "--num-hard-prototypes", + type=int, + default=0, + help=("number of hard prototype classes per batch"), + ) + + parser.add_argument( + "--shuffle", + action=ActionYesNo, + help="shuffles the embeddings at the beginning of the epoch", + ) + + parser.add_argument( + "--seed", + type=int, + default=1234, + help=("seed for sampler random number generator"), + ) + + parser.add_argument( + "--class-name", + default="class_id", + help="which column in the info table indicates 
the class", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py new file mode 100644 index 00000000..7cadfee2 --- /dev/null +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -0,0 +1,591 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import math +import time +from typing import Optional, Union + +import numpy as np +import pandas as pd +import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...utils import ClassInfo, SegmentSet +from ...utils.misc import filter_func_args +from .hyp_sampler import HypSampler + + +class ClassWeightedRandomSegChunkSampler(HypSampler): + def __init__( + self, + seg_set: SegmentSet, + class_info: ClassInfo, + min_chunk_length: int, + max_chunk_length: Optional[int] = None, + min_batch_size: int = 1, + max_batch_size: Optional[int] = None, + max_batch_length: Optional[int] = None, + num_chunks_per_seg_epoch: Union[str, int] = "auto", + num_segs_per_class: int = 1, + num_chunks_per_seg: int = 1, + weight_exponent: float = 1.0, + weight_mode: str = "custom", + seg_weight_mode: str = "uniform", + num_hard_prototypes: int = 0, + affinity_matrix: Optional[torch.Tensor] = None, + class_name: str = "class_id", + length_name: str = "duration", + max_batches_per_epoch: Optional[int] = None, + shuffle: bool = False, + iters_per_epoch: Optional[int] = None, + batch_size: Optional[int] = None, + seed: int = 1234, + ): + super().__init__( + max_batches_per_epoch=max_batches_per_epoch, shuffle=shuffle, seed=seed + ) + self.class_name = class_name + self.length_name = length_name + self.seg_set = seg_set + self.class_info = class_info + self.min_chunk_length = min_chunk_length + self.max_chunk_length = ( + min_chunk_length if max_chunk_length is None else max_chunk_length + ) + + # computing min-batch-size + if batch_size is not None: + min_batch_size = batch_size + + min_batch_size = max(num_segs_per_class * num_chunks_per_seg, min_batch_size) + + # computing max-batch-size + if max_batch_length is None: + max_batch_size_0 = int(min_batch_size * max_chunk_length / min_chunk_length) + else: + max_batch_size_0 = int(max_batch_length / max_chunk_length) + + max_batch_size = ( + max_batch_size_0 + if max_batch_size is None + else min(max_batch_size_0, max_batch_size) + ) + + self.min_batch_size = min_batch_size + self.max_batch_size = max_batch_size + self.avg_batch_size = (min_batch_size + max_batch_size) / 2 + self.var_batch_size = self.min_batch_size != self.max_batch_size + + self.num_segs_per_class = num_segs_per_class + self.num_chunks_per_seg = num_chunks_per_seg + + self.weight_exponent = weight_exponent + self.weight_mode = weight_mode + self.seg_weight_mode = seg_weight_mode + + self.num_hard_prototypes = num_hard_prototypes + self.batch = 0 + + # compute the number of batches / epoch + # legacy config parameter + num_chunks_per_seg_epoch = ( + iters_per_epoch if iters_per_epoch is not None else num_chunks_per_seg_epoch + ) + self._set_num_chunks_per_seg_epoch(num_chunks_per_seg_epoch) + self._compute_len() + + # # fast mapping from classes to segments + # self.map_class_to_segs = self.seg_set.df[ + # ["id", self.class_name, self.length_name] + # ] + # self.map_class_to_segs.set_index(self.class_name, drop=False, inplace=True) + + 
self._gather_class_info() + self._set_class_weights() + + self.set_hard_prototypes(affinity_matrix) + + logging.info( + ( + "sampler batches/epoch=%d min-batch-size=%d, max-batch-size=%d " + "avg-batch-size/gpu=%.2f avg-classes/batch=%.2f samples/(seg*epoch)=%d" + ), + self._len, + self.min_batch_size, + self.max_batch_size, + self.avg_batch_size, + self.avg_batch_size / num_segs_per_class / num_chunks_per_seg, + self.num_chunks_per_seg_epoch, + ) + + self.counts = {} + + def _set_seed(self): + if self.shuffle: + self.rng.manual_seed(self.seed + 10 * self.epoch + 100 * self.rank) + else: + self.rng.manual_seed(self.seed + 100 * self.rank) + + def _set_num_chunks_per_seg_epoch(self, num_chunks_per_seg_epoch): + if num_chunks_per_seg_epoch == "auto": + self._compute_num_chunks_per_seg_epoch_auto() + else: + self.num_chunks_per_seg_epoch = num_chunks_per_seg_epoch + + def _compute_num_chunks_per_seg_epoch_auto(self): + seg_set = self.seg_set + avg_seg_length = np.mean(seg_set[self.length_name]) + avg_chunk_length = (self.max_chunk_length + self.min_chunk_length) / 2 + self.num_chunks_per_seg_epoch = math.ceil(avg_seg_length / avg_chunk_length) + logging.debug( + "num egs per segment and epoch: %d", self.num_chunks_per_seg_epoch + ) + + def _compute_len(self): + self._len = int( + math.ceil( + self.num_chunks_per_seg_epoch + * len(self.seg_set) + / self.avg_batch_size + / self.world_size + ) + ) + if self.max_batches_per_epoch is not None: + self._len = min(self._len, self.max_batches_per_epoch) + + def __len__(self): + return self._len + + def _gather_class_info(self): + # we get some extra info that we need for the classes. + + # we need the maximum/minimum segment duration for each class. + max_dur = np.zeros(len(self.class_info)) + min_dur = np.zeros(len(self.class_info)) + total_dur = np.zeros(len(self.class_info)) + for i, c in enumerate(self.class_info["id"]): + seg_idx = self.seg_set[self.class_name] == c + if seg_idx.sum() > 0: + durs_i = self.seg_set.loc[seg_idx, self.length_name] + max_dur[i] = durs_i.max() + min_dur[i] = durs_i.min() + total_dur[i] = durs_i.sum() + else: + max_dur[i] = min_dur[i] = total_dur[i] = 0 + + self.class_info["max_seg_duration"] = max_dur + self.class_info["min_seg_duration"] = min_dur + self.class_info["total_duration"] = total_dur + + # we need the mapping from class index to id + self.map_class_idx_to_ids = self.class_info[["class_idx", "id"]] + self.map_class_idx_to_ids.set_index("class_idx", inplace=True) + + # we need the list of segments from each class + # to speed up segment sampling + # searching then in each batch, it is too slow + map_class_to_segs = self.seg_set.df[["id", self.class_name]].set_index( + self.class_name + ) + self.map_class_to_segs_idx = {} + for class_id in self.class_info["id"].values: + if class_id in map_class_to_segs.index: + seg_ids = map_class_to_segs.loc[class_id, "id"] + if isinstance(seg_ids, str): + seg_ids = [seg_ids] + else: + seg_ids = seg_ids.values + + seg_idx = self.seg_set.get_loc(seg_ids) + else: + seg_idx = [] + self.class_info.loc[class_id, "weights"] = 0.0 + self.class_info.renorm_weights() + + self.map_class_to_segs_idx[class_id] = seg_idx + + def _set_class_weights(self): + if self.weight_mode == "uniform": + self.class_info.set_uniform_weights() + elif self.weight_mode == "data-prior": + weights = self.class_info["total_duration"].values + self.class_info.set_weights(weights) + + if self.weight_exponent != 1.0: + self.class_info.exp_weights(self.weight_exponent) + + zero_weight = 
self.class_info["max_seg_duration"] < self.min_chunk_length + if np.any(zero_weight): + self.class_info.set_zero_weight(zero_weight) + + self.var_weights = np.any( + self.seg_set[self.length_name] < self.max_chunk_length + ) + + @property + def hard_prototype_mining(self): + return self.num_hard_prototypes > 1 + + def set_hard_prototypes(self, affinity_matrix): + if affinity_matrix is None: + self.hard_prototypes = None + return + + # don't sample hard negs from classes with zero weight or absent classes + zero_w = self.class_info["weights"] == 0 + if np.any(zero_w): + zero_w_idx = self.class_info.loc[zero_w, "class_idx"].values + affinity_matrix[:, zero_w_idx] = -1000 + + for i in range(affinity_matrix.size(1)): + mask_i = self.class_info["class_idx"] == i + if np.all(mask_i == 0): + affinity_matrix[:, i] = -1000 + + # hard prototypes for a class are itself and k-1 closest to it. + self.hard_prototypes = torch.topk( + affinity_matrix, self.num_hard_prototypes, dim=-1 + ).indices + + def get_hard_prototypes(self, class_idx): + return self.hard_prototypes[class_idx].flatten().numpy() + + def _sample_chunk_length(self): + if self.var_batch_size: + return ( + torch.rand(size=(1,), generator=self.rng).item() + * (self.max_chunk_length - self.min_chunk_length) + + self.min_chunk_length + ) + + return self.min_chunk_length + + def _compute_batch_size(self, chunk_length): + return int(self.min_batch_size * self.max_chunk_length / chunk_length) + + def _compute_num_classes_per_batch(self, batch_size): + num_classes = batch_size / self.num_segs_per_class / self.num_chunks_per_seg + if self.hard_prototype_mining: + num_classes /= self.num_hard_prototypes + return int(math.ceil(num_classes)) + + def _get_class_weights(self, chunk_length): + if not self.var_weights: + return torch.as_tensor(self.class_info["weights"].values) + + # get classes where all segments are shorter than + # chunk length and set their weight to 0 + zero_idx = self.class_info["max_seg_duration"] < chunk_length + if not np.any(zero_idx): + return torch.as_tensor(self.class_info["weights"].values) + + class_weights = self.class_info["weights"].values.copy() + class_weights[zero_idx] = 0.0 + # renormalize weights + class_weights /= class_weights.sum() + return torch.as_tensor(class_weights) + + def _sample_classes(self, num_classes, chunk_length): + weights = self._get_class_weights(chunk_length) + row_idx = torch.multinomial( + weights, + num_samples=num_classes, + replacement=True, + generator=self.rng, + ).numpy() + + class_ids = self.class_info.iloc[row_idx].id.values + if self.hard_prototype_mining: + # map class ids to class indexes + class_idx = self.class_info.loc[class_ids, "class_idx"].values + class_idx = self.get_hard_prototypes(class_idx) + # map back to class ids + class_ids = self.map_class_idx_to_ids.loc[class_idx, "id"].values + + return class_ids + + def _sample_segs(self, class_ids, chunk_length): + + dur_col_idx = self.seg_set.get_col_idx(self.length_name) + id_col_idx = self.seg_set.get_col_idx("id") + + seg_ids = [] + for c in class_ids: + # for each class we sample segments longer than chunk length + # get segments belonging to c + # t1 = time.time() + seg_idx_c = self.map_class_to_segs_idx[c] + # t2 = time.time() + durs = self.seg_set.iloc[seg_idx_c, dur_col_idx].values + if self.class_info.loc[c, "min_seg_duration"] < chunk_length: + mask = durs >= chunk_length + seg_idx_c = seg_idx_c[mask] + durs = durs[mask] + + # t3 = time.time() + # sample num_segs_per_class random segments + if len(seg_idx_c) == 0: + logging.error("no segments found with class=%s dur=%d", c, chunk_length) + if self.seg_weight_mode == "uniform": + sel_idx = torch.randint( + low=0, + high=len(seg_idx_c), + size=(self.num_segs_per_class,), + generator=self.rng, + ).numpy() + + elif self.seg_weight_mode == "data-prior": + weights = durs / durs.sum() + sel_idx = torch.multinomial( + torch.from_numpy(weights), + num_samples=self.num_segs_per_class, + replacement=True, + generator=self.rng, + ).numpy() + # t4 = time.time() + else: + raise ValueError(f"unknown seg-weight-mode={self.seg_weight_mode}") + + sel_seg_idx_c = seg_idx_c[sel_idx] + sel_seg_ids_c = list(self.seg_set.iloc[sel_seg_idx_c, id_col_idx]) + # t5 = time.time() + seg_ids.extend(sel_seg_ids_c) + # t6 = time.time() + # logging.info( + # "stime %f %f %f %f %f", t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5 + # ) + + return seg_ids + + def _sample_chunks(self, seg_ids, chunk_length): + chunks = [] + scale = ( + torch.as_tensor(self.seg_set.loc[seg_ids, self.length_name].values) + - chunk_length + ) + for i in range(self.num_chunks_per_seg): + start = scale * torch.rand(size=(len(seg_ids),), generator=self.rng) + chunks_i = [(id, s.item(), chunk_length) for id, s in zip(seg_ids, start)] + chunks.extend(chunks_i) + + return chunks + + def __next__(self): + + if self.batch == self._len: + raise StopIteration + + # t1 = time.time() + chunk_length = self._sample_chunk_length() + # t2 = time.time() + batch_size = self._compute_batch_size(chunk_length) + # t3 = time.time() + num_classes = self._compute_num_classes_per_batch(batch_size) + # t4 = time.time() + class_ids = self._sample_classes(num_classes, chunk_length) + # for i in class_ids: + # if i in self.counts: + # self.counts[i] += 1 + # else: + # self.counts[i] = 1 + + # mx = 0 + # mn = 1000000000 + # for k, v in self.counts.items(): + # if v > mx: + # mx = v + # if v < mn: + # mn = v + + # t5 = time.time() + seg_ids = self._sample_segs(class_ids, chunk_length) + # t6 = time.time() + chunks = self._sample_chunks(seg_ids, chunk_length) + # t7 = time.time() + # print( + # "next", + # t2 - t1, + # t3 - t2, + # t4 - t3, + # t5 - t4, + # t6 - t5, + # t7 - t6, + # batch_size, + # num_classes, + # self.min_batch_size, + # len(chunks), + # flush=True, + # ) + if self.batch == 0: + logging.info("batch 0 uttidx=%s", str(chunks[:10])) + + self.batch += 1 + return chunks + + @staticmethod + def filter_args(**kwargs): + return filter_func_args(ClassWeightedRandomSegChunkSampler.__init__, kwargs) + + # valid_args = ( + # "min_chunk_length", + # "max_chunk_length", + # "min_batch_size", + # "max_batch_size", + # "max_batch_length", + # "num_chunks_per_seg_epoch", + # "num_segs_per_class", + # "num_chunks_per_seg", + # "weight_exponent", + # "weight_mode", + # "seg_weight_mode", + # "num_hard_prototypes", + # "class_name", + # "length_name", + # "iters_per_epoch", + # "batch_size", + # "shuffle", + # "seed", + # ) + + # return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--min-chunk-length", + type=float, + default=4.0, + help=("minimum length of the segment chunks"), + ) + parser.add_argument( + "--max-chunk-length", + type=float, + default=None, + help=("maximum length of segment chunks"), + ) + + parser.add_argument( + "--min-batch-size", + type=int, + default=1, + help=("minimum batch size per gpu"), + ) + parser.add_argument( + "--max-batch-size", 
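+ # note: if both --max-batch-size and --max-batch-length are given, + # __init__ keeps the smaller of the two resulting batch-size caps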
+ type=int, + default=None, + help=( + "maximum batch size per gpu, if None, estimated from max_batch_length" + ), + ) + + parser.add_argument( + "--batch-size", + default=128, + type=int, + help=("deprecated, use min-batch-size instead"), + ) + + parser.add_argument( + "--max-batch-length", + type=float, + default=None, + help=( + "maximum accumulated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths" + ), + ) + + parser.add_argument( + "--iters-per-epoch", + default=None, + type=lambda x: x if (x == "auto" or x is None) else float(x), + help=("deprecated, use --num-chunks-per-seg-epoch instead"), + ) + + parser.add_argument( + "--num-chunks-per-seg-epoch", + default="auto", + type=lambda x: x if x == "auto" else float(x), + help=("number of times we sample a segment in each epoch"), + ) + + parser.add_argument( + "--num-segs-per-class", + type=int, + default=1, + help=("number of segments per class in batch"), + ) + parser.add_argument( + "--num-chunks-per-seg", + type=int, + default=1, + help=("number of chunks per segment in batch"), + ) + + parser.add_argument( + "--weight-exponent", + default=1.0, + type=float, + help=("exponent for class weights"), + ) + parser.add_argument( + "--weight-mode", + default="custom", + choices=["custom", "uniform", "data-prior"], + help=("method to get the class weights"), + ) + + parser.add_argument( + "--seg-weight-mode", + default="uniform", + choices=["uniform", "data-prior"], + help=("method to sample segments given a class"), + ) + + parser.add_argument( + "--num-hard-prototypes", + type=int, + default=0, + help=("number of hard prototype classes per batch"), + ) + + parser.add_argument( + "--max-batches-per-epoch", + type=int, + default=None, + help=("Max. batches per epoch"), + ) + + parser.add_argument( + "--shuffle", + action=ActionYesNo, + help="shuffles the segments or chunks at the beginning of the epoch", + ) + + parser.add_argument( + "--seed", + type=int, + default=1234, + help=("seed for sampler random number generator"), + ) + + parser.add_argument( + "--length-name", + default="duration", + help="which column in the segment table indicates the duration of the segment", + ) + parser.add_argument( + "--class-name", + default="class_id", + help="which column in the segment table indicates the class of the segment", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/data/dino_audio_dataset.py b/hyperion/torch/data/dino_audio_dataset.py new file mode 100644 index 00000000..15eaca4b --- /dev/null +++ b/hyperion/torch/data/dino_audio_dataset.py @@ -0,0 +1,342 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import math +import time +from typing import Dict, List, Optional + +import numpy as np +import pandas as pd + +# import k2 +import sentencepiece as spm +import torch +import torch.distributed as dist +import torchaudio.transforms as tat +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...io import RandomAccessAudioReader as AR +from ...np.augment import SpeechAugment +from ...utils.class_info import ClassInfo +from ...utils.misc import filter_func_args +from ...utils.segment_set import SegmentSet +from ...utils.text import read_text +from ..torch_defs import floatstr_torch +from .audio_dataset import AudioDataset + + +class DINOAudioDataset(AudioDataset): + """AudioDataset class to train DINO for speech + + Args: + recordings_file: recordings manifest file (kaldi .scp or pandas .csv) + segments_file: segments manifest file (kaldi .scp or pandas .csv) + class_names: list with the names of the types of classes in the datasets, e.g., speaker, language + class_files: list of class info files + time_durs_file: (deprecated) segment to duration in secs file, if durations are not in segments_file + bpe_model: bpe model for the text label + text_file: text file with word labels for each utterance + teacher_aug_cfg: configuration for teacher augmentations + student_aug_cfg: configuration for student augmentations. + aug_cfgs: list of augmentation configuration files + num_augs: number of augmentations per segment and augmentation type + num_aug_mix: number of AugMix augmentations per segment + aug_mix_alpha: AugMix Dirichlet distribution parameter + return_segment_info: list of columns of the segment file which should be returned as supervisions + return_orig: when using augmentation, whether to also return the original audio + target_sample_freq: target sampling frequency, if not None all audios are converted to this sample frequency + wav_scale: make waves to be in [-wav_scale, wav_scale] + is_val: is validation dataset. + seed: random seed + teacher_chunk_length: chunk length for the teacher model + num_teacher_chunks: num teacher chunks in each batch + student_chunk_length: chunk length for the student model + num_student_chunks: num student chunks in each batch + same_teacher_student_chunks: True if teacher and student chunks overlap, False if they are disjoint + """ + + def __init__( + self, + recordings_file: str, + segments_file: str, + class_names: Optional[List[str]] = None, + class_files: Optional[List[str]] = None, + bpe_model: Optional[str] = None, + text_file: Optional[str] = None, + time_durs_file: Optional[str] = None, + teacher_aug_cfg: Optional[str] = None, + student_aug_cfg: Optional[str] = None, + num_augs: int = 1, + num_aug_mix: int = 0, + aug_mix_alpha: float = 0, + return_segment_info: Optional[List[str]] = None, + return_orig: bool = False, + target_sample_freq: Optional[float] = None, + wav_scale: float = 1, + is_val: bool = False, + seed: int = 112358, + teacher_chunk_length: float = 4, + num_teacher_chunks: int = 2, + student_chunk_length: float = 2, + num_student_chunks: int = 4, + same_teacher_student_chunks: bool = False, + ): + aug_cfgs = [] + student_aug_idx = -1 + teacher_aug_idx = -1 + if student_aug_cfg is not None: + aug_cfgs.append(student_aug_cfg) + student_aug_idx = 0 + if teacher_aug_cfg is not None: + assert student_aug_idx != -1, "teacher_aug_cfg requires student_aug_cfg" + if teacher_aug_cfg != student_aug_cfg: + aug_cfgs.append(teacher_aug_cfg) + teacher_aug_idx = 1 + else: + teacher_aug_idx = 0 + + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + self.teacher_chunk_length = teacher_chunk_length + self.num_teacher_chunks = num_teacher_chunks + self.student_chunk_length = student_chunk_length + self.num_student_chunks = num_student_chunks + self.same_teacher_student_chunks = same_teacher_student_chunks + if student_aug_idx != -1: + self.student_augmenter = self.augmenters[student_aug_idx] + if teacher_aug_idx != -1: + self.teacher_augmenter = self.augmenters[teacher_aug_idx] + + def _apply_chunk_augs(self, x, duration, fs, augmenter, tag): + if not augmenter: + return {f"x_{tag}": x} + + if duration == 0: + num_samples = len(x) + else: + num_samples = int(duration * fs) + + reverb_context_samples = len(x) - 
num_samples + x_orig = x[reverb_context_samples:] + x_augs = {} + for j in range(self.num_augs): + # augment x + x_aug, aug_info = augmenter(x) + # remove the extra left context used to compute the reverberation. + x_aug = x_aug[reverb_context_samples : len(x)] + x_aug = x_aug.astype(floatstr_torch(), copy=False) + x_augs[f"x_{tag}_aug_{j}"] = x_aug + + if self.num_aug_mix > 0: + x_augs = self._apply_aug_mix(x_orig, x_augs, 0) + + if self.return_orig: + x_augs[f"x_{tag}"] = x_orig + elif len(x_augs) == 1: + # if we just have one aug and we don't return the clean version, + # we just call x to the aug version + x_augs[f"x_{tag}"] = x_augs.pop(f"x_{tag}_aug_0") + + return x_augs + + def _apply_augs(self, xs, duration, fs, augmenter, tag): + x_augs = {} + for i, x in enumerate(xs): + x_augs_i = self._apply_chunk_augs(x, duration, fs, augmenter, f"{tag}_{i}") + x_augs.update(x_augs_i) + + return x_augs + + def _split_audio_into_chunks(self, x, x_samples, chunk_samples, num_chunks): + reverb_context = len(x) - x_samples + chunk_shift = (x_samples - chunk_samples) // num_chunks + xs = [] + for i in range(num_chunks): + x_start = i * chunk_shift + x_end = x_start + chunk_samples + reverb_context + xs.append(x[x_start:x_end]) + + return xs + + def _split_audio_into_teacher_student_disjoint(self, x, duration, fs): + total_samples = int(duration * fs) + teacher_chunk_samples = int(fs * self.teacher_chunk_length) + student_chunk_samples = int(fs * self.student_chunk_length) + sum_chunk = teacher_chunk_samples + student_chunk_samples + assert total_samples >= sum_chunk, f"signal samples = {len(x)} < {sum_chunk}" + + teacher_crops_x_chunk = self.num_teacher_chunks * teacher_chunk_samples + student_crops_x_chunk = self.num_student_chunks * student_chunk_samples + sum_crops_x_chunk = teacher_crops_x_chunk + student_crops_x_chunk + teacher_samples = max( + teacher_crops_x_chunk * total_samples // sum_crops_x_chunk, + teacher_chunk_samples, + ) + student_samples = total_samples - teacher_samples + # here we decide if we split the audio in [teacher, student] or [student, teacher] + teacher_first = self.rng.random() < 0.5 + + if teacher_first: + x1_samples = teacher_samples + # x2_samples = student_samples + else: + x1_samples = student_samples + # x2_samples = teacher_samples + + max_reverb_context = int(self.reverb_context * fs) + x1_reverb_context = len(x) - total_samples + x1_end_sample = x1_reverb_context + x1_samples + x1 = x[:x1_end_sample] + if x1_end_sample >= max_reverb_context: + x2_reverb_context = max_reverb_context + else: + x2_reverb_context = x1_end_sample + + # print( + # "xxx", + # len(x), + # total_samples, + # teacher_first, + # teacher_samples, + # student_samples, + # x1_reverb_context, + # x1_end_sample, + # x2_reverb_context, + # flush=True, + # ) + x2 = x[x1_end_sample - x2_reverb_context :] + if teacher_first: + x_teacher = x1 + x_student = x2 + else: + x_teacher = x2 + x_student = x1 + + return x_teacher, teacher_samples, x_student, student_samples + + def _split_audio_into_teacher_student_same(self, x, duration, fs): + total_samples = int(duration * fs) + return x, total_samples, x, total_samples + + def _split_audio_into_teacher_student_chunks(self, x, duration, fs): + if self.same_teacher_student_chunks: + ( + x_teacher, + teacher_samples, + x_student, + student_samples, + ) = self._split_audio_into_teacher_student_same(x, duration, fs) + else: + ( + x_teacher, + teacher_samples, + x_student, + student_samples, + ) = self._split_audio_into_teacher_student_disjoint(x, duration, 
fs) + # assert ( + # len(x_teacher) >= 64000 and len(x_teacher) <= 136000 + # ), f"{len(x_teacher)}, {len(x_student)} {len(x)} {duration*fs}, {teacher_samples}, {student_samples}" + # assert ( + # len(x_student) >= 32000 and len(x_student) <= 136000 + # ), f"{len(x_teacher)}, {len(x_student)}, {len(x)} {duration*fs}, {teacher_samples}, {student_samples}" + xs_teacher = self._split_audio_into_chunks( + x_teacher, + teacher_samples, + int(fs * self.teacher_chunk_length), + self.num_teacher_chunks, + ) + xs_student = self._split_audio_into_chunks( + x_student, + student_samples, + int(fs * self.student_chunk_length), + self.num_student_chunks, + ) + # for xx in xs_teacher: + # assert ( + # len(xx) >= 64000 and len(xx) <= 72000 + # ), f"{[len(t) for t in xs_teacher]} {len(x_teacher)} {len(x)}" + # for xx in xs_student: + # assert ( + # len(xx) >= 32000 and len(xx) <= 40000 + # ), f"{[len(t) for t in xs_student]} {len(x_student)} {len(x)}" + + return xs_teacher, xs_student + + def __getitem__(self, segment): + seg_id, start, duration = self._parse_segment_item(segment) + x, fs = self._read_audio(seg_id, start, duration) + x, fs = self._resample(x, fs) + assert len(x) >= int( + duration * fs + ), f"getitem {self.seg_set.loc[seg_id].duration}, {start}, {duration}, {len(x)}" + data = {"seg_id": seg_id, "sample_freq": fs} + xs_teacher, xs_student = self._split_audio_into_teacher_student_chunks( + x, duration, fs + ) + x_augs_teacher = self._apply_augs( + xs_teacher, self.teacher_chunk_length, fs, self.teacher_augmenter, "teacher" + ) + x_augs_student = self._apply_augs( + xs_student, self.student_chunk_length, fs, self.student_augmenter, "student" + ) + data.update(x_augs_teacher) + data.update(x_augs_student) + seg_info = self._get_segment_info(seg_id) + data.update(seg_info) + return data + + @staticmethod + def filter_args(**kwargs): + args = filter_func_args(DINOAudioDataset.__init__, kwargs) + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + skip.add("aug_cfgs") + AudioDataset.add_class_args(parser, skip=skip) + parser.add_argument( + "--teacher-aug-cfg", default=None, help="config for teacher augmentations" + ) + parser.add_argument( + "--student-aug-cfg", default=None, help="config for student augmentations" + ) + parser.add_argument( + "--teacher-chunk-length", + default=4.0, + type=float, + help="chunk length for the teacher model", + ) + parser.add_argument( + "--student-chunk-length", + default=4.0, + type=float, + help="chunk length for the student model", + ) + parser.add_argument( + "--num-teacher-chunks", + default=2, + type=int, + help="num teacher chunks in each batch", + ) + parser.add_argument( + "--num-student-chunks", + default=4, + type=int, + help="num student chunks in each batch", + ) + parser.add_argument( + "--same-teacher-student-chunks", + default=False, + action=ActionYesNo, + help="whether teacher and student chunks overlap", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/data/embed_dataset.py b/hyperion/torch/data/embed_dataset.py index aa244d81..3c4433af 100644 --- a/hyperion/torch/data/embed_dataset.py +++ b/hyperion/torch/data/embed_dataset.py @@ -8,57 +8,61 @@ import logging import time -# import copy - import numpy as np import pandas as pd - import torch +import torch.distributed as dist +from jsonargparse import ActionParser, ActionYesNo, 
diff --git a/hyperion/torch/data/embed_dataset.py b/hyperion/torch/data/embed_dataset.py
index aa244d81..3c4433af 100644
--- a/hyperion/torch/data/embed_dataset.py
+++ b/hyperion/torch/data/embed_dataset.py
@@ -8,57 +8,61 @@ import logging
 import time
 
-# import copy
-
 import numpy as np
 import pandas as pd
-
 import torch
+import torch.distributed as dist
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+from torch.utils.data import Dataset
 
-from ..torch_defs import floatstr_torch
 from ...io import RandomAccessDataReaderFactory as RF
-from ...utils.utt2info import Utt2Info
-
-from torch.utils.data import Dataset
+from ...utils.class_info import ClassInfo
+from ...utils.info_table import InfoTable
+from ...utils.misc import filter_func_args
+from ..torch_defs import floatstr_torch
 
 
 class EmbedDataset(Dataset):
     def __init__(
         self,
         embeds=None,
-        class_ids=None,
-        class_weights=None,
-        rspecifier=None,
-        key_file=None,
-        class_file=None,
+        embed_info=None,
+        class_info=None,
+        embed_file=None,
+        embed_info_file=None,
+        class_names=None,
+        class_files=None,
+        return_segment_info=None,
         path_prefix=None,
         preload_embeds=False,
-        return_class=True,
         is_val=False,
     ):
-
-        assert embeds is not None or rspecifier is not None
-        assert rspecifier is None or key_file is not None
-        assert class_ids is not None or key_file is not None
+        assert embeds is not None or embed_file is not None
+        assert embed_info is not None or embed_info_file is not None
+        assert class_info is not None or class_files is not None
+        super().__init__()
+        try:
+            rank = dist.get_rank()
+            world_size = dist.get_world_size()
+        except:
+            rank = 0
+            world_size = 1
 
+        self.rank = rank
+        self.world_size = world_size
         self.preload_embeds = preload_embeds
-        if key_file is not None:
-            if isinstance(key_file, Utt2Info):
-                self.u2c = key_file
-            else:
-                logging.info("loading utt2info file %s", key_file)
-                self.u2c = Utt2Info.load(key_file, sep=" ")
-            self.num_embeds = len(self.u2c)
-        else:
-            assert embeds is not None
-            self.u2c = None
-            self.num_embeds = len(embeds)
+
+        if embed_info is None:
+            embed_info = InfoTable.load(embed_info_file)
+
+        self.embed_info = embed_info
+        if rank == 0:
+            logging.info("dataset contains %d embeddings", len(self.embed_info))
 
         if embeds is None:
-            logging.info("opening dataset %s", rspecifier)
-            self.r = RF.create(rspecifier, path_prefix=path_prefix, scp_sep=" ")
+            if rank == 0:
+                logging.info("opening dataset %s", embed_file)
+            self.r = RF.create(embed_file, path_prefix=path_prefix, scp_sep=" ")
             if self.preload_embeds:
-                self.embeds = self.r.load(u2c.key, squeeze=True).astype(
+                self.embeds = self.r.load(embed_info["id"], squeeze=True).astype(
                     floatstr_torch(), copy=False
                 )
                 del self.r
@@ -68,65 +72,79 @@ def __init__(
             self.embeds = embeds.astype(floatstr_torch(), copy=False)
 
         self.is_val = is_val
-        self._prepare_class_info(class_file, class_ids, class_weights)
-        self.return_class = return_class
-
-        logging.info("dataset contains %d embeds", self.num_embeds)
+        if rank == 0:
+            logging.info("loading class-info files")
+        self._load_class_infos(class_names, class_files, is_val)
+
+        self.return_segment_info = (
+            [] if return_segment_info is None else return_segment_info
+        )
+
+    def _load_class_infos(self, class_names, class_files, is_val):
+        self.class_info = {}
+        if class_names is None:
+            assert class_files is None
+            return
+
+        assert len(class_names) == len(class_files)
+        for name, file in zip(class_names, class_files):
+            assert (
+                name in self.embed_info
+            ), f"class_name {name} not present in the embedding set"
+            if self.rank == 0:
+                logging.info("loading class-info file %s", file)
+            table = ClassInfo.load(file)
+            self.class_info[name] = table
+            if not is_val:
+                # check that all classes are present in the training embeddings
+                class_ids = table["id"]
+                embed_class_ids = self.embed_info[name].unique()
+                for c_id in class_ids:
+                    if c_id not in embed_class_ids:
+                        logging.warning(
+                            "%s class: %s not present in dataset", name, c_id
+                        )
+
+    @property
+    def num_embeds(self):
+        return len(self.embed_info)
 
     def __len__(self):
         return self.num_embeds
 
-    def _prepare_class_info(self, class_file, class_idx=None, class_weights=None):
-        if class_file is None:
-            if self.u2c is not None:
-                classes, class_idx = np.unique(self.u2c.info, return_inverse=True)
-            self.num_classes = np.max(class_idx) + 1
-        else:
-            logging.info("reading class-file %s", class_file)
-            class_info = pd.read_csv(class_file, header=None, sep=" ")
-            class2idx = {str(k): i for i, k in enumerate(class_info[0])}
-            self.num_classes = len(class2idx)
-            class_idx = np.array([class2idx[k] for k in self.u2c.info], dtype=int)
-            if class_info.shape[1] == 2:
-                class_weights = np.array(class_info[1]).astype(
-                    floatstr_torch(), copy=False
-                )
+    @property
+    def num_classes(self):
+        return {k: t.num_classes for k, t in self.class_info.items()}
 
-        class2utt_idx = {}
-        class2num_utt = np.zeros((self.num_classes,), dtype=int)
-
-        for k in range(self.num_classes):
-            idx = (class_idx == k).nonzero()[0]
-            class2utt_idx[k] = idx
-            class2num_utt[k] = len(idx)
-            if class2num_utt[k] == 0:
-                if not self.is_val:
-                    logging.warning("class %d doesn't have any samples", k)
-                if class_weights is None:
-                    class_weights = np.ones((self.num_classes,), dtype=floatstr_torch())
-                class_weights[k] = 0
-
-        count_empty = np.sum(class2num_utt == 0)
-        if count_empty > 0:
-            logging.warning("%d classes have 0 samples", count_empty)
-
-        self.utt_idx2class = class_idx
-        self.class2utt_idx = class2utt_idx
-        self.class2num_utt = class2num_utt
-        if class_weights is not None:
-            class_weights /= np.sum(class_weights)
-            class_weights = torch.Tensor(class_weights)
-        self.class_weights = class_weights
-
-    def __getitem__(self, index):
+    def _read_embeds(self, embed_id):
         if self.preload_embeds:
+            index = self.embed_info.index.get_loc(embed_id)
             x = self.embeds[index]
         else:
-            key = self.u2c.key[index]
-            x = self.r.read([key])[0].astype(floatstr_torch(), copy=False)
-
-        if not self.return_class:
-            return x
-
-        class_idx = self.utt_idx2class[index]
-        return x, class_idx
+            x = self.r.read([embed_id])[0].astype(floatstr_torch(), copy=False)
+        return x
+
+    def _get_embed_info(self, embed_id):
+        embed_info = {}
+        # converts the class_ids to integers
+        for info_name in self.return_segment_info:
+            embed_info_i = self.embed_info.loc[embed_id, info_name]
+            if info_name in self.class_info:
+                # if the type of information is a class-id
+                # we use the class information table to
+                # convert from id to integer
+                class_info = self.class_info[info_name]
+                embed_info_i = class_info.loc[embed_info_i, "class_idx"]
+
+            embed_info[info_name] = embed_info_i
+
+        return embed_info
+
+    def __getitem__(self, embed_id):
+        x = self._read_embeds(embed_id)
+
+        data = {"embed_id": embed_id, "x": x}
+        # adds the embed labels
+        embed_info = self._get_embed_info(embed_id)
+        data.update(embed_info)
+        return data
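The `_get_embed_info` method above maps string class ids to integer training labels through the class-info table. A minimal sketch of that lookup, using plain pandas frames as stand-ins for the `InfoTable`/`ClassInfo` classes (an assumption, since those classes are not shown in this patch):

```python
import pandas as pd

# stand-in for a ClassInfo table: class id -> integer index
class_info = pd.DataFrame(
    {"id": ["spk1", "spk2", "spk3"], "class_idx": [0, 1, 2]}
).set_index("id")

# stand-in for the embed-info table: embedding id -> class id
embed_info = pd.DataFrame(
    {"id": ["utt1", "utt2"], "speaker": ["spk2", "spk3"]}
).set_index("id")

seg_label = embed_info.loc["utt2", "speaker"]       # -> "spk3"
int_label = class_info.loc[seg_label, "class_idx"]  # -> 2
print(seg_label, int_label)
```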
diff --git a/hyperion/torch/data/embed_sampler.py b/hyperion/torch/data/embed_sampler.py
new file mode 100644
index 00000000..251ba917
--- /dev/null
+++ b/hyperion/torch/data/embed_sampler.py
@@ -0,0 +1,137 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import logging
+import math
+from typing import Optional
+
+import numpy as np
+import torch
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+
+from ...utils.misc import filter_func_args
+from .hyp_sampler import HypSampler
+
+
+class EmbedSampler(HypSampler):
+    def __init__(
+        self,
+        embed_set,
+        batch_size: int = 1,
+        max_batches_per_epoch: Optional[int] = None,
+        shuffle: bool = False,
+        drop_last: bool = False,
+        seed: int = 1234,
+    ):
+        super().__init__(
+            max_batches_per_epoch=max_batches_per_epoch, shuffle=shuffle, seed=seed
+        )
+        self.embed_set = embed_set
+        self.batch_size = batch_size
+        self.avg_batch_size = batch_size
+
+        num_batches = len(self.embed_set) / batch_size / self.world_size
+        if drop_last:
+            self._len = int(num_batches)
+        else:
+            self._len = int(math.ceil(num_batches))
+
+        if self.max_batches_per_epoch is not None:
+            self._len = min(self._len, self.max_batches_per_epoch)
+
+        self._permutation = None
+
+    def __len__(self):
+        return self._len
+
+    def _shuffle_embeds(self):
+        self._permutation = torch.randperm(
+            len(self.embed_set), generator=self.rng
+        ).numpy()
+
+    def __iter__(self):
+        super().__iter__()
+        if self.shuffle:
+            self._shuffle_embeds()
+
+        self.start = self.rank
+        return self
+
+    def __next__(self):
+        if self.batch == self._len:
+            raise StopIteration
+
+        stop = min(
+            self.start + self.world_size * self.batch_size, len(self.embed_set)
+        )
+        if self.shuffle:
+            idx = self._permutation[self.start : stop : self.world_size]
+        else:
+            idx = slice(self.start, stop, self.world_size)
+
+        self.start += self.world_size * self.batch_size
+
+        embed_ids = self.embed_set.iloc[idx].id
+
+        if self.batch == 0:
+            logging.info("batch 0 embed_ids=%s", str(embed_ids[:10]))
+
+        self.batch += 1
+        return embed_ids
+
+    @staticmethod
+    def filter_args(**kwargs):
+        return filter_func_args(EmbedSampler.__init__, kwargs)
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        parser.add_argument(
+            "--batch-size",
+            type=int,
+            default=1,
+            help=("batch size per gpu"),
+        )
+
+        parser.add_argument(
+            "--drop-last",
+            action=ActionYesNo,
+            help="drops the last batch of the epoch",
+        )
+
+        parser.add_argument(
+            "--max-batches-per-epoch",
+            type=int,
+            default=None,
+            help=("Max. batches per epoch"),
+        )
+
+        parser.add_argument(
+            "--shuffle",
+            action=ActionYesNo,
+            help="shuffles the embeddings at the beginning of the epoch",
+        )
+
+        parser.add_argument(
+            "--seed",
+            type=int,
+            default=1234,
+            help=("seed for sampler random number generator"),
+        )
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
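`EmbedSampler.__next__` shards one shared permutation across DDP ranks by offsetting with `rank` and striding by `world_size`, so every rank sees a disjoint slice of the same epoch permutation. A self-contained sketch of that indexing (not hyperion code, toy sizes):

```python
import torch

world_size, batch_size = 2, 3
perm = torch.randperm(10, generator=torch.Generator().manual_seed(1234)).numpy()

for rank in range(world_size):
    start = rank
    stop = min(start + world_size * batch_size, len(perm))
    # rank-local batch: every world_size-th element starting at this rank
    idx = perm[start:stop:world_size]
    print(f"rank {rank}: {idx}")
```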
+ """ + + @staticmethod + def create( + dataset: EmbedDataset, + sampler_type: str = "class_weighted_embed_sampler", + **kwargs, + ): + """Functions that creates a sampler based on a dataset, sampler_type and sampler arguments. + + Args: + dataset: embeddings dataset object containing the data info + sampler_type: string indicating the sampler type. + """ + + sampler_class = sampler_dict[sampler_type] + sampler_kwargs = sampler_class.filter_args(**kwargs) + + if sampler_type in ["class_weighted_embed_sampler"]: + try: + class_name = sampler_kwargs["class_name"] + except: + class_name = "class_id" + sampler_kwargs["class_info"] = dataset.class_info[class_name] + + logging.info(f"sampler-args={sampler_kwargs}") + + return sampler_class(dataset.embed_info, **sampler_kwargs) + + @staticmethod + def filter_args(**kwargs): + + valid_args = ( + "batch_size", + "num_embeds_per_class", + "weight_exponent", + "weight_mode", + "num_hard_prototypes", + "class_name", + "max_batches_per_epoch", + "shuffle", + "seed", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--batch-size", + type=int, + default=1, + help=("batch size per gpu"), + ) + + parser.add_argument( + "--num-embeds-per-class", + type=int, + default=1, + help=("number of embeds per class in batch"), + ) + parser.add_argument( + "--weight-exponent", + default=1.0, + type=float, + help=("exponent for class weights"), + ) + parser.add_argument( + "--weight-mode", + default="custom", + choices=["custom", "uniform", "data-prior"], + help=("method to get the class weights"), + ) + + parser.add_argument( + "--num-hard-prototypes", + type=int, + default=0, + help=("number of hard prototype classes per batch"), + ) + + parser.add_argument( + "--max-batches-per-epoch", + type=int, + default=None, + help=("Max. 
batches per epoch"), + ) + + parser.add_argument( + "--shuffle", + action=ActionYesNo, + help="shuffles the embeddings at the beginning of the epoch", + ) + + parser.add_argument( + "--seed", + type=int, + default=1234, + help=("seed for sampler random number generator"), + ) + + parser.add_argument( + "--class-name", + default="class_id", + help="which column in the info table indicates the class", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/data/feat_seq_dataset.py b/hyperion/torch/data/feat_seq_dataset.py index 2774c899..bb487dda 100644 --- a/hyperion/torch/data/feat_seq_dataset.py +++ b/hyperion/torch/data/feat_seq_dataset.py @@ -3,106 +3,126 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -import logging -from jsonargparse import ArgumentParser, ActionParser -import time import copy +import logging +import os +import sys import threading +import time import numpy as np import pandas as pd +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser import torch +import torch.distributed as dist +from torch.utils.data import Dataset -from ..torch_defs import floatstr_torch from ...io import RandomAccessDataReaderFactory as RF -from ...utils.utt2info import Utt2Info - -from torch.utils.data import Dataset +from ...utils.class_info import ClassInfo +from ...utils.misc import filter_func_args +from ...utils.segment_set import SegmentSet +from ..torch_defs import floatstr_torch class FeatSeqDataset(Dataset): def __init__( self, - rspecifier, - key_file, - class_file=None, + feat_file, + segments_file, + class_names=None, + class_files=None, num_frames_file=None, + return_segment_info=None, path_prefix=None, - min_chunk_length=1, - max_chunk_length=None, - return_fullseqs=False, - return_class=True, transpose_input=True, is_val=False, ): - logging.info("opening dataset %s" % rspecifier) - self.r = RF.create(rspecifier, path_prefix=path_prefix, scp_sep=" ") - logging.info("loading utt2info file %s" % key_file) - self.u2c = Utt2Info.load(key_file, sep=" ") - logging.info("dataset contains %d seqs" % self.num_seqs) + super().__init__() + try: + rank = dist.get_rank() + world_size = dist.get_world_size() + except: + rank = 0 + world_size = 1 + + if rank == 0: + logging.info("opening feature reader %s", feat_file) + + self.r = RF.create(feat_file, path_prefix=path_prefix, scp_sep=" ") + + if rank == 0: + logging.info("loading segments file %s" % segments_file) + + self.seg_set = SegmentSet.load(segments_file) + if rank == 0: + logging.info("dataset contains %d seqs", len(self.seg_set)) self.is_val = is_val - self._seq_lengths = None if num_frames_file is not None: - self._read_num_frames_file(num_frames_file) - self._prune_short_seqs(min_chunk_length) - - self.short_seq_exist = self._seq_shorter_than_max_length_exists( - max_chunk_length - ) + if rank == 0: + logging.info("loading durations file %s", num_frames_file) - self._prepare_class_info(class_file) + time_durs = SegmentSet.load(num_frames_file) + self.seg_set["num_frames"] = time_durs.loc[ + self.seg_set["id"] + ].class_id.values.astype(int, copy=False) + else: + assert "num_frames" in self.seg_set - if max_chunk_length is None: - max_chunk_length = min_chunk_length - self._min_chunk_length = min_chunk_length - self._max_chunk_length = max_chunk_length + logging.info("loading class-info files") + self._load_class_infos(class_names, class_files, is_val) - self.return_fullseqs = return_fullseqs - 
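A hypothetical usage sketch of the factory. The file paths are invented, and the `ClassWeightedEmbedSampler` keyword arguments are assumed from the option list above, since that class is not part of this patch:

```python
from hyperion.torch.data.embed_dataset import EmbedDataset
from hyperion.torch.data.embed_sampler_factory import EmbedSamplerFactory

# all paths below are made-up placeholders
dataset = EmbedDataset(
    embed_file="exp/xvectors/xvector.scp",
    embed_info_file="data/train/segments.csv",
    class_names=["class_id"],
    class_files=["data/train/speakers.csv"],
    return_segment_info=["class_id"],
)
sampler = EmbedSamplerFactory.create(
    dataset,
    sampler_type="class_weighted_embed_sampler",
    batch_size=128,
    num_embeds_per_class=2,
    shuffle=True,
)
for embed_ids in sampler:
    ...  # pass the ids to a DataLoader / collate step
```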
diff --git a/hyperion/torch/data/feat_seq_dataset.py b/hyperion/torch/data/feat_seq_dataset.py
index 2774c899..bb487dda 100644
--- a/hyperion/torch/data/feat_seq_dataset.py
+++ b/hyperion/torch/data/feat_seq_dataset.py
@@ -3,106 +3,126 @@
     Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 
-import sys
-import os
-import logging
-from jsonargparse import ArgumentParser, ActionParser
-import time
 import copy
+import logging
+import os
+import sys
 import threading
+import time
 
 import numpy as np
 import pandas as pd
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
 import torch
+import torch.distributed as dist
+from torch.utils.data import Dataset
 
-from ..torch_defs import floatstr_torch
 from ...io import RandomAccessDataReaderFactory as RF
-from ...utils.utt2info import Utt2Info
-
-from torch.utils.data import Dataset
+from ...utils.class_info import ClassInfo
+from ...utils.misc import filter_func_args
+from ...utils.segment_set import SegmentSet
+from ..torch_defs import floatstr_torch
 
 
 class FeatSeqDataset(Dataset):
     def __init__(
         self,
-        rspecifier,
-        key_file,
-        class_file=None,
+        feat_file,
+        segments_file,
+        class_names=None,
+        class_files=None,
         num_frames_file=None,
+        return_segment_info=None,
         path_prefix=None,
-        min_chunk_length=1,
-        max_chunk_length=None,
-        return_fullseqs=False,
-        return_class=True,
         transpose_input=True,
         is_val=False,
     ):
-        logging.info("opening dataset %s" % rspecifier)
-        self.r = RF.create(rspecifier, path_prefix=path_prefix, scp_sep=" ")
-        logging.info("loading utt2info file %s" % key_file)
-        self.u2c = Utt2Info.load(key_file, sep=" ")
-        logging.info("dataset contains %d seqs" % self.num_seqs)
+        super().__init__()
+        try:
+            rank = dist.get_rank()
+            world_size = dist.get_world_size()
+        except:
+            rank = 0
+            world_size = 1
+
+        self.rank = rank
+        self.world_size = world_size
+
+        if rank == 0:
+            logging.info("opening feature reader %s", feat_file)
+
+        self.r = RF.create(feat_file, path_prefix=path_prefix, scp_sep=" ")
+
+        if rank == 0:
+            logging.info("loading segments file %s", segments_file)
+
+        self.seg_set = SegmentSet.load(segments_file)
+        if rank == 0:
+            logging.info("dataset contains %d seqs", len(self.seg_set))
 
         self.is_val = is_val
-        self._seq_lengths = None
         if num_frames_file is not None:
-            self._read_num_frames_file(num_frames_file)
-        self._prune_short_seqs(min_chunk_length)
-
-        self.short_seq_exist = self._seq_shorter_than_max_length_exists(
-            max_chunk_length
-        )
+            if rank == 0:
+                logging.info("loading durations file %s", num_frames_file)
 
-        self._prepare_class_info(class_file)
+            time_durs = SegmentSet.load(num_frames_file)
+            self.seg_set["num_frames"] = time_durs.loc[
+                self.seg_set["id"]
+            ].class_id.values.astype(int, copy=False)
+        else:
+            assert "num_frames" in self.seg_set
 
-        if max_chunk_length is None:
-            max_chunk_length = min_chunk_length
-        self._min_chunk_length = min_chunk_length
-        self._max_chunk_length = max_chunk_length
+        if rank == 0:
+            logging.info("loading class-info files")
+        self._load_class_infos(class_names, class_files, is_val)
 
-        self.return_fullseqs = return_fullseqs
-        self.return_class = return_class
+        self.return_segment_info = (
+            [] if return_segment_info is None else return_segment_info
+        )
 
         self.transpose_input = transpose_input
 
-    def _read_num_frames_file(self, file_path):
-        logging.info("reading num_frames file %s" % file_path)
-        nf_df = pd.read_csv(file_path, header=None, sep=" ")
-        nf_df.index = nf_df[0]
-        self._seq_lengths = nf_df.loc[self.u2c.key, 1].values
+    def _load_class_infos(self, class_names, class_files, is_val):
+        self.class_info = {}
+        if class_names is None:
+            assert class_files is None
+            return
+
+        assert len(class_names) == len(class_files)
+        for name, file in zip(class_names, class_files):
+            assert (
+                name in self.seg_set
+            ), f"class_name {name} not present in the segment set"
+            if self.rank == 0:
+                logging.info("loading class-info file %s", file)
+            table = ClassInfo.load(file)
+            self.class_info[name] = table
+            if not is_val:
+                # check that all classes are present in the training segments
+                class_ids = table["id"]
+                segment_class_ids = self.seg_set[name].unique()
+                for c_id in class_ids:
+                    if c_id not in segment_class_ids:
+                        logging.warning(
+                            "%s class: %s not present in dataset", name, c_id
+                        )
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
 
     @property
     def num_seqs(self):
-        return len(self.u2c)
+        return len(self.seg_set)
 
     def __len__(self):
         return self.num_seqs
 
     @property
     def seq_lengths(self):
-        if self._seq_lengths is None:
-            self._seq_lengths = self.r.read_num_rows(self.u2c.key)
-
-        return self._seq_lengths
+        return self.seg_set["num_frames"]
 
     @property
     def total_length(self):
         return np.sum(self.seq_lengths)
 
-    @property
-    def min_chunk_length(self):
-        if self.return_fullseqs:
-            self._min_chunk_length = np.min(self.seq_lengths)
-        return self._min_chunk_length
-
-    @property
-    def max_chunk_length(self):
-        if self._max_chunk_length is None:
-            self._max_chunk_length = np.max(self.seq_lengths)
-        return self._max_chunk_length
-
     @property
     def min_seq_length(self):
         return np.min(self.seq_lengths)
@@ -111,223 +131,114 @@ def min_seq_length(self):
     def max_seq_length(self):
         return np.max(self.seq_lengths)
 
-    def _prune_short_seqs(self, min_length):
-        logging.info("pruning short seqs")
-        keep_idx = self.seq_lengths >= min_length
-        self.u2c = self.u2c.filter_index(keep_idx)
-        self._seq_lengths = self.seq_lengths[keep_idx]
-        logging.info(
-            "pruned seqs with min_length < %d,"
-            "keep %d/%d seqs" % (min_length, self.num_seqs, len(keep_idx))
-        )
-
-    def _prepare_class_info(self, class_file):
-        class_weights = None
-        if class_file is None:
-            classes, class_idx = np.unique(self.u2c.info, return_inverse=True)
-            class2idx = {k: i for i, k in enumerate(classes)}
-        else:
-            logging.info("reading class-file %s" % (class_file))
-            class_info = pd.read_csv(class_file, header=None, sep=" ")
-            class2idx = {str(k): i for i, k in enumerate(class_info[0])}
-            class_idx = np.array([class2idx[k] for k in self.u2c.info], dtype=int)
-            if class_info.shape[1] == 2:
-                class_weights = np.array(class_info[1]).astype(
-                    floatstr_torch(), copy=False
-                )
-
-        self.num_classes = len(class2idx)
-
-        class2utt_idx = {}
-        class2num_utt = np.zeros((self.num_classes,), dtype=int)
-
-        for k in range(self.num_classes):
-            idx = (class_idx == k).nonzero()[0]
-            class2utt_idx[k] = idx
-            class2num_utt[k] = len(idx)
-            if class2num_utt[k] == 0:
-                if not self.is_val:
-                    logging.warning("class %d doesn't have any samples" % (k))
-                if class_weights is None:
-                    class_weights = np.ones((self.num_classes,), dtype=floatstr_torch())
-                class_weights[k] = 0
-
-        count_empty = np.sum(class2num_utt == 0)
-        if count_empty > 0:
-            logging.warning("%d classes have 0 samples" % (count_empty))
-
-        self.utt_idx2class = class_idx
-        self.class2utt_idx = class2utt_idx
-        self.class2num_utt = class2num_utt
-        if class_weights is not None:
-            class_weights /= np.sum(class_weights)
-            class_weights = torch.Tensor(class_weights)
-        self.class_weights = class_weights
-
-        if self.short_seq_exist:
-            # if there are seqs shorter than max_chunk_length we need some extra variables
-            # we will need class_weights to put to 0 classes that have all utts shorter than the batch chunk length
-            if self.class_weights is None:
-                self.class_weights = torch.ones((self.num_classes,))
-
-            # we need the max length of the utterances of each class
-            class2max_length = torch.zeros((self.num_classes,), dtype=torch.int)
-            for c in range(self.num_classes):
-                if class2num_utt[c] > 0:
-                    class2max_length[c] = int(
-                        np.max(self.seq_lengths[self.class2utt_idx[c]])
-                    )
-
-            self.class2max_length = class2max_length
-
-    def _seq_shorter_than_max_length_exists(self, max_length):
-        return np.any(self.seq_lengths < max_length)
-
-    @property
-    def var_chunk_length(self):
-        return self.min_chunk_length < self.max_chunk_length
-
-    def get_random_chunk_length(self):
-        if self.var_chunk_length:
-            return torch.randint(
-                low=self.min_chunk_length, high=self.max_chunk_length + 1, size=(1,)
-            ).item()
-
-        return self.max_chunk_length
-
+    @property
+    def num_classes(self):
+        return {k: t.num_classes for k, t in self.class_info.items()}
+
+    def _parse_segment_item(self, segment):
+        if isinstance(segment, (tuple, list)):
+            seg_id, start, num_frames = segment
+            assert num_frames <= self.seg_set.loc[seg_id].num_frames, (
+                f"{seg_id} with start={start} num_frames "
+                f"({self.seg_set.loc[seg_id].num_frames}) < "
+                f"chunk duration ({num_frames})"
+            )
+        else:
+            seg_id, start, num_frames = segment, 0, 0
 
-    def __getitem__(self, index):
-        # logging.info('{} {} {} get item {}'.format(
-        #     self, os.getpid(), threading.get_ident(), index))
-        if self.return_fullseqs:
-            return self._get_fullseq(index)
-        else:
-            return self._get_random_chunk(index)
+        if "start" in self.seg_set:
+            start += self.seg_set.loc[seg_id].start
 
-    def _get_fullseq(self, index):
-        key = self.u2c.key[index]
-        x = self.r.read([key])[0].astype(floatstr_torch(), copy=False)
-        if self.transpose_input:
-            x = x.T
-        if not self.return_class:
-            return x
+        return seg_id, int(start), int(num_frames)
 
-        class_idx = self.utt_idx2class[index]
-        return x, class_idx
+    def _read_feats(self, seg_id, start, num_frames):
+        x = self.r.read([seg_id], row_offset=start, num_rows=num_frames)[0].astype(
+            floatstr_torch(), copy=False
+        )
+        return x
 
-    def _get_random_chunk(self, index):
-        if len(index) == 2:
-            index, chunk_length = index
-        else:
-            chunk_length = self.max_chunk_length
-
-        key = self.u2c.key[index]
-        full_seq_length = int(self.seq_lengths[index])
-        assert (
-            chunk_length <= full_seq_length
-        ), "chunk_length(%d) <= full_seq_length(%d)" % (chunk_length, full_seq_length)
-        first_frame = torch.randint(
-            low=0, high=full_seq_length - chunk_length + 1, size=(1,)
-        ).item()
-
-        x = self.r.read([key], row_offset=first_frame, num_rows=chunk_length)[0].astype(
-            floatstr_torch(), copy=False
-        )
+    def _get_segment_info(self, seg_id):
+        seg_info = {}
+        # converts the class_ids to integers
+        for info_name in self.return_segment_info:
+            seg_info_i = self.seg_set.loc[seg_id, info_name]
+            if info_name in self.class_info:
+                # if the type of information is a class-id
+                # we use the class information table to
+                # convert from id to integer
+                class_info = self.class_info[info_name]
+                seg_info_i = class_info.loc[seg_info_i, "class_idx"]
+
+            seg_info[info_name] = seg_info_i
+
+        return seg_info
+
+    def __getitem__(self, segment):
+        seg_id, start, num_frames = self._parse_segment_item(segment)
+        x = self._read_feats(seg_id, start, num_frames)
+        num_frames = x.shape[0]
         if self.transpose_input:
             x = x.T
-        if not self.return_class:
-            return x
 
-        class_idx = self.utt_idx2class[index]
-        return x, class_idx
+        data = {"seg_id": seg_id, "x": x, "x_lengths": num_frames}
+
+        # adds the segment labels
+        seg_info = self._get_segment_info(seg_id)
+        data.update(seg_info)
+        return data
 
     @staticmethod
     def filter_args(**kwargs):
-        valid_args = (
-            "path_prefix",
-            "class_file",
-            "num_frames_file",
-            "min_chunk_length",
-            "max_chunk_length",
-            "return_fullseqs",
-            "part_idx",
-            "num_parts",
-        )
-        return dict((k, kwargs[k]) for k in valid_args if k in kwargs)
+        return filter_func_args(FeatSeqDataset.__init__, kwargs)
 
     @staticmethod
-    def add_class_args(parser, prefix=None):
+    def add_class_args(parser, prefix=None, skip=set()):
         if prefix is not None:
             outer_parser = parser
             parser = ArgumentParser(prog="")
 
-        parser.add_argument(
-            "--path-prefix", default="", help=("path prefix for rspecifier scp file")
-        )
+        if "feat_file" not in skip:
+            parser.add_argument(
+                "--feat-file", required=True, help=("feature manifest file"),
+            )
 
-        parser.add_argument(
-            "--class-file",
-            default=None,
-            help=("ordered list of classes keys, it can contain class weights"),
-        )
+        if "segments_file" not in skip:
+            parser.add_argument(
+                "--segments-file", required=True, help=("segments manifest file"),
+            )
 
         parser.add_argument(
-            "--num-frames-file",
+            "--class-names",
             default=None,
+            nargs="+",
             help=(
-                "utt to num_frames file, if None it reads from the dataset "
-                "but it is slow"
+                "list with the names of the types of classes in the datasets, e.g., speaker, language"
             ),
         )
 
         parser.add_argument(
-            "--min-chunk-length",
-            type=int,
-            default=None,
-            help=("minimum length of sequence chunks"),
+            "--class-files", default=None, nargs="+", help=("list of class info files"),
         )
+
         parser.add_argument(
-            "--max-chunk-length",
-            type=int,
+            "--num-frames-file",
             default=None,
-            help=("maximum length of sequence chunks"),
+            help=("segment to num-frames file, if durations are not in segments_file"),
         )
 
         parser.add_argument(
-            "--return-fullseqs",
-            default=False,
-            action="store_true",
-            help=("returns full sequences instead of chunks"),
+            "--return-segment-info",
+            default=None,
+            nargs="+",
+            help=(
+                "list of columns of the segment file which should be returned as supervisions"
+            ),
         )
 
-        # parser.add_argument('--part-idx',
-        #                     type=int, default=1,
-        #                     help=('splits the list of files in num-parts and process part_idx'))
-        # parser.add_argument('--num-parts',
-        #                     type=int, default=1,
-        #                     help=('splits the list of files in num-parts and process part_idx'))
+        parser.add_argument(
+            "--path-prefix", default="", help=("path prefix for rspecifier scp file")
+        )
+
+        RF.add_class_args(parser)
 
         if prefix is not None:
             outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
-            # help='feature sequence dataset options')
-
-    add_argparse_args = add_class_args
diff --git a/hyperion/torch/data/hyp_sampler.py b/hyperion/torch/data/hyp_sampler.py
new file mode 100644
index 00000000..30010f5e
--- /dev/null
+++ b/hyperion/torch/data/hyp_sampler.py
@@ -0,0 +1,57 @@
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import logging
+import math
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from jsonargparse import ActionParser, ArgumentParser
+from torch.utils.data import Sampler
+
+
+class HypSampler(Sampler):
+    def __init__(
+        self,
+        max_batches_per_epoch: Optional[int] = None,
+        shuffle: bool = False,
+        seed: int = 1234,
+    ):
+        super().__init__(None)
+        self.epoch = 0
+        self.batch = 0
+        self.init_batch = 0
+        self.shuffle = shuffle
+        self.seed = seed
+        self.max_batches_per_epoch = max_batches_per_epoch
+
+        try:
+            rank = dist.get_rank()
+            world_size = dist.get_world_size()
+        except:
+            rank = 0
+            world_size = 1
+
+        self.rank = rank
+        self.world_size = world_size
+        self.rng = torch.Generator()
+
+    def set_epoch(self, epoch, batch=0):
+        self.epoch = epoch
+        self.init_batch = batch
+
+    def _set_seed(self):
+        if self.shuffle:
+            self.rng.manual_seed(self.seed + 10 * self.epoch + 100 * self.init_batch)
+        else:
+            self.rng.manual_seed(self.seed)
+
+    def __iter__(self):
+        self.batch = self.init_batch
+        self.init_batch = 0
+        self._set_seed()
+        return self
diff --git a/hyperion/torch/data/paired_feat_seq_dataset.py b/hyperion/torch/data/paired_feat_seq_dataset.py
index 671bb6bf..eff2ed58 100644
--- a/hyperion/torch/data/paired_feat_seq_dataset.py
+++ b/hyperion/torch/data/paired_feat_seq_dataset.py
@@ -4,13 +4,13 @@
 """
 
 import logging
+
 import numpy as np
 import torch
 
-from ..torch_defs import floatstr_torch
-
 from ...utils.utt2info import Utt2Info
+from ..torch_defs import floatstr_torch
 from .feat_seq_dataset import FeatSeqDataset
diff --git a/hyperion/torch/data/seg_chunk_sampler.py b/hyperion/torch/data/seg_chunk_sampler.py
new file mode 100644
index 00000000..e6c78775
--- /dev/null
+++ b/hyperion/torch/data/seg_chunk_sampler.py
@@ -0,0 +1,162 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import logging
+import math
+from typing import Optional, Type
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.distributed as dist
+from jsonargparse import ActionParser, ArgumentParser
+
+from ...utils import SegmentSet
+from ...utils.misc import filter_func_args
+from .hyp_sampler import HypSampler
+from .seg_sampler import SegSampler
+
+
+class SegChunkSampler(HypSampler):
+    def __init__(
+        self,
+        seg_set: SegmentSet,
+        min_chunk_length: int,
+        max_chunk_length: Optional[int] = None,
+        base_sampler: Type[HypSampler] = SegSampler,
+        length_name: str = "duration",
+        max_batches_per_epoch: Optional[int] = None,
+        shuffle: bool = False,
+        seed: int = 1234,
+        **base_kwargs,
+    ):
+        super().__init__(shuffle=shuffle, seed=seed)
+        self.seg_set = seg_set
+        self.min_chunk_length = min_chunk_length
+        self.max_chunk_length = (
+            min_chunk_length if max_chunk_length is None else max_chunk_length
+        )
+        self.avg_chunk_length = (self.max_chunk_length + self.min_chunk_length) / 2
+        self.chunk_set = None
+        self.length_name = length_name
+        self.chunk_sampler = base_sampler
+        if "subbase_sampler" in base_kwargs:
+            base_kwargs["base_sampler"] = base_kwargs.pop("subbase_sampler")
+
+        self.base_kwargs = base_sampler.filter_args(**base_kwargs)
+        self.base_kwargs["seed"] = seed
+        self.base_kwargs["shuffle"] = shuffle
+        self.base_kwargs["max_batches_per_epoch"] = max_batches_per_epoch
+
+        self.__iter__()
+        self.avg_batch_size = self._seg_sampler.avg_batch_size
+
+    def __len__(self):
+        return len(self._seg_sampler)
+
+    @property
+    def duration_is_random(self):
+        return self.min_chunk_length != self.max_chunk_length
+
+    def get_random_duration(self):
+        if self.duration_is_random:
+            return (
+                torch.rand(size=(1,), generator=self.rng).item()
+                * (self.max_chunk_length - self.min_chunk_length)
+                + self.min_chunk_length
+            )
+        else:
+            return self.min_chunk_length
+
+    def _create_chunks(self):
+        chunks = []
+        for seg_id, seg_len in zip(self.seg_set["id"], self.seg_set[self.length_name]):
+            if seg_len < self.min_chunk_length:
+                # discard too short sequences
+                continue
+
+            # making it this way, we get the same number of chunks in all epochs
+            num_chunks = math.ceil(seg_len / self.avg_chunk_length)
+            start = 0
+            for i in range(num_chunks - 1):
+                remainder = seg_len - start
+                if remainder < self.min_chunk_length:
+                    remainder = self.min_chunk_length
+                    dur = remainder
+                    start = seg_len - dur
+                else:
+                    dur = self.get_random_duration()
+                    if dur > remainder:
+                        dur = remainder
+
+                chunk = (f"{seg_id}-{i}", seg_id, start, dur)
+                chunks.append(chunk)
+                start += dur
+
+            # special treatment for the last chunk we get from the recording
+            remainder = seg_len - start
+            chunk_id = f"{seg_id}-{num_chunks - 1}"
+            if remainder > self.max_chunk_length:
+                # here we discard part of the end
+                chunk = (chunk_id, seg_id, start, self.max_chunk_length)
+            elif remainder < self.min_chunk_length:
+                # here we overlap with the second-to-last chunk
+                chunk = (
+                    chunk_id,
+                    seg_id,
+                    seg_len - self.min_chunk_length,
+                    self.min_chunk_length,
+                )
+            else:
+                # here the last chunk is whatever is left
+                chunk = (chunk_id, seg_id, start, remainder)
+
+            chunks.append(chunk)
+
+        chunk_set = pd.DataFrame(
+            chunks, columns=["id", "seg_id", "chunk_start", self.length_name]
+        )
+        self.chunk_set = SegmentSet(chunk_set)
+
+    def __iter__(self):
+        super().__iter__()
+        self._create_chunks()
+        self._seg_sampler = self.chunk_sampler(self.chunk_set, **self.base_kwargs)
+        self._seg_sampler.set_epoch(self.epoch)
+        self._seg_sampler.__iter__()
+
+        return self
+
+    def __next__(self):
+        return next(self._seg_sampler)
+
+    @staticmethod
+    def filter_args(**kwargs):
+        return kwargs
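The chunking policy in `_create_chunks` is the subtle part of `SegChunkSampler`: every epoch a segment yields the same `ceil(length / avg_chunk)` chunks, with random durations in `[min_chunk, max_chunk]` and a special-cased last chunk. A standalone re-implementation of the policy for illustration (durations in seconds, made-up values, not hyperion code):

```python
import math
import random

def make_chunks(seg_len, min_chunk, max_chunk):
    avg_chunk = (min_chunk + max_chunk) / 2
    num_chunks = math.ceil(seg_len / avg_chunk)
    chunks, start = [], 0.0
    for _ in range(num_chunks - 1):
        remainder = seg_len - start
        if remainder < min_chunk:
            # too little left: overlap backwards with the previous chunk
            dur, start = min_chunk, seg_len - min_chunk
        else:
            dur = min(random.uniform(min_chunk, max_chunk), remainder)
        chunks.append((round(start, 2), round(dur, 2)))
        start += dur
    remainder = seg_len - start
    if remainder > max_chunk:            # clip the tail
        chunks.append((round(start, 2), max_chunk))
    elif remainder < min_chunk:          # overlap with the second-to-last chunk
        chunks.append((round(seg_len - min_chunk, 2), min_chunk))
    else:                                # keep whatever is left
        chunks.append((round(start, 2), round(remainder, 2)))
    return chunks

random.seed(0)
print(make_chunks(23.5, min_chunk=2.0, max_chunk=4.0))
```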
diff --git a/hyperion/torch/data/seg_sampler.py b/hyperion/torch/data/seg_sampler.py
new file mode 100644
index 00000000..bb3a37ac
--- /dev/null
+++ b/hyperion/torch/data/seg_sampler.py
@@ -0,0 +1,222 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import logging
+import math
+from typing import Optional
+
+import numpy as np
+import torch
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+
+from ...utils import SegmentSet
+from ...utils.misc import filter_func_args
+from .hyp_sampler import HypSampler
+
+
+class SegSampler(HypSampler):
+    def __init__(
+        self,
+        seg_set: SegmentSet,
+        min_batch_size: int = 1,
+        max_batch_size: Optional[int] = None,
+        max_batch_length: Optional[int] = None,
+        length_name: str = "duration",
+        max_batches_per_epoch: Optional[int] = None,
+        shuffle: bool = False,
+        drop_last: bool = False,
+        sort_by_length: bool = True,
+        seed: int = 1234,
+    ):
+        super().__init__(
+            max_batches_per_epoch=max_batches_per_epoch, shuffle=shuffle, seed=seed
+        )
+        self.seg_set = seg_set
+        self.min_batch_size = min_batch_size
+        self.max_batch_size = max_batch_size
+        self.max_batch_length = max_batch_length
+        self.var_batch_size = max_batch_length is not None
+        self.length_name = length_name
+        self.sort_by_length = sort_by_length
+        if self.var_batch_size:
+            avg_batch_size = max_batch_length / np.mean(self.seg_set[self.length_name])
+        else:
+            avg_batch_size = min_batch_size
+
+        self.avg_batch_size = avg_batch_size
+
+        if drop_last:
+            self._len = int(len(self.seg_set) / (avg_batch_size * self.world_size))
+        else:
+            self._len = int(
+                math.ceil((len(self.seg_set) // self.world_size) / avg_batch_size)
+            )
+
+        if self.max_batches_per_epoch is not None:
+            self._len = min(self._len, self.max_batches_per_epoch)
+
+        self._permutation = None
+
+    def __len__(self):
+        return self._len
+
+    def _shuffle_segs(self):
+        self._permutation = torch.randperm(
+            len(self.seg_set), generator=self.rng
+        ).numpy()
+
+    def __iter__(self):
+        super().__iter__()
+        if self.shuffle:
+            self._shuffle_segs()
+
+        self.start = self.rank
+        return self
+
+    def __next__(self):
+        if self.batch == self._len:
+            raise StopIteration
+
+        if self.var_batch_size:
+            column_idx = self.seg_set.columns.get_loc(self.length_name)
+            idxs = []
+            max_length = 0
+            batch_size = 0
+            while True:
+                if self.shuffle:
+                    idx = self._permutation[self.start]
+                else:
+                    idx = self.start
+
+                max_length = max(max_length, self.seg_set.iloc[idx, column_idx])
+                if max_length * (batch_size + 1) > self.max_batch_length:
+                    break
+
+                idxs.append(idx)
+                self.start = (self.start + self.world_size) % len(self.seg_set)
+                batch_size += 1
+                if (
+                    self.max_batch_size is not None
+                    and batch_size >= self.max_batch_size
+                ):
+                    break
+
+            assert (
+                len(idxs) >= 1
+            ), f"increase max_batch_length {self.max_batch_length} >= {max_length}"
+        else:
+            stop = min(
+                self.start + self.world_size * self.min_batch_size, len(self.seg_set)
+            )
+            if self.shuffle:
+                idxs = self._permutation[self.start : stop : self.world_size]
+            else:
+                idxs = slice(self.start, stop, self.world_size)
+
+            self.start += self.world_size * self.min_batch_size
+
+        ids = self.seg_set.iloc[idxs].id.values
+        if self.sort_by_length:
+            lengths = self.seg_set.loc[ids, self.length_name].values
+            sort_idx = np.argsort(lengths)[::-1]
+            ids = ids[sort_idx]
+
+        if "chunk_start" in self.seg_set:
+            chunks = self.seg_set.loc[ids]
+            seg_ids = [
+                (id, s, d)
+                for id, s, d in zip(
+                    chunks.seg_id, chunks.chunk_start, chunks[self.length_name]
+                )
+            ]
+        else:
+            seg_ids = ids
+
+        if self.batch == 0:
+            logging.info("batch 0 seg_ids=%s", str(seg_ids[:10]))
+
+        self.batch += 1
+        return seg_ids
+
+    @staticmethod
+    def filter_args(**kwargs):
+        return filter_func_args(SegSampler.__init__, kwargs, skip={"seg_set"})
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        parser.add_argument(
+            "--min-batch-size",
+            type=int,
+            default=1,
+            help=("minimum batch size per gpu"),
+        )
+        parser.add_argument(
+            "--max-batch-size",
+            type=int,
+            default=None,
+            help=(
+                "maximum batch size per gpu, if None, estimated from max_batch_length"
+            ),
+        )
+
+        parser.add_argument(
+            "--max-batch-length",
+            type=float,
+            default=None,
+            help=(
+                "maximum accumulated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths"
+            ),
+        )
+
+        parser.add_argument(
+            "--drop-last",
+            action=ActionYesNo,
+            help="drops the last batch of the epoch",
+        )
+
+        parser.add_argument(
+            "--max-batches-per-epoch",
+            type=int,
+            default=None,
+            help=("Max. batches per epoch"),
+        )
+
+        parser.add_argument(
+            "--shuffle",
+            action=ActionYesNo,
+            help="shuffles the segments or chunks at the beginning of the epoch",
+        )
+
+        parser.add_argument(
+            "--seed",
+            type=int,
+            default=1234,
+            help=("seed for sampler random number generator"),
+        )
+
+        parser.add_argument(
+            "--length-name",
+            default="duration",
+            help="which column in the segment table indicates the duration of the file",
+        )
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
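With `max_batch_length` set, `SegSampler.__next__` grows the batch while `longest item × batch size` stays under the budget, which bounds the padded tensor size once the batch is sorted by length. The core loop, sketched on made-up durations (not hyperion code):

```python
lengths = [9.0, 3.5, 7.2, 2.0, 4.1, 8.3]  # segment durations in seconds
max_batch_length = 20.0                    # padded-duration budget

batch, max_len = [], 0.0
for i, l in enumerate(lengths):
    # adding item i would pad every item to max(max_len, l) seconds
    if max(max_len, l) * (len(batch) + 1) > max_batch_length:
        break
    batch.append(i)
    max_len = max(max_len, l)

print(batch, max_len)  # -> [0, 1] 9.0 : a third item would exceed 20s padded
```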
+ """ + + sampler_class = sampler_dict[sampler_type] + sampler_kwargs = sampler_class.filter_args(**kwargs) + + if sampler_type in ["bucketing_seg_sampler", "seg_chunk_sampler"]: + base_sampler_class = sampler_dict[base_sampler_type] + base_sampler_kwargs = base_sampler_class.filter_args(**kwargs) + sampler_kwargs.update(base_sampler_kwargs) + sampler_kwargs["base_sampler"] = base_sampler_class + if base_sampler_type == "bucketing_seg_sampler": + base_sampler_class = sampler_dict[subbase_sampler_type] + base_sampler_kwargs = base_sampler_class.filter_args(**kwargs) + sampler_kwargs.update(base_sampler_kwargs) + + if sampler_type in ["class_weighted_random_seg_chunk_sampler"]: + try: + class_name = sampler_kwargs["class_name"] + except: + class_name = "class_id" + sampler_kwargs["class_info"] = dataset.class_info[class_name] + + logging.info(f"sampler-args={sampler_kwargs}") + + return sampler_class(dataset.seg_set, **sampler_kwargs) + + @staticmethod + def filter_args(**kwargs): + + valid_args = ( + "sampler_type", + "num_buckets", + "min_chunk_length", + "max_chunk_length", + "min_batch_size", + "max_batch_size", + "max_batch_length", + "num_chunks_per_seg_epoch", + "num_segs_per_class", + "num_chunks_per_seg", + "weight_mode", + "weight_exponent", + "seg_weight_mode", + "num_hard_prototypes", + "class_name", + "length_name", + "iters_per_epoch", + "batch_size", + "max_batches_per_epoch", + "shuffle", + "drop_last", + "sort_by_length", + "seed", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--sampler-type", + choices=sampler_dict.keys(), + default="class_weighted_random_seg_chunk_sampler", + help="batch sampler type", + ) + + parser.add_argument( + "--base-sampler-type", + choices=["seg_sampler", "bucketing_seg_sampler"], + default="seg_sampler", + help="base sampler used for seg_chunk_sampler or bucketing_seg_sampler", + ) + + parser.add_argument( + "--min-chunk-length", + type=float, + default=4.0, + help=("minimum length of the segment chunks"), + ) + + parser.add_argument( + "--max-chunk-length", + type=float, + default=None, + help=("maximum length of segment chunks"), + ) + + parser.add_argument( + "--min-batch-size", + type=int, + default=64, + help=("minimum batch size per gpu"), + ) + parser.add_argument( + "--max-batch-size", + type=int, + default=None, + help=( + "maximum batch size per gpu, if None, estimated from max_batch_length" + ), + ) + + parser.add_argument( + "--batch-size", + default=None, + type=int, + help=("deprecated, use min-batch-size instead"), + ) + + parser.add_argument( + "--max-batch-length", + type=float, + default=None, + help=( + "maximum accumlated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths" + ), + ) + + parser.add_argument( + "--iters-per-epoch", + default=None, + type=lambda x: x if (x == "auto" or x is None) else float(x), + help=("deprecated, use --num-egs-per-seg-epoch instead"), + ) + + parser.add_argument( + "--num-chunks-per-seg-epoch", + default="auto", + type=lambda x: x if x == "auto" else float(x), + help=("number of times we sample a segment in each epoch"), + ) + + parser.add_argument( + "--num-segs-per-class", + type=int, + default=1, + help=("number of segments per class in batch"), + ) + parser.add_argument( + "--num-chunks-per-seg", + type=int, + default=1, + help=("number of chunks per 
segment in batch"), + ) + + parser.add_argument( + "--weight-exponent", + default=1.0, + type=float, + help=("exponent for class weights"), + ) + parser.add_argument( + "--weight-mode", + default="custom", + choices=["custom", "uniform", "data-prior"], + help=("method to get the class weights"), + ) + + parser.add_argument( + "--seg-weight-mode", + default="uniform", + choices=["uniform", "data-prior"], + help=("method to sample segments given a class"), + ) + + parser.add_argument( + "--num-hard-prototypes", + type=int, + default=0, + help=("number of hard prototype classes per batch"), + ) + + parser.add_argument( + "--drop-last", + action=ActionYesNo, + help="drops the last batch of the epoch", + ) + + parser.add_argument( + "--max-batches-per-epoch", + type=int, + default=None, + help=("Max. batches per epoch"), + ) + + parser.add_argument( + "--shuffle", + action=ActionYesNo, + help="shuffles the segments or chunks at the beginning of the epoch", + ) + parser.add_argument( + "--seed", + type=int, + default=1234, + help=("seed for sampler random number generator"), + ) + + parser.add_argument( + "--length-name", + default="duration", + help="which column in the segment table indicates the duration of the segment", + ) + parser.add_argument( + "--class-name", + default="class_id", + help="which column in the segment table indicates the class of the segment", + ) + parser.add_argument( + "--sort-by-length", + default=True, + action=ActionYesNo, + help="sort sequences in the batch by duration", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/data/weighted_embed_sampler.py b/hyperion/torch/data/weighted_embed_sampler.py index 61e4a0ad..5870512a 100644 --- a/hyperion/torch/data/weighted_embed_sampler.py +++ b/hyperion/torch/data/weighted_embed_sampler.py @@ -2,9 +2,9 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging # import os import math -import logging import numpy as np @@ -36,6 +36,10 @@ def __iter__(self): self.batch = 0 return self + @property + def avg_batch_size(self): + return self.batch_size + def _remove_duplicate_idx(self, utt_idx): utt_idx_uniq = torch.unique(utt_idx) c = 0 diff --git a/hyperion/torch/data/weighted_seq_sampler.py b/hyperion/torch/data/weighted_seq_sampler.py index 9d128bb8..b6f0b670 100644 --- a/hyperion/torch/data/weighted_seq_sampler.py +++ b/hyperion/torch/data/weighted_seq_sampler.py @@ -2,27 +2,49 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging # import os import math -from jsonargparse import ArgumentParser, ActionParser -import logging import numpy as np +from jsonargparse import ActionParser, ArgumentParser import torch -from torch.utils.data import Sampler import torch.distributed as dist +from torch.utils.data import Sampler class ClassWeightedSeqSampler(Sampler): + """Samples utterances following: + 1. It samples a class with a given probability. + 2. It samples an random utterance from the class. + + Attributes: + dataset: dataset containing audio or feature sequences. + batch_size: batch size per gpu for the largest chunk-size. + num_egs_per_utt_epoch: number of samples per utterance and epoch. + num_egs_per_class: number of samples per class in each batch. + num_egs_per_utt: number of samples per utterance in each batch. 
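A hypothetical end-to-end sketch of the factory with a `FeatSeqDataset`. The manifest paths are invented, and the `ClassWeightedRandomSegChunkSampler` keyword arguments are assumed from the CLI options above, since that sampler is not shown in this patch:

```python
from hyperion.torch.data.feat_seq_dataset import FeatSeqDataset
from hyperion.torch.data.seg_sampler_factory import SegSamplerFactory

# all paths below are made-up placeholders; the segments file is assumed
# to carry a num_frames column
dataset = FeatSeqDataset(
    feat_file="data/train/feats.scp",
    segments_file="data/train/segments.csv",
    class_names=["class_id"],
    class_files=["data/train/speakers.csv"],
    return_segment_info=["class_id"],
)
sampler = SegSamplerFactory.create(
    dataset,
    sampler_type="class_weighted_random_seg_chunk_sampler",
    min_chunk_length=2.0,
    max_chunk_length=4.0,
    min_batch_size=64,
    num_segs_per_class=1,
    num_chunks_per_seg=1,
    shuffle=True,
)
for seg_ids in sampler:
    ...  # each item is a batch of (seg_id, start, duration) chunks
```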
diff --git a/hyperion/torch/data/weighted_embed_sampler.py b/hyperion/torch/data/weighted_embed_sampler.py
index 61e4a0ad..5870512a 100644
--- a/hyperion/torch/data/weighted_embed_sampler.py
+++ b/hyperion/torch/data/weighted_embed_sampler.py
@@ -2,9 +2,9 @@
     Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
     Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
+import logging
 
 # import os
 import math
-import logging
 
 import numpy as np
@@ -36,6 +36,10 @@ def __iter__(self):
         self.batch = 0
         return self
 
+    @property
+    def avg_batch_size(self):
+        return self.batch_size
+
     def _remove_duplicate_idx(self, utt_idx):
         utt_idx_uniq = torch.unique(utt_idx)
         c = 0
diff --git a/hyperion/torch/data/weighted_seq_sampler.py b/hyperion/torch/data/weighted_seq_sampler.py
index 9d128bb8..b6f0b670 100644
--- a/hyperion/torch/data/weighted_seq_sampler.py
+++ b/hyperion/torch/data/weighted_seq_sampler.py
@@ -2,27 +2,49 @@
     Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
     Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
+import logging
 
 # import os
 import math
-from jsonargparse import ArgumentParser, ActionParser
-import logging
 
 import numpy as np
+from jsonargparse import ActionParser, ArgumentParser
 import torch
-from torch.utils.data import Sampler
 import torch.distributed as dist
+from torch.utils.data import Sampler
 
 
 class ClassWeightedSeqSampler(Sampler):
+    """Samples utterances following:
+    1. It samples a class with a given probability.
+    2. It samples a random utterance from the class.
+
+    Attributes:
+      dataset: dataset containing audio or feature sequences.
+      batch_size: batch size per gpu for the largest chunk-size.
+      num_egs_per_utt_epoch: number of samples per utterance per epoch.
+      num_egs_per_class: number of samples per class in each batch.
+      num_egs_per_utt: number of samples per utterance in each batch.
+      var_batch_size: whether to use variable batch size when using
+        variable utterance length.
+      num_hard_prototypes: number of hard prototype classes per random class
+        in a batch.
+      num_egs_per_hard_prototype: number of utterances per each hard
+        prototype in a batch.
+      iters_per_epoch: deprecated, if not None, will overwrite "num_egs_per_utt_epoch".
+    """
+
     def __init__(
         self,
         dataset,
         batch_size=1,
-        iters_per_epoch="auto",
+        num_egs_per_utt_epoch="auto",
         num_egs_per_class=1,
         num_egs_per_utt=1,
         var_batch_size=False,
+        num_hard_prototypes=0,
+        affinity_matrix=None,
+        iters_per_epoch=None,
    ):
         super().__init__(None)
@@ -34,63 +56,100 @@ def __init__(
             rank = 0
             world_size = 1
 
+        if iters_per_epoch is not None:
+            num_egs_per_utt_epoch = iters_per_epoch
+
         self.dataset = dataset
-        self.batch_size = int(math.ceil(batch_size / world_size))
+        self.batch_size = batch_size
         self.num_egs_per_class = num_egs_per_class
         self.num_egs_per_utt = num_egs_per_utt
         self.var_batch_size = var_batch_size
+        self.num_hard_prototypes = num_hard_prototypes
         self.batch = 0
 
         self.rank = rank
         self.world_size = world_size
 
         if rank > 0:
             # this will make sure that each process produces different data
             # when using ddp
             dummy = torch.rand(1000 * rank)
             del dummy
 
-        if iters_per_epoch == "auto":
-            self._compute_iters_auto()
-        else:
-            self.iters_per_epoch = iters_per_epoch
-
-        if var_batch_size:
-            avg_batch_size = self._compute_avg_batch_size()
-        else:
-            avg_batch_size = self.batch_size
-
-        self._len = int(
-            math.ceil(
-                self.iters_per_epoch * dataset.num_seqs / avg_batch_size / world_size
-            )
-        )
-
-        logging.info("num batches per epoch: %d" % self._len)
-
-        self._num_classes_per_batch = int(
-            math.ceil(batch_size / num_egs_per_class / num_egs_per_utt)
+        self.has_short_seqs = self.dataset.short_seq_exist
+        self.set_num_egs_per_utt_epoch(num_egs_per_utt_epoch)
+        self._compute_avg_batch_size()
+        self._compute_len(world_size)
+        self._compute_num_classes_per_batch()
+        self.set_hard_prototypes(affinity_matrix)
+        logging.info(
+            "batches/epoch=%d classes/batch=%d avg-batch-size/gpu=%d samples/(utt*epoch)=%d",
+            self._len,
+            self._num_classes_per_batch,
+            self.avg_batch_size,
+            self.num_egs_per_utt_epoch,
         )
-        logging.info("num classes per batch: %d" % self._num_classes_per_batch)
-
-        # self.weights = torch.as_tensor(dataset.class_weights, dtype=torch.double)
 
     def _compute_avg_batch_size(self):
+        if not self.var_batch_size:
+            self.avg_batch_size = self.batch_size
+            return
+
         dataset = self.dataset
         avg_chunk_length = int(
             (dataset.max_chunk_length + dataset.min_chunk_length) / 2
         )
         batch_mult = dataset.max_chunk_length / avg_chunk_length
-        return int(self.batch_size * batch_mult)
+        self.avg_batch_size = int(self.batch_size * batch_mult)
+
+    def set_num_egs_per_utt_epoch(self, num_egs_per_utt_epoch):
+        if num_egs_per_utt_epoch == "auto":
+            self._compute_num_egs_per_utt_epoch_auto()
+        else:
+            self.num_egs_per_utt_epoch = num_egs_per_utt_epoch
 
-    def _compute_iters_auto(self):
+    def _compute_num_egs_per_utt_epoch_auto(self):
         dataset = self.dataset
         avg_seq_length = np.mean(dataset.seq_lengths)
         avg_chunk_length = int(
             (dataset.max_chunk_length + dataset.min_chunk_length) / 2
         )
-        self.iters_per_epoch = math.ceil(avg_seq_length / avg_chunk_length)
-        logging.debug("num iters per epoch: %d" % self.iters_per_epoch)
+        self.num_egs_per_utt_epoch = math.ceil(avg_seq_length / avg_chunk_length)
+        logging.debug("num iters per epoch: %d", self.num_egs_per_utt_epoch)
+
+    def _compute_len(self, world_size):
+        self._len = int(
+            math.ceil(
+                self.num_egs_per_utt_epoch
+                * self.dataset.num_seqs
+                / self.avg_batch_size
+                / world_size
+            )
+        )
+
+    def _compute_num_classes_per_batch(self):
+        self._num_classes_per_batch = int(
+            math.ceil(
+                self.avg_batch_size / self.num_egs_per_class / self.num_egs_per_utt
+            )
+        )
+
+    def _get_class_weights(self, chunk_length):
+        if not self.has_short_seqs:
+            return self.dataset.class_weights
+
+        # get classes with utts shorter than chunk length and put their weight to 0
+        zero_idx = self.dataset.class2max_length < chunk_length
+        if not np.any(zero_idx):
+            return self.dataset.class_weights
+
+        class_weights = self.dataset.class_weights.clone()
+        class_weights[zero_idx] = 0
+        # renormalize weights
+        class_weights /= class_weights.sum()
+        return class_weights
+
+    def _get_seq_weights(self, chunk_length):
+        pass
 
     def __len__(self):
         return self._len
@@ -99,10 +158,31 @@ def __iter__(self):
         self.batch = 0
         return self
 
+    @property
+    def hard_prototype_mining(self):
+        return self.num_hard_prototypes > 0
+
+    def set_hard_prototypes(self, affinity_matrix):
+        if affinity_matrix is None:
+            self.hard_prototypes = None
+            return
+
+        # hard prototypes for a class are itself and the k-1 classes closest to it.
+        self.hard_prototypes = torch.topk(
+            affinity_matrix, self.num_hard_prototypes, dim=-1
+        ).indices
+
+    def get_hard_prototypes(self, class_idx):
+        return self.hard_prototypes[class_idx].flatten()
+
     def _get_utt_idx_basic(self, batch_mult=1):
         dataset = self.dataset
         num_classes_per_batch = batch_mult * self._num_classes_per_batch
 
+        if self.hard_prototype_mining:
+            num_classes_per_batch = int(
+                math.ceil(num_classes_per_batch / self.num_hard_prototypes)
+            )
+
         if dataset.class_weights is None:
             class_idx = torch.randint(
@@ -115,6 +195,9 @@ def _get_utt_idx_basic(self, batch_mult=1):
                 replacement=True,
             )
 
+        if self.hard_prototype_mining:
+            class_idx = self.get_hard_prototypes(class_idx)
+
         if self.num_egs_per_class > 1:
             class_idx = class_idx.repeat(self.num_egs_per_class)
@@ -133,6 +216,10 @@ def _get_utt_idx_seq_st_max_length(self, chunk_length, batch_mult=1):
         dataset = self.dataset
         num_classes_per_batch = batch_mult * self._num_classes_per_batch
 
+        if self.hard_prototype_mining:
+            num_classes_per_batch = int(
+                math.ceil(num_classes_per_batch / self.num_hard_prototypes)
+            )
+
         # first we sample the batch classes
         class_weights = dataset.class_weights.clone()
@@ -146,6 +233,9 @@
             class_weights, num_samples=num_classes_per_batch, replacement=True
         )
 
+        if self.hard_prototype_mining:
+            class_idx = self.get_hard_prototypes(class_idx)
+
         utt_idx = torch.zeros(
             (len(class_idx) * self.num_egs_per_class,), dtype=torch.long
         )
@@ -203,7 +293,6 @@ def __next__(self):
             logging.info("batch 0 uttidx=%s", str(utt_idx[:10]))
 
         self.batch += 1
-        index = [(i, chunk_length) for i in utt_idx]
 
         return index
@@ -217,8 +306,10 @@ def filter_args(**kwargs):
             "batch_size",
             "var_batch_size",
             "iters_per_epoch",
+            "num_egs_per_utt_epoch",
             "num_egs_per_class",
             "num_egs_per_utt",
+            "num_hard_prototypes",
         )
 
         return dict((k, kwargs[k]) for k in valid_args if k in kwargs)
@@ -228,7 +319,9 @@ def add_class_args(parser, prefix=None):
             outer_parser = parser
             parser = ArgumentParser(prog="")
 
-        parser.add_argument("--batch-size", default=128, type=int, help=("batch size"))
+        parser.add_argument(
+            "--batch-size", default=128, type=int, help=("batch size per gpu")
+        )
 
         parser.add_argument(
             "--var-batch-size",
@@ -244,6 +337,13 @@ def add_class_args(parser, prefix=None):
         parser.add_argument(
             "--iters-per-epoch",
+            default=None,
+            type=lambda x: x if (x == "auto" or x is None) else float(x),
+            help=("deprecated, use --num-egs-per-utt-epoch instead"),
+        )
+
+        parser.add_argument(
+            "--num-egs-per-utt-epoch",
             default="auto",
             type=lambda x: x if x == "auto" else float(x),
             help=("number of times we sample an utterance in each epoch"),
@@ -261,9 +361,14 @@ def add_class_args(parser, prefix=None):
             default=1,
             help=("number of samples per utterance in batch"),
         )
+        parser.add_argument(
+            "--num-hard-prototypes",
+            type=int,
+            default=0,
+            help=("number of hard prototype classes per batch"),
+        )
 
         if prefix is not None:
             outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
-            # help='weighted seq sampler options')
 
     add_argparse_args = add_class_args
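Hard-prototype mining selects, for each sampled class, the `k` classes with the highest affinity to it (normally including the class itself), so batches contain confusable classes. A toy sketch of `set_hard_prototypes`/`get_hard_prototypes` (not hyperion code):

```python
import torch

num_classes, k = 4, 2
torch.manual_seed(0)
affinity = torch.rand(num_classes, num_classes)
affinity = 0.5 * (affinity + affinity.T)  # symmetric toy affinity matrix
affinity.fill_diagonal_(1.0)              # self-affinity is maximal

# for each class, indices of its k most-affine classes -> (num_classes, k)
hard_prototypes = torch.topk(affinity, k, dim=-1).indices

sampled_classes = torch.tensor([1, 3])
print(hard_prototypes[sampled_classes].flatten())  # classes actually batched
```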
TSEBlock2D, + TSEBlock2d, +) +from .seresnet_blocks import SEResNetBasicBlock, SEResNetBNBlock +from .spine_blocks import BlockSpec, SpineConv, SpineEndpoints, SpineResample +from .tdnn_blocks import TDNNBlock +from .transducer_joiner import TransducerJoiner +from .transducer_predictor import TransducerConvPredictor, TransducerRNNPredictor +from .transformer_encoder_v1 import TransformerEncoderBlockV1 +from .transformer_feedforward import Conv1dLinear, Conv1dx2, PositionwiseFeedForward +from .transformer_input import TransformerConv1dSubsampler, TransformerConv2dSubsampler diff --git a/hyperion/torch/layer_blocks/conformer_conv.py b/hyperion/torch/layer_blocks/conformer_conv.py index 7ed9a43a..0c42f34a 100644 --- a/hyperion/torch/layer_blocks/conformer_conv.py +++ b/hyperion/torch/layer_blocks/conformer_conv.py @@ -100,14 +100,16 @@ def __init__( self.context = stride * (kernel_size - 1) // 2 - def forward(self, x): + def forward(self, x, x_mask=None): """Forward function Args: - x: input size = (batch, num_channels, time) + x: input tensor with shape = (batch, num_channels, time) + x_mask: mask indicating the valid frames in the sequence with + shape = (batch, 1, time) or (batch, time) Returns - torch.Tensor size = (batch, num_channels, (time-1)//stride+1) + Tensor with shape = (batch, num_channels, (time-1)//stride+1) @@ -121,7 +123,7 @@ def forward(self, x): # depthwide conv phase x = self.act(self.norm_dw(self.conv_dw(x))) if self.has_se: - x = self.se_layer(x) + x = self.se_layer(x, x_mask=x_mask) # final projection x = self.conv_proj(x) diff --git a/hyperion/torch/layer_blocks/conformer_decoder_v1.py b/hyperion/torch/layer_blocks/conformer_decoder_v1.py new file mode 100644 index 00000000..e3d0893a --- /dev/null +++ b/hyperion/torch/layer_blocks/conformer_decoder_v1.py @@ -0,0 +1,213 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +# + +import torch +import torch.nn as nn + +from ..layers.attention import * +from .conformer_conv import ConformerConvBlock +from .conformer_encoder_v1 import ConformerEncoderBlockV1 +from .transformer_feedforward import * + + +class ConformerDecoderBlockV1(ConformerEncoderBlockV1): + """Building block for conformer decoder based on the conformer encoder introduced in + https://arxiv.org/pdf/2005.08100.pdf + + This includes some optional extra features + not included in the original paper: + - Choose local-attention (attending only to close frames + instead of all the frames in the sequence) + - Choose number of conv blocks + - Squeeze-Excitation after depthwise-conv + - Allows downsampling in time dimension + - Allows choosing activation and layer normalization type + We call this Conformer+ + + Attributes: + num_feats: input/output feat. 
dimension (aka d_model) + self_attn: attention module in ['scaled-dot-prod-att-v1', 'local-scaled-dot-prod-att-v1'] + num_heads: number of heads + conv_repeats: number of conv blocks + conv_kernel_size: kernel size for conv blocks + conv_stride: stride for depth-wise conv in first conv block + feed_forward: position-wise feed-forward string in ['linear', 'conv1dx2', 'conv1d-linear'] + d_ff: dimension of middle layer in feed_forward block + ff_kernel_size: kernel size for convolutional versions of ff block + hid_act: ff and conv block hidden activation + dropout_rate: dropout rate for ff and conv blocks + att_context: maximum context range for local attention + att_dropout_rate: dropout rate for attention block + causal_pos_enc: if True, use causal positional encodings (when rel_pos_enc=True), it assumes + that query q_i only attends to key k_j when j<=i + conv_norm_layer: norm layer constructor for conv block, + if None it uses BatchNorm + se_r: Squeeze-Excitation compression ratio, + if None it doesn't use Squeeze-Excitation + ff_macaron: if True, it uses macaron-net style ff layers, otherwise transformer style. + out_lnorm: if True, use LNorm layer at the output as in the conformer paper, + we think that this layer is redundant so we set it to False by default + concat_after: if True, it concatenates the attention input and output and applies a linear transform, i.e., + y = x + linear(concat(x, att(x))) + if False, y = x + att(x) + + """ + + def __init__( + self, + num_feats, + self_attn, + cross_attn, + num_heads, + conv_repeats=0, + conv_kernel_size=31, + conv_stride=1, + feed_forward="linear", + d_ff=2048, + ff_kernel_size=3, + hid_act="swish", + dropout_rate=0, + att_context=25, + att_dropout_rate=0, + pos_enc_type="rel", + causal_pos_enc=False, + conv_norm_layer=None, + se_r=None, + ff_macaron=True, + src_lnorm=False, + out_lnorm=False, + concat_after=False, + ): + super().__init__( + num_feats, + self_attn, + num_heads, + conv_repeats=conv_repeats, + conv_kernel_size=conv_kernel_size, + conv_stride=conv_stride, + feed_forward=feed_forward, + d_ff=d_ff, + ff_kernel_size=ff_kernel_size, + hid_act=hid_act, + dropout_rate=dropout_rate, + att_context=att_context, + att_dropout_rate=att_dropout_rate, + pos_enc_type=pos_enc_type, + causal_pos_enc=causal_pos_enc, + conv_norm_layer=conv_norm_layer, + se_r=se_r, + ff_macaron=ff_macaron, + out_lnorm=out_lnorm, + concat_after=concat_after, + ) + + self.cross_attn = self._make_att( + cross_attn, + num_feats, + num_heads, + 0, + att_dropout_rate, + "no", + False, + ) + + self.norm_cross_att = nn.LayerNorm(num_feats) + self.src_lnorm = src_lnorm + if src_lnorm: + self.norm_src = nn.LayerNorm(num_feats) + + if self.concat_after: + self.cross_concat_linear = nn.Linear(num_feats + num_feats, num_feats) + + def _forward_self_attn(self, x, pos_emb=None, mask=None, cache=None): + residual = x + x = self.norm_att(x) + + if cache is None: + x_q = x + mask_q = mask + else: + # compute only the last frame query keeping dim: max_time_out -> 1 + assert_cache_shape = (x.size(0), x.size(1) - 1, x.size(2)) + assert ( + cache.shape == assert_cache_shape + ), f"{cache.shape} != {assert_cache_shape}" + x_q = x[:, -1:, :] + residual = residual[:, -1:, :] + mask_q = None if mask is None else mask[:, -1:, :] + + if pos_emb is None: + x_att = self.self_attn(x_q, x, x, mask=mask_q) + else: + x_att = self.self_attn(x_q, x, x, pos_emb=pos_emb, mask=mask_q) + + if self.concat_after: + x = torch.cat((x_q, x_att), dim=-1) + x = self.concat_linear(x) + else: + x = x_att + + if 
self.dropout_rate > 0: + x = self.dropout(x) + + x = residual + x + return x + + def _forward_cross_attn(self, x, x_src, pos_emb=None, mask=None): + residual = x + x = self.norm_cross_att(x) + if self.src_lnorm: + x_src = self.norm_src(x_src) + + if pos_emb is None: + x_att = self.cross_attn(x, x_src, x_src, mask=mask) + else: + x_att = self.cross_attn(x, x_src, x_src, pos_emb=pos_emb, mask=mask) + + if self.concat_after: + x = torch.cat((x, x_att), dim=-1) + x = self.cross_concat_linear(x) + else: + x = x_att + + if self.dropout_rate > 0: + x = self.dropout(x) + + x = residual + x + return x + + def forward(self, x, x_src, pos_emb=None, mask=None, mask_src=None, cache=None): + """Forward pass function + + Args: + x: input tensor with size=(batch, time, num_feats) + x_src: source tensor for the cross-attention, e.g., encoder output, + with size=(batch, time_src, num_feats) + pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0, + when using relative positional encoder, otherwise None + mask: mask to indicate valid time steps for x (batch, time) + mask_src: mask to indicate valid time steps for x_src (batch, time_src) + cache: cached decoder outputs for the previous frames, so only the + query for the last frame is computed + + Returns: + Tensor with output features + Tensor with mask + """ + # macaron feed forward + if self.ff_macaron: + x = self._forward_ff_macaron(x) + + # multihead attention + x = self._forward_self_attn(x, pos_emb, mask, cache=cache) + x = self._forward_cross_attn(x, x_src, mask=mask_src) + + # convolutional blocks + x = self._forward_convs(x) + + # feed-forward block + x = self._forward_ff(x) + + # output norm + if self.out_lnorm: + x = self.norm_out(x) + + return x, mask diff --git a/hyperion/torch/layer_blocks/conformer_encoder_v1.py b/hyperion/torch/layer_blocks/conformer_encoder_v1.py index a54e3b99..349bef4b 100644 --- a/hyperion/torch/layer_blocks/conformer_encoder_v1.py +++ b/hyperion/torch/layer_blocks/conformer_encoder_v1.py @@ -8,8 +8,8 @@ import torch.nn as nn from ..layers.attention import * -from .transformer_feedforward import * from .conformer_conv import ConformerConvBlock +from .transformer_feedforward import * class ConformerEncoderBlockV1(nn.Module): @@ -78,7 +78,6 @@ def __init__( out_lnorm=False, concat_after=False, ): - super().__init__() self.self_attn = self._make_att( self_attn, @@ -132,6 +131,12 @@ def __init__( if self.concat_after: self.concat_linear = nn.Linear(num_feats + num_feats, num_feats) + def change_attn_dropout(self, att_dropout_rate): + attn = self.self_attn + if hasattr(attn, "dropout_rate"): + attn.dropout_rate = att_dropout_rate + attn.dropout.p = att_dropout_rate + @staticmethod def _make_att( att_type, @@ -145,9 +150,10 @@ def _make_att( """Creates multihead attention block from att_type string Args: - att_type: string in ['scaled-dot-prod-att-v1', 'local-scaled-dot-prod-att-v1'] + att_type: string in ['scaled-dot-prod-att-v1', 'local-scaled-dot-prod-att-v1', 'block-scaled-dot-prod-att-v1'] num_feats: input/output feat. 
dimension (aka d_model) num_heads: number of heads + context: block attention receptive field dropout_rate: dropout rate for attention block pos_enc_type: type of positional encoder causal_pos_enc: if True, use causal positional encodings (when rel_pos_enc=True), it assumes @@ -170,11 +176,15 @@ def _make_att( d_k, causal_pos_enc, dropout_rate, - time_dim=1, ) return ScaledDotProdAttV1( - num_feats, num_feats, num_heads, d_k, d_k, dropout_rate, time_dim=1 + num_feats, + num_feats, + num_heads, + d_k, + d_k, + dropout_rate, ) if att_type == "local-scaled-dot-prod-v1": @@ -188,7 +198,6 @@ def _make_att( context, causal_pos_enc, dropout_rate, - time_dim=1, ) return LocalScaledDotProdAttV1( @@ -199,7 +208,29 @@ def _make_att( d_k, context, dropout_rate, - time_dim=1, + ) + + if att_type == "block-scaled-dot-prod-v1": + if pos_enc_type == "rel": + return BlockScaledDotProdAttRelPosEncV1( + num_feats, + num_feats, + num_heads, + d_k, + d_k, + context, + causal_pos_enc, + dropout_rate, + ) + + return BlockScaledDotProdAttV1( + num_feats, + num_feats, + num_heads, + d_k, + d_k, + context, + dropout_rate, ) @staticmethod @@ -233,36 +264,24 @@ def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, dropout_rat num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1 ) - def forward(self, x, pos_emb=None, mask=None): - """Forward pass function - - Args: - x: input tensor with size=(batch, time, num_feats) - pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0, - when using relative postional encoder, otherwise None - mask: mask to indicate valid time steps for x (batch, time) - - Returns: - Tensor with output features - Tensor with mask - """ + def _forward_ff_macaron(self, x): + residual = x + x = self.norm_ff_macaron(x) + x = self.feed_forward_macaron(x) + if self.dropout_rate > 0: + x = self.dropout(x) - # macaron feed forward - if self.ff_macaron: - residual = x - x = self.norm_ff_macaron(x) - x = self.feed_forward_macaron(x) - if self.dropout_rate > 0: - x = self.dropout(x) - x = residual + self.ff_scale * x + x = residual + self.ff_scale * x + return x - # multihead attention + def _forward_self_attn(self, x, pos_emb=None, mask=None): residual = x x = self.norm_att(x) if pos_emb is None: x_att = self.self_attn(x, x, x, mask=mask) else: x_att = self.self_attn(x, x, x, pos_emb=pos_emb, mask=mask) + if self.concat_after: x = torch.cat((x, x_att), dim=-1) x = self.concat_linear(x) @@ -273,15 +292,17 @@ def forward(self, x, pos_emb=None, mask=None): x = self.dropout(x) x = residual + x + return x - # convolutional blocks + def _forward_convs(self, x): x = x.transpose(1, 2) for block in range(len(self.conv_blocks)): x = self.conv_blocks[block](x) x = x.transpose(1, 2) + return x - # feed-forward block + def _forward_ff(self, x): residual = x x = self.norm_ff(x) x = self.feed_forward(x) @@ -289,6 +310,33 @@ def forward(self, x, pos_emb=None, mask=None): x = self.dropout(x) x = residual + self.ff_scale * x + return x + + def forward(self, x, pos_emb=None, mask=None): + """Forward pass function + + Args: + x: input tensor with size=(batch, time, num_feats) + pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0, + when using relative postional encoder, otherwise None + mask: mask to indicate valid time steps for x (batch, time) + + Returns: + Tensor with output features + Tensor with mask + """ + # macaron feed forward + if self.ff_macaron: + x = self._forward_ff_macaron(x) + + # multihead attention + x = 
self._forward_self_attn(x, pos_emb, mask) + + # convolutional blocks + x = self._forward_convs(x) + + # feed-forward block + x = self._forward_ff(x) # output norm if self.out_lnorm: diff --git a/hyperion/torch/layer_blocks/dc1d_blocks.py b/hyperion/torch/layer_blocks/dc1d_blocks.py index f5b794ef..780af960 100644 --- a/hyperion/torch/layer_blocks/dc1d_blocks.py +++ b/hyperion/torch/layer_blocks/dc1d_blocks.py @@ -4,7 +4,7 @@ """ import torch.nn as nn -from torch.nn import Conv1d, Linear, BatchNorm1d +from torch.nn import BatchNorm1d, Conv1d, Linear from ..layers import ActivationFactory as AF from ..layers import Dropout1d @@ -12,6 +12,22 @@ class DC1dEncBlock(nn.Module): + """Building block for deep convolutional encoder 1d. + + Args: + in_channels: input channels. + out_channels: output channels. + kernel_size: kernel size for the convolution. + stride: downsampling stride. + dilation: kernel dilation. + activation: non-linear activation function object, string or config dict. + dropout_rate: dropout rate. + use_norm: if True, it uses layer normalization. + norm_layer: Normalization Layer constructor, if None it uses BatchNorm1d. + norm_before: if True, layer normalization is before the non-linearity, else + after the non-linearity. + """ + def __init__( self, in_channels, @@ -62,14 +78,25 @@ def __init__( self.context = dilation * (kernel_size - 1) // 2 def freeze(self): + """Freezes trainable parameters.""" for param in self.parameters(): param.requires_grad = False def unfreeze(self): + """Unfreezes trainable parameters.""" for param in self.parameters(): param.requires_grad = True - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_time). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_time). + """ x = self.conv1(x) if self.norm_before: @@ -88,6 +115,22 @@ def forward(self, x): class DC1dDecBlock(nn.Module): + """Building block for deep convolutional decoder 1d. + + Args: + in_channels: input channels. + out_channels: output channels. + kernel_size: kernel size for the convolution. + stride: upsampling stride. + dilation: kernel dilation. + activation: non-linear activation function object, string or config dict. + dropout_rate: dropout rate. + use_norm: if True, it uses layer normalization. + norm_layer: Normalization Layer constructor, if None it uses BatchNorm1d. + norm_before: if True, layer normalization is before the non-linearity, else + after the non-linearity. + """ + def __init__( self, in_channels, @@ -149,15 +192,25 @@ def __init__( self.context = dilation * (kernel_size - 1) // 2 def freeze(self): + """Freezes trainable parameters.""" for param in self.parameters(): param.requires_grad = False def unfreeze(self): + """Unfreezes trainable parameters.""" for param in self.parameters(): param.requires_grad = True - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_time). + x_mask: unused. + Returns: + Tensor with shape = (batch, out_channels, out_time). + """ x = self.conv1(x) if self.norm_before: x = self.bn1(x)
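The conformer refactor above splits the encoder forward pass into reusable `_forward_ff_macaron`, `_forward_self_attn`, `_forward_convs`, and `_forward_ff` steps, which the new decoder block interleaves with cross-attention. A minimal sketch of the resulting call order, with hypothetical toy sub-modules standing in for hyperion's blocks:
```python
import torch
import torch.nn as nn

d = 16  # feature dimension (aka d_model)
ff1 = nn.Sequential(nn.LayerNorm(d), nn.Linear(d, 4 * d), nn.SiLU(), nn.Linear(4 * d, d))
norm_att = nn.LayerNorm(d)
attn = nn.MultiheadAttention(d, num_heads=2, batch_first=True)
conv = nn.Conv1d(d, d, kernel_size=3, padding=1)  # stand-in for the conformer conv block
ff2 = nn.Sequential(nn.LayerNorm(d), nn.Linear(d, 4 * d), nn.SiLU(), nn.Linear(4 * d, d))

def conformer_encoder_block(x):
    x = x + 0.5 * ff1(x)                              # macaron feed-forward, half-step residual
    h = norm_att(x)
    x = x + attn(h, h, h)[0]                          # multi-head self-attention
    x = x + conv(x.transpose(1, 2)).transpose(1, 2)   # convolution module
    x = x + 0.5 * ff2(x)                              # final feed-forward
    return x

x = torch.randn(2, 50, d)  # (batch, time, num_feats)
print(conformer_encoder_block(x).shape)  # torch.Size([2, 50, 16])
```
In the decoder variant above, a cross-attention step over the source features `x_src` is inserted between the self-attention and convolution stages.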
+ """ x = self.conv1(x) if self.norm_before: x = self.bn1(x) diff --git a/hyperion/torch/layer_blocks/dc2d_blocks.py b/hyperion/torch/layer_blocks/dc2d_blocks.py index 0d251528..a99f9211 100644 --- a/hyperion/torch/layer_blocks/dc2d_blocks.py +++ b/hyperion/torch/layer_blocks/dc2d_blocks.py @@ -4,13 +4,29 @@ """ import torch.nn as nn -from torch.nn import Conv2d, BatchNorm2d, Dropout2d +from torch.nn import BatchNorm2d, Conv2d, Dropout2d from ..layers import ActivationFactory as AF from ..layers.subpixel_convs import SubPixelConv2d class DC2dEncBlock(nn.Module): + """Build block for deep convolutional encoder 2d. + + Args: + in_channels: input channels. + out_channels: output channels. + kernel_size: kernels size for the convolution. + stride: downsampling stride. + dilation: kernel dilation. + activation: non-linear activation function object, string or config dict. + dropout_rate: dropout rate. + use_norm: if True, if uses layer normalization. + norm_layer: Normalization Layer constructor, if None it used BatchNorm2d. + norm_before: if True, layer normalization is before the non-linearity, else + after the non-linearity. + """ + def __init__( self, in_channels, @@ -61,15 +77,25 @@ def __init__( self.context = dilation * (kernel_size - 1) // 2 def freeze(self): + """Freezes trainable parameters.""" for param in self.parameters(): param.requires_grad = False def unfreeze(self): + """Unfreezes trainable parameters.""" for param in self.parameters(): param.requires_grad = True def forward(self, x): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ x = self.conv1(x) if self.norm_before: x = self.bn1(x) @@ -87,6 +113,22 @@ def forward(self, x): class DC2dDecBlock(nn.Module): + """Build block for deep convolutional decoder 2d. + + Args: + in_channels: input channels. + out_channels: output channels. + kernel_size: kernels size for the convolution. + stride: upsampling stride. + dilation: kernel dilation. + activation: non-linear activation function object, string or config dict. + dropout_rate: dropout rate. + use_norm: if True, if uses layer normalization. + norm_layer: Normalization Layer constructor, if None it used BatchNorm2d. + norm_before: if True, layer normalization is before the non-linearity, else + after the non-linearity. + """ + def __init__( self, in_channels, @@ -148,15 +190,25 @@ def __init__( self.context = dilation * (kernel_size - 1) // 2 def freeze(self): + """Freezes trainable parameters.""" for param in self.parameters(): param.requires_grad = False def unfreeze(self): + """Unfreezes trainable parameters.""" for param in self.parameters(): param.requires_grad = True def forward(self, x): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). 
+ """ x = self.conv1(x) if self.norm_before: x = self.bn1(x) diff --git a/hyperion/torch/layer_blocks/etdnn_blocks.py b/hyperion/torch/layer_blocks/etdnn_blocks.py index 958c31ba..b6afdd29 100644 --- a/hyperion/torch/layer_blocks/etdnn_blocks.py +++ b/hyperion/torch/layer_blocks/etdnn_blocks.py @@ -6,13 +6,28 @@ import numpy as np import torch.nn as nn -from torch.nn import Conv1d, Linear, BatchNorm1d +from torch.nn import BatchNorm1d, Conv1d, Linear from ..layers import ActivationFactory as AF from ..layers import Dropout1d class ETDNNBlock(nn.Module): + """Building block for Extended-TDNN. + + Args: + in_channels: input channels. + out_channels: output channels. + kernel_size: kernels size for the convolution. + dilation: kernel dilation. + activation: non-linear activation function object, string or config dict. + dropout_rate: dropout rate. + use_norm: if True, if uses layer normalization. + norm_layer: Normalization Layer constructor, if None it used BatchNorm1d. + norm_before: if True, layer normalization is before the non-linearity, else + after the non-linearity. + """ + def __init__( self, in_channels, @@ -62,15 +77,21 @@ def __init__( ) self.conv2 = Conv1d(out_channels, out_channels, bias=bias, kernel_size=1) - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. - x = self.conv1(x) + Args: + x: input tensor with shape = (batch, in_channels, in_time). + x_mask: unused. + Returns: + Tensor with shape = (batch, out_channels, out_time). + """ + x = self.conv1(x) if self.norm_before: x = self.bn1(x) x = self.activation1(x) - if self.norm_after: x = self.bn1(x) @@ -78,12 +99,10 @@ def forward(self, x): x = self.dropout1(x) x = self.conv2(x) - if self.norm_before: x = self.bn2(x) x = self.activation2(x) - if self.norm_after: x = self.bn2(x) diff --git a/hyperion/torch/layer_blocks/fc_blocks.py b/hyperion/torch/layer_blocks/fc_blocks.py index 567474bf..49bf12db 100644 --- a/hyperion/torch/layer_blocks/fc_blocks.py +++ b/hyperion/torch/layer_blocks/fc_blocks.py @@ -7,7 +7,7 @@ # import numpy as np import torch.nn as nn -from torch.nn import Linear, BatchNorm1d, Dropout +from torch.nn import BatchNorm1d, Dropout, Linear from ..layers import ActivationFactory as AF @@ -84,4 +84,7 @@ def forward_linear(self, x): if self.norm_before: x = self.bn1(x) + if self.activation is None and self.norm_after: + x = self.bn1(x) + return x diff --git a/hyperion/torch/layer_blocks/mbconv_blocks.py b/hyperion/torch/layer_blocks/mbconv_blocks.py index 6d9a3141..8a956b21 100644 --- a/hyperion/torch/layer_blocks/mbconv_blocks.py +++ b/hyperion/torch/layer_blocks/mbconv_blocks.py @@ -7,12 +7,13 @@ import torch import torch.nn as nn -# from torch.nn import Conv2d, BatchNorm2d - from ..layers import ActivationFactory as AF from ..layers import DropConnect2d from .se_blocks import SEBlock2D, TSEBlock2D +# from torch.nn import Conv2d, BatchNorm2d + + def _conv1x1(in_channels, out_channels, stride=1, bias=False): """1x1 convolution""" @@ -42,6 +43,22 @@ def _make_downsample(in_channels, out_channels, stride, norm_layer): class MBConvBlock(nn.Module): + """MobileNet/EfficentNet Inverted bottleneck Block + + Attributes: + in_channels: input channels. + out_channels: output channels + expansion: expansion of channels for the inverted bottleneck. + kernel_size: kernel size of the convs. + stride: downsampling stride of the convs. + activation: Non-linear activation object, string of configuration dictionary. + drop_connect_rate: Drop-connect rate for stochastic number of layers. 
diff --git a/hyperion/torch/layer_blocks/mbconv_blocks.py b/hyperion/torch/layer_blocks/mbconv_blocks.py index 6d9a3141..8a956b21 100644 --- a/hyperion/torch/layer_blocks/mbconv_blocks.py +++ b/hyperion/torch/layer_blocks/mbconv_blocks.py @@ -7,12 +7,13 @@ import torch import torch.nn as nn -# from torch.nn import Conv2d, BatchNorm2d - from ..layers import ActivationFactory as AF from ..layers import DropConnect2d from .se_blocks import SEBlock2D, TSEBlock2D +# from torch.nn import Conv2d, BatchNorm2d + + def _conv1x1(in_channels, out_channels, stride=1, bias=False): """1x1 convolution""" @@ -42,6 +43,22 @@ def _make_downsample(in_channels, out_channels, stride, norm_layer): class MBConvBlock(nn.Module): + """MobileNet/EfficientNet inverted bottleneck block. + + Attributes: + in_channels: input channels. + out_channels: output channels. + expansion: expansion of channels for the inverted bottleneck. + kernel_size: kernel size of the convs. + stride: downsampling stride of the convs. + activation: Non-linear activation object, string, or configuration dictionary. + drop_connect_rate: Drop-connect rate for stochastic depth. + norm_layer: Normalization layer constructor, if None BatchNorm2d is used. + se_r: Squeeze-excitation compression ratio, if None squeeze-excitation is not used. + time_se: If True, squeeze is done only in time dimension. + num_feats: Number of features in dimension 2, needed if time_se=True. + """ + def __init__( self, in_channels, @@ -113,8 +130,17 @@ def __init__( self.context = stride * (kernel_size - 1) // 2 self.downsample_factor = stride - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + Args: + x: input tensor with shape = (batch, in_channels, in_height, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_height, out_width). + """ residual = x if self.expansion > 1: x = self.act(self.bn_exp(self.conv_exp(x))) @@ -137,6 +163,19 @@ def forward(self, x): class MBConvInOutBlock(nn.Module): + """Convolutional block used as input/output + in MobileNet/EfficientNet + + Attributes: + in_channels: input channels. + out_channels: output channels. + kernel_size: kernel size of the convs. + stride: downsampling stride of the convs. + activation: Non-linear activation object, string, or configuration dictionary. + norm_layer: Normalization layer constructor, if None BatchNorm2d is used. + + """ + def __init__( self, in_channels, @@ -169,4 +208,13 @@ def __init__( self.downsample_factor = stride def forward(self, x): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_height, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_height, out_width). + """ return self.act(self.bn(self.conv(x))) diff --git a/hyperion/torch/layer_blocks/res2net1d_blocks.py b/hyperion/torch/layer_blocks/res2net1d_blocks.py index 6f66557b..0fbdc301 100644 --- a/hyperion/torch/layer_blocks/res2net1d_blocks.py +++ b/hyperion/torch/layer_blocks/res2net1d_blocks.py @@ -3,12 +3,13 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import math + import torch import torch.nn as nn -from torch.nn import Conv1d, BatchNorm1d +from torch.nn import BatchNorm1d, Conv1d from ..layers import ActivationFactory as AF -from ..layers import Dropout1d, DropConnect1d +from ..layers import DropConnect1d, Dropout1d from .se_blocks import SEBlock1d @@ -46,6 +47,28 @@ def _make_downsample(in_channels, out_channels, stride, norm_layer, norm_before) class Res2Net1dBasicBlock(nn.Module): + """Res2Net basic Block. This is a modified Res2Net block with + two 3x3 convolutions, instead of the standard bottleneck block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string, or configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + drop_connect_rate: drop-connect rate for stochastic depth. + width_factor: multiplication factor for the number of channels in the first layer + of the block. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + se_r: squeeze-excitation compression ratio. 
+ """ + expansion = 1 def __init__( @@ -53,7 +76,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation={"name": "relu6", "inplace": True}, + activation={"name": "relu", "inplace": True}, stride=1, dropout_rate=0, drop_connect_rate=0, @@ -160,8 +183,21 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time). + + Returns: + Tensor with shape = (batch, out_channels, time). + """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) + split_size = [self.width_in for i in range(self.scale - 1)] split_size.append(self.in_channels % self.width_in + self.width_in) split_x = torch.split(x, split_size, 1) @@ -180,8 +216,8 @@ def forward(self, x): if self.norm_before: x_i = self.bn1s[i](x_i) x_i = self.act1(x_i) - if not self.norm_before: - x_i = self.bn1(x_i) + if self.norm_after: + x_i = self.bn1s[i](x_i) x.append(x_i) if self.scale > 1: @@ -190,23 +226,28 @@ def forward(self, x): x = torch.cat(x, dim=1) x = self.conv2(x) - if self.norm_before: + if self.norm_after: + x = self.act2(x) x = self.bn2(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - if self.se_layer: - x = self.se_layer(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) + x += residual + else: + if self.norm_before: + x = self.bn2(x) - if self.downsample is not None: - residual = self.downsample(residual) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - x += residual - x = self.act2(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if not self.norm_before: - x = self.bn2(x) + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -215,6 +256,26 @@ def forward(self, x): class Res2Net1dBNBlock(nn.Module): + """Res2Net bottleneck Block. + + Attributes: + in_channels: input channels. + channels: channels in bottleneck layer when width_factor=1. + kernel_size: kernel size in bottleneck layers. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + drop_connect_rate: drop-connect rate for stochastic number of layers. + width_factor: multiplication factor for the number of channels in the bottleneck. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + se_r: squeeze-excitation compression ratio. + """ + def __init__( self, in_channels, @@ -232,7 +293,6 @@ def __init__( norm_layer=None, norm_before=True, se_r=None, - num_feats=None, ): super().__init__() @@ -322,7 +382,90 @@ def out_channels(self): def expansion(self): return self.channels / self.width / self.scale - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time). 
+ + Returns: + Tensor with shape = (batch, out_channels, time). + """ + residual = x + if self.downsample is not None: + residual = self.downsample(residual) + + x = self.conv1(x) + if self.norm_before: + x = self.bn1(x) + x = self.act1(x) + if self.norm_after: + x = self.bn1(x) + + split_x = torch.split(x, self.width, 1) + x = [] + for i in range(self.num_k): + if i == 0 or self.stride > 1: + x_i = split_x[i] + else: + x_i = x_i + split_x[i] + x_i = self.conv2s[i](x_i) + if self.norm_before: + x_i = self.bn2s[i](x_i) + x_i = self.act2(x_i) + if self.norm_after: + x_i = self.bn2s[i](x_i) + x.append(x_i) + + if self.scale > 1: + if self.stride == 1: + x.append(split_x[-1]) + else: + x.append(self.pool(split_x[-1])) + + x = torch.cat(x, dim=1) + + x = self.conv3(x) + if self.norm_after: + x = self.act3(x) + x = self.bn3(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) + + if self.drop_connect_rate > 0: + x = self.drop_connect(x) + + x += residual + else: + if self.norm_before: + x = self.bn3(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) + + if self.drop_connect_rate > 0: + x = self.drop_connect(x) + + x += residual + x = self.act3(x) + + if self.dropout_rate > 0: + x = self.dropout(x) + + return x + + def forward0(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time). + + Returns: + Tensor with shape = (batch, out_channels, time). + """ residual = x x = self.conv1(x) @@ -344,7 +487,7 @@ def forward(self, x): x_i = self.bn2s[i](x_i) x_i = self.act2(x_i) if not self.norm_before: - x_i = self.bn2(x_i) + x_i = self.bn2s[i](x_i) x.append(x_i) if self.scale > 1: @@ -360,7 +503,7 @@ def forward(self, x): x = self.bn3(x) if self.se_layer: - x = self.se_layer(x) + x = self.se_layer(x, x_mask=x_mask) if self.drop_connect_rate > 0: x = self.drop_connect(x) diff --git a/hyperion/torch/layer_blocks/res2net2d_blocks.py b/hyperion/torch/layer_blocks/res2net2d_blocks.py index 37bbd966..4050f936 100644 --- a/hyperion/torch/layer_blocks/res2net2d_blocks.py +++ b/hyperion/torch/layer_blocks/res2net2d_blocks.py @@ -3,9 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import math + import torch import torch.nn as nn -from torch.nn import Conv2d, BatchNorm2d, Dropout2d +from torch.nn import BatchNorm2d, Conv2d, Dropout2d from ..layers import ActivationFactory as AF from .se_blocks import SEBlock2d, TSEBlock2d @@ -45,6 +46,29 @@ def _make_downsample(in_channels, out_channels, stride, norm_layer, norm_before) class Res2Net2dBasicBlock(nn.Module): + """Res2Net basic Block. This is a modified Res2Net block with + two 3x3 convolutions, instead of the standard bottleneck block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + width_factor: multiplication factor for the number of channels in the first layer + or the block. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. 
+ norm_before: if True, normalization layer is before the activation, after otherwise. + se_r=None: squeeze-excitation compression ratio. + time_se: If true, squeeze is done only in time dimension. + num_feats: Number of features in dimension 2, needed if time_se=True. + """ + expansion = 1 def __init__( @@ -159,8 +183,21 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) + split_size = [self.width_in for i in range(self.scale - 1)] split_size.append(self.in_channels % self.width_in + self.width_in) split_x = torch.split(x, split_size, 1) @@ -179,8 +216,8 @@ def forward(self, x): if self.norm_before: x_i = self.bn1s[i](x_i) x_i = self.act1(x_i) - if not self.norm_before: - x_i = self.bn1(x_i) + if self.norm_after: + x_i = self.bn1s[i](x_i) x.append(x_i) if self.scale > 1: @@ -189,20 +226,22 @@ def forward(self, x): x = torch.cat(x, dim=1) x = self.conv2(x) - if self.norm_before: + if self.norm_after: + x = self.act2(x) x = self.bn2(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - if self.downsample is not None: - residual = self.downsample(residual) - - if self.se_layer: - x = self.se_layer(x) + x += residual + else: + if self.norm_before: + x = self.bn2(x) - x += residual - x = self.act2(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - if not self.norm_before: - x = self.bn2(x) + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -211,6 +250,27 @@ def forward(self, x): class Res2Net2dBNBlock(nn.Module): + """Res2Net bottleneck Block. + + Attributes: + in_channels: input channels. + channels: channels in bottleneck layer when width_factor=1. + kernel_size: kernel size in bottleneck layers. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + width_factor: multiplication factor for the number of channels in the bottleneck. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + se_r=None: squeeze-excitation compression ratio. + time_se: If true, squeeze is done only in time dimension. + num_feats: Number of features in dimension 2, needed if time_se=True. + """ + def __init__( self, in_channels, @@ -316,14 +376,26 @@ def out_channels(self): def expansion(self): return self.channels / self.width / self.scale - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). 
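All of these Res2Net forwards share the same multi-scale core: the channels are split into `scale` groups, each group goes through its own 3x3 conv, and each group's output is added to the next group's input before its conv. A stripped-down sketch of that hierarchy (hypothetical sizes, no norm, activation, or stride handling):
```python
import torch
import torch.nn as nn

scale, width = 4, 8
convs = nn.ModuleList(nn.Conv1d(width, width, 3, padding=1) for _ in range(scale - 1))

x = torch.randn(2, scale * width, 100)  # (batch, channels, time)
splits = torch.split(x, width, dim=1)   # `scale` groups of `width` channels
outs = []
for i in range(scale - 1):
    # each group sees its own split plus the previous group's output
    x_i = splits[i] if i == 0 else x_i + splits[i]
    x_i = convs[i](x_i)
    outs.append(x_i)
outs.append(splits[-1])                 # last split is passed through unchanged
y = torch.cat(outs, dim=1)
print(y.shape)  # torch.Size([2, 32, 100])
```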
+ """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: x = self.bn1(x) x = self.act1(x) - if not self.norm_before: + if self.norm_after: x = self.bn1(x) split_x = torch.split(x, self.width, 1) @@ -337,8 +409,8 @@ def forward(self, x): if self.norm_before: x_i = self.bn2s[i](x_i) x_i = self.act2(x_i) - if not self.norm_before: - x_i = self.bn2(x_i) + if self.norm_after: + x_i = self.bn2s[i](x_i) x.append(x_i) if self.scale > 1: @@ -350,20 +422,22 @@ def forward(self, x): x = torch.cat(x, dim=1) x = self.conv3(x) - if self.norm_before: + if self.norm_after: + x = self.act3(x) x = self.bn3(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - if self.downsample is not None: - residual = self.downsample(residual) - - if self.se_layer: - x = self.se_layer(x) + x += residual + else: + if self.norm_before: + x = self.bn3(x) - x += residual - x = self.act3(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - if not self.norm_before: - x = self.bn3(x) + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) diff --git a/hyperion/torch/layer_blocks/res2net_blocks.py b/hyperion/torch/layer_blocks/res2net_blocks.py index 56804307..55e35e5f 100644 --- a/hyperion/torch/layer_blocks/res2net_blocks.py +++ b/hyperion/torch/layer_blocks/res2net_blocks.py @@ -3,12 +3,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import math + import torch import torch.nn as nn -from torch.nn import Conv2d, BatchNorm2d, Dropout2d +from torch.nn import BatchNorm2d, Conv2d, Dropout2d from ..layers import ActivationFactory as AF -from .se_blocks import SEBlock2D, TSEBlock2D +from .resnet_blocks import FreqPosEnc +from .se_blocks import CFwSEBlock2d, FwSEBlock2d, SEBlock2d, TSEBlock2d def _conv3x3(in_channels, out_channels, stride=1, groups=1, dilation=1, bias=False): @@ -31,7 +33,6 @@ def _conv1x1(in_channels, out_channels, stride=1, bias=False): def _make_downsample(in_channels, out_channels, stride, norm_layer, norm_before): - if norm_before: return nn.Sequential( _conv1x1(in_channels, out_channels, stride, bias=False), @@ -42,6 +43,30 @@ def _make_downsample(in_channels, out_channels, stride, norm_layer, norm_before) class Res2NetBasicBlock(nn.Module): + """Res2Net basic Block. This is a modified Res2Net block with + two 3x3 convolutions, instead of the standard bottleneck block. + + Attributes: + in_channels: input channels. + channels: output channels. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + + dropout_rate: dropout rate. + width_factor: multiplication factor for the number of channels in the first layer + or the block. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + se_r: squeeze-excitation compression ratio. + se_type: type of squeeze excitation in [t-se, cw-se, fw-se, cfw-se] + freq_pos_enc: use frequency wise positional encoder + num_feats: Number of features in dimension 2, needed if se_type!=cw-se or freq_pos_enc=True. 
+ time_se: (legacy deprecated) If true, use t-se + """ + expansion = 1 def __init__( @@ -58,10 +83,11 @@ def __init__( norm_layer=None, norm_before=True, se_r=None, - time_se=False, + se_type="cw-se", + freq_pos_enc=False, num_feats=None, + time_se=False, ): - super().__init__() self.in_channels = in_channels @@ -74,7 +100,7 @@ def __init__( width_in = in_channels // scale width_mid = int(width_factor * channels) // scale self.width_in = width_in - self.has_proj1 = width_in != width_mid + self.has_proj1 = width_in != width_mid and stride == 1 self.scale = scale channels_mid = width_mid * scale if scale == 1: @@ -124,11 +150,22 @@ def __init__( self.context = dilation self.downsample_factor = stride + self.pos_enc = None + if freq_pos_enc: + self.pos_enc = FreqPosEnc(num_feats) + if se_r is not None: if time_se: - self.se_layer = TSEBlock2D(channels, num_feats, se_r, activation) - else: - self.se_layer = SEBlock2D(channels, se_r, activation) + se_type = "t-se" + + if se_type == "t-se": + self.se_layer = TSEBlock2d(channels, num_feats, se_r, activation) + elif se_type == "cw-se": + self.se_layer = SEBlock2d(channels, se_r, activation) + elif se_type == "fw-se": + self.se_layer = FwSEBlock2d(num_feats, se_r, activation) + elif se_type == "cfw-se": + self.se_layer = CFwSEBlock2d(channels, num_feats, se_r, activation) else: self.se_layer = None @@ -136,12 +173,28 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) + + if self.pos_enc is not None: + x = self.pos_enc(x) + split_size = [self.width_in for i in range(self.scale - 1)] split_size.append(self.in_channels % self.width_in + self.width_in) split_x = torch.split(x, split_size, 1) - # split_x = torch.split(x, self.width_in, 1) + x = [] for i in range(self.num_3x3): if i == 0 or self.stride > 1: @@ -157,7 +210,7 @@ def forward(self, x): x_i = self.bn1s[i](x_i) x_i = self.act1(x_i) if not self.norm_before: - x_i = self.bn1(x_i) + x_i = self.bn1s[i](x_i) x.append(x_i) if self.scale > 1: @@ -168,18 +221,18 @@ def forward(self, x): x = self.conv2(x) if self.norm_before: x = self.bn2(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - if self.downsample is not None: - residual = self.downsample(residual) - - if self.se_layer: - x = self.se_layer(x) - - x += residual - x = self.act2(x) - - if not self.norm_before: + x += residual + x = self.act2(x) + else: + x = self.act2(x) x = self.bn2(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) + + x += residual if self.dropout_rate > 0: x = self.dropout(x) @@ -188,6 +241,27 @@ def forward(self, x): class Res2NetBNBlock(nn.Module): + """Res2Net bottleneck Block. + + Attributes: + in_channels: input channels. + channels: channels in bottleneck layer when width_factor=1. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + width_factor: multiplication factor for the number of channels in the bottleneck. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. 
+ dilation: dilation factor of the conv. kernels. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + se_r: squeeze-excitation compression ratio. + se_type: type of squeeze excitation in [t-se, cw-se, fw-se, cfw-se] + freq_pos_enc: use frequency wise positional encoder + num_feats: Number of features in dimension 2, needed if se_type!=cw-se or freq_pos_enc=True. + time_se: (legacy deprecated) If true, use t-se + """ + expansion = 4 def __init__( @@ -204,10 +278,11 @@ def __init__( norm_layer=None, norm_before=True, se_r=None, - time_se=False, + se_type="cw-se", + freq_pos_enc=False, num_feats=None, + time_se=False, ): - super().__init__() self.in_channels = in_channels @@ -265,13 +340,23 @@ def __init__( self.context = dilation self.downsample_factor = stride + self.pos_enc = None + if freq_pos_enc: + self.pos_enc = FreqPosEnc(num_feats) + if se_r is not None: if time_se: - self.se_layer = TSEBlock2D( - channels * self.expansion, num_feats, se_r, activation - ) - else: - self.se_layer = SEBlock2D(channels * self.expansion, se_r, activation) + se_type = "t-se" + + se_channels = channels * self.expansion + if se_type == "t-se": + self.se_layer = TSEBlock2d(se_channels, num_feats, se_r, activation) + elif se_type == "cw-se": + self.se_layer = SEBlock2d(se_channels, se_r, activation) + elif se_type == "fw-se": + self.se_layer = FwSEBlock2d(num_feats, se_r, activation) + elif se_type == "cfw-se": + self.se_layer = CFwSEBlock2d(se_channels, num_feats, se_r, activation) else: self.se_layer = None @@ -279,8 +364,23 @@ def __init__( def out_channels(self): return self.channels * self.expansion - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). 
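The `se_type` dispatch above selects where the squeeze happens: `cw-se` is the standard channel-wise squeeze over frequency and time, `t-se` keeps the time axis so the excitation can vary per frame, `fw-se` squeezes per frequency bin, and `cfw-se` combines the channel- and frequency-wise paths, while `x_mask` lets the squeeze exclude padded frames. A sketch of the channel-wise case with masked pooling (hypothetical minimal module, not hyperion's `SEBlock2d`):
```python
import torch
import torch.nn as nn

class ToySEBlock2d(nn.Module):
    """Channel-wise squeeze-excitation ("cw-se") with an optional time mask."""

    def __init__(self, channels, r=4):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Conv2d(channels, channels // r, 1), nn.ReLU(),
            nn.Conv2d(channels // r, channels, 1), nn.Sigmoid(),
        )

    def forward(self, x, x_mask=None):
        # squeeze: mean over (freq, time); with a mask, padded frames are excluded
        if x_mask is None:
            s = x.mean(dim=(2, 3), keepdim=True)
        else:
            m = x_mask.view(x.size(0), 1, 1, -1).to(x.dtype)  # (batch, 1, 1, time)
            s = (x * m).sum(dim=(2, 3), keepdim=True) / (
                m.sum(dim=(2, 3), keepdim=True) * x.size(2)
            )
        return x * self.fc(s)  # excitation: per-channel rescaling

se = ToySEBlock2d(32)
x = torch.randn(2, 32, 20, 100)  # (batch, channels, freq, time)
x_mask = (torch.arange(100) < 80).expand(2, 100)  # frames 80..99 are padding
print(se(x, x_mask=x_mask).shape)
```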
+ """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) + + if self.pos_enc is not None: + x = self.pos_enc(x) x = self.conv1(x) if self.norm_before: @@ -301,7 +401,7 @@ def forward(self, x): x_i = self.bn2s[i](x_i) x_i = self.act2(x_i) if not self.norm_before: - x_i = self.bn2(x_i) + x_i = self.bn2s[i](x_i) x.append(x_i) if self.scale > 1: @@ -315,18 +415,18 @@ def forward(self, x): x = self.conv3(x) if self.norm_before: x = self.bn3(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - if self.downsample is not None: - residual = self.downsample(residual) - - if self.se_layer: - x = self.se_layer(x) - - x += residual - x = self.act3(x) - - if not self.norm_before: + x += residual + x = self.act3(x) + else: + x = self.act3(x) x = self.bn3(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) + + x += residual if self.dropout_rate > 0: x = self.dropout(x) diff --git a/hyperion/torch/layer_blocks/resetdnn_blocks.py b/hyperion/torch/layer_blocks/resetdnn_blocks.py index 9d849719..dfea3720 100644 --- a/hyperion/torch/layer_blocks/resetdnn_blocks.py +++ b/hyperion/torch/layer_blocks/resetdnn_blocks.py @@ -7,7 +7,7 @@ import numpy as np import torch.nn as nn -from torch.nn import Conv1d, Linear, BatchNorm1d +from torch.nn import BatchNorm1d, Conv1d, Linear from ..layers import ActivationFactory as AF from ..layers import Dropout1d @@ -15,6 +15,21 @@ class ResETDNNBlock(ETDNNBlock): + """Building block for Residual Extended-TDNN. + + Args: + in_channels: input channels. + out_channels: output channels. + kernel_size: kernels size for the convolution. + dilation: kernel dilation. + activation: non-linear activation function object, string or config dict. + dropout_rate: dropout rate. + use_norm: if True, if uses layer normalization. + norm_layer: Normalization Layer constructor, if None it used BatchNorm1d. + norm_before: if True, layer normalization is before the non-linearity, else + after the non-linearity. + """ + def __init__( self, num_channels, @@ -39,7 +54,16 @@ def __init__( norm_before, ) - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_time). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_time). + """ residual = x x = self.conv1(x) diff --git a/hyperion/torch/layer_blocks/resnet1d_blocks.py b/hyperion/torch/layer_blocks/resnet1d_blocks.py index d1965708..4ad9b8ce 100644 --- a/hyperion/torch/layer_blocks/resnet1d_blocks.py +++ b/hyperion/torch/layer_blocks/resnet1d_blocks.py @@ -3,10 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import torch.nn as nn -from torch.nn import Conv1d, BatchNorm1d +from torch.nn import BatchNorm1d, Conv1d from ..layers import ActivationFactory as AF -from ..layers import Dropout1d, DropConnect1d, Interpolate +from ..layers import DropConnect1d, Dropout1d, Interpolate from ..layers.subpixel_convs import SubPixelConv1d from .se_blocks import SEBlock1d @@ -113,6 +113,23 @@ def _make_upsample( class ResNet1dBasicBlock(nn.Module): + """ResNet 1d basic Block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + drop_connect_rate: drop-connect rate for stochastic number of layers. + groups: number of groups in the convolutions. 
+ dilation: dilation factor of the conv. kernels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + expansion = 1 # __constants__ = ['downsample'] @@ -122,7 +139,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, drop_connect_rate=0, @@ -182,8 +199,19 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -195,21 +223,22 @@ def forward(self, x): x = self.bn1(x) x = self.conv2(x) - - if self.norm_before: + if self.norm_after: + x = self.act2(x) x = self.bn2(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) - - if self.downsample is not None: - residual = self.downsample(residual) + x += residual + else: + if self.norm_before: + x = self.bn2(x) - x += residual - x = self.act2(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.norm_after: - x = self.bn2(x) + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -218,6 +247,23 @@ def forward(self, x): class ResNet1dBasicDecBlock(nn.Module): + """ResNet 1d basic Block for decoders. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: upsampling stride of the convs. + dropout_rate: dropout rate. + drop_connect_rate: drop-connect rate for stochastic number of layers. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + expansion = 1 # __constants__ = ['downsample'] @@ -227,7 +273,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, drop_connect_rate=0, @@ -288,8 +334,19 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). 
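The rewritten residual forwards in this file make the two normalization placements explicit: with `norm_before` the order is conv, norm, add residual, then activation; with `norm_after` it is conv, activation, norm, then add residual, with no activation after the addition. A schematic sketch of the two branches (toy single-conv block, omitting SE and drop-connect):
```python
import torch
import torch.nn as nn

conv = nn.Conv1d(16, 16, 3, padding=1)
bn, act = nn.BatchNorm1d(16), nn.ReLU()

def block_norm_before(x):
    # conv -> norm -> (+ residual) -> activation
    return act(bn(conv(x)) + x)

def block_norm_after(x):
    # conv -> activation -> norm -> (+ residual), no activation after the add
    return bn(act(conv(x))) + x

x = torch.randn(2, 16, 50)
print(block_norm_before(x).shape, block_norm_after(x).shape)
```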
+ """ residual = x + if self.upsample is not None: + residual = self.upsample(residual) x = self.conv1(x) if self.norm_before: @@ -301,21 +358,22 @@ def forward(self, x): x = self.bn1(x) x = self.conv2(x) - - if self.norm_before: + if self.norm_after: + x = self.act2(x) x = self.bn2(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) - - if self.upsample is not None: - residual = self.upsample(residual) + x += residual + else: + if self.norm_before: + x = self.bn2(x) - x += residual - x = self.act2(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.norm_after: - x = self.bn2(x) + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -324,12 +382,31 @@ def forward(self, x): class ResNet1dBNBlock(nn.Module): + """ResNet 1d bottleneck Block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + width_factor: multiplication factor for the number of channels in the bottleneck. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + expansion: expansion factor of the bottlneck channels to output channels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + def __init__( self, in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, drop_connect_rate=0, @@ -401,8 +478,20 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ + residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -421,20 +510,22 @@ def forward(self, x): x = self.bn2(x) x = self.conv3(x) - if self.norm_before: + if self.norm_after: + x = self.act3(x) x = self.bn3(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) - - if self.downsample is not None: - residual = self.downsample(residual) + x += residual + else: + if self.norm_before: + x = self.bn3(x) - x += residual - x = self.act3(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.norm_after: - x = self.bn3(x) + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -443,12 +534,31 @@ def forward(self, x): class ResNet1dBNDecBlock(nn.Module): + """ResNet 1d bottleneck Block for decoders. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: upsampling stride of the convs. + dropout_rate: dropout rate. + width_factor: multiplication factor for the number of channels in the bottleneck. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. 
+ expansion: expansion factor of the bottlneck channels to output channels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + def __init__( self, in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, drop_connect_rate=0, @@ -514,8 +624,19 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.upsample is not None: + residual = self.upsample(residual) x = self.conv1(x) if self.norm_before: @@ -534,20 +655,22 @@ def forward(self, x): x = self.bn2(x) x = self.conv3(x) - if self.norm_before: + if self.norm_after: + x = self.act3(x) x = self.bn3(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) - - if self.upsample is not None: - residual = self.upsample(residual) + x += residual + else: + if self.norm_before: + x = self.bn3(x) - x += residual - x = self.act3(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.norm_after: - x = self.bn2(x) + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -556,6 +679,24 @@ def forward(self, x): class SEResNet1dBasicBlock(ResNet1dBasicBlock): + """Squeeze-excitation ResNet 1d basic Block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + drop_connect_rate: drop-connect rate for stochastic number of layers. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + se_r: squeeze-excitation compression ratio. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + expansion = 1 def __init__( @@ -563,7 +704,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, drop_connect_rate=0, @@ -592,8 +733,20 @@ def __init__( self.se_layer = SEBlock1d(channels, se_r, activation) - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). 
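`drop_connect_rate`, used by all the blocks in this file, implements stochastic depth: during training the whole residual branch is zeroed for a random subset of batch elements and rescaled, so at test time the block behaves like a plain residual block. A minimal sketch of that behavior (hypothetical function, analogous in spirit to hyperion's `DropConnect1d`):
```python
import torch

def drop_connect(x, rate=0.2, training=True):
    """Drops the whole residual branch per sample (stochastic depth)."""
    if not training or rate == 0:
        return x
    keep = 1.0 - rate
    # one Bernoulli draw per batch element, broadcast over channels and time
    mask = (torch.rand(x.shape[0], 1, 1, device=x.device) < keep).to(x.dtype)
    return x * mask / keep  # rescale so the expected value is unchanged

residual = torch.randn(4, 16, 50)  # skip path
branch = torch.randn(4, 16, 50)    # output of the conv stack
out = residual + drop_connect(branch, rate=0.2, training=True)
print(out.shape)  # torch.Size([4, 16, 50])
```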
+ """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -605,22 +758,24 @@ def forward(self, x): x = self.bn1(x) x = self.conv2(x) - - if self.norm_before: + if self.norm_after: + x = self.act2(x) x = self.bn2(x) + x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - x = self.se_layer(x) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) + x += residual + else: + if self.norm_before: + x = self.bn2(x) - if self.downsample is not None: - residual = self.downsample(residual) + x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - x += residual - x = self.act2(x) - - if self.norm_after: - x = self.bn2(x) + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -629,6 +784,24 @@ def forward(self, x): class SEResNet1dBasicDecBlock(ResNet1dBasicDecBlock): + """Squeeze-excitation ResNet 1d basic Block for decoders. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: upsampling stride of the convs. + dropout_rate: dropout rate. + drop_connect_rate: drop-connect rate for stochastic number of layers. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + se_r: squeeze-excitation compression ratio. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + expansion = 1 def __init__( @@ -636,7 +809,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, drop_connect_rate=0, @@ -669,8 +842,20 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.upsample is not None: + residual = self.upsample(residual) x = self.conv1(x) if self.norm_before: @@ -682,22 +867,24 @@ def forward(self, x): x = self.bn1(x) x = self.conv2(x) - - if self.norm_before: + if self.norm_after: + x = self.act2(x) x = self.bn2(x) + x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - x = self.se_layer(x) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) - - if self.upsample is not None: - residual = self.upsample(residual) + x += residual + else: + if self.norm_before: + x = self.bn2(x) - x += residual - x = self.act2(x) + x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.norm_after: - x = self.bn2(x) + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -706,12 +893,32 @@ def forward(self, x): class SEResNet1dBNBlock(ResNet1dBNBlock): + """Squeeze-excitation ResNet 1d bottleneck Block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. 
+ activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + width_factor: multiplication factor for the number of channels in the bottleneck. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + expansion: expansion factor of the bottlneck channels to output channels. + se_r: squeeze-excitation compression ratio. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + def __init__( self, in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, drop_connect_rate=0, @@ -742,8 +949,20 @@ def __init__( self.se_layer = SEBlock1d(channels, se_r, activation) - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -762,21 +981,24 @@ def forward(self, x): x = self.bn2(x) x = self.conv3(x) - if self.norm_before: + if self.norm_after: + x = self.act3(x) x = self.bn3(x) + x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - x = self.se_layer(x) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) + x += residual + else: + if self.norm_before: + x = self.bn3(x) - if self.downsample is not None: - residual = self.downsample(residual) + x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - x += residual - x = self.act3(x) - - if self.norm_after: - x = self.bn3(x) + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -785,12 +1007,32 @@ def forward(self, x): class SEResNet1dBNDecBlock(ResNet1dBNDecBlock): + """Squeeze-excitation ResNet 1d bottleneck Block for decoders. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + width_factor: multiplication factor for the number of channels in the bottleneck. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + expansion: expansion factor of the bottlneck channels to output channels. + se_r: squeeze-excitation compression ratio. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + def __init__( self, in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, drop_connect_rate=0, @@ -821,8 +1063,20 @@ def __init__( self.se_layer = SEBlock1d(channels, se_r, activation) - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. 
+ + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.upsample is not None: + residual = self.upsample(residual) x = self.conv1(x) if self.norm_before: @@ -841,21 +1095,24 @@ def forward(self, x): x = self.bn2(x) x = self.conv3(x) - if self.norm_before: + if self.norm_after: + x = self.act3(x) x = self.bn3(x) + x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - x = self.se_layer(x) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) - - if self.upsample is not None: - residual = self.upsample(residual) + x += residual + else: + if self.norm_before: + x = self.bn3(x) - x += residual - x = self.act3(x) + x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.norm_after: - x = self.bn3(x) + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -864,6 +1121,25 @@ def forward(self, x): class ResNet1dEndpoint(nn.Module): + """Class that connects the ouputs of the ResNet1d to the rest of the network + when using multilevel feature aggregation. + + It converts the features of all the levels that we are going to aggregate + to the same temporal scale. + + Attributes: + in_channels: input channels. + channels: output channels. + in_scale: resolution scale of the input feature maps. + scale: resolution scale of the output feature maps. + upsampling_mode: algorithm used for upsampling: 'nearest' | 'linear' | 'bilinear' + activation: Non-linear activation object, string of configuration dictionary. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + + """ + def __init__( self, in_channels, @@ -871,22 +1147,18 @@ def __init__( in_scale, scale, upsampling_mode="nearest", - activation={"name": "relu6", "inplace": True}, + activation={"name": "relu", "inplace": True}, + use_norm=True, norm_layer=None, norm_before=True, ): - """ - Class that connects the ouputs of the ResNet1d to the rest of the network - when using multilevel feature aggregation - It converts the features of all the levels that we are going to aggregate - to the same temporal scale - """ super().__init__() if norm_layer is None: norm_layer = nn.BatchNorm1d self.in_channels = in_channels self.channels = channels + self.use_norm = use_norm self.norm_before = norm_before self.rel_scale = in_scale / scale if scale >= in_scale: @@ -906,12 +1178,21 @@ def __init__( ) self.act = AF.create(activation) - if not self.norm_before: + if use_norm and not self.norm_before: self.bn = norm_layer(channels) - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). 
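A recurring change in these forward functions is the handling of the `norm_after` configuration: the activation and batch norm are now applied to the convolution output before the residual is added, while the `norm_before` path keeps the usual conv, norm, add, activation order. The toy block below isolates the two data paths; it is a minimal sketch, not the hyperion class.

```python
import torch
import torch.nn as nn


class ToyResBlock(nn.Module):
    """Minimal sketch of the two residual orderings used in the diff."""

    def __init__(self, channels: int, norm_before: bool = True):
        super().__init__()
        self.conv = nn.Conv1d(channels, channels, 3, padding=1)
        self.bn = nn.BatchNorm1d(channels)
        self.act = nn.ReLU()
        self.norm_before = norm_before

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = x
        x = self.conv(x)
        if self.norm_before:
            # conv -> norm -> add residual -> activation
            x = self.bn(x)
            x = x + residual
            return self.act(x)
        # norm_after: conv -> activation -> norm -> add residual
        x = self.act(x)
        x = self.bn(x)
        return x + residual


x = torch.randn(2, 8, 50)
assert ToyResBlock(8, norm_before=True)(x).shape == x.shape
assert ToyResBlock(8, norm_before=False)(x).shape == x.shape
```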
+ """ x = self.resample(x) x = self.act(x) - if not self.norm_before: + if self.use_norm and not self.norm_before: x = self.bn(x) return x diff --git a/hyperion/torch/layer_blocks/resnet2d_blocks.py b/hyperion/torch/layer_blocks/resnet2d_blocks.py index 6149319c..6c2dca74 100644 --- a/hyperion/torch/layer_blocks/resnet2d_blocks.py +++ b/hyperion/torch/layer_blocks/resnet2d_blocks.py @@ -3,7 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import torch.nn as nn -from torch.nn import Conv2d, BatchNorm2d, Dropout2d +from torch.nn import BatchNorm2d, Conv2d, Dropout2d from ..layers import ActivationFactory as AF from ..layers.subpixel_convs import SubPixelConv2d @@ -79,6 +79,23 @@ def _make_upsample(in_channels, out_channels, stride, norm_layer, norm_before): class ResNet2dBasicBlock(nn.Module): + """ResNet 2d basic Block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + + """ + expansion = 1 def __init__( @@ -86,7 +103,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, groups=1, @@ -140,8 +157,19 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -154,17 +182,16 @@ def forward(self, x): x = self.conv2(x) - if self.norm_before: - x = self.bn2(x) - - if self.downsample is not None: - residual = self.downsample(residual) - - x += residual - x = self.act2(x) - if self.norm_after: + x = self.act2(x) x = self.bn2(x) + x += residual + else: + if self.norm_before: + x = self.bn2(x) + + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -173,6 +200,23 @@ def forward(self, x): class ResNet2dBasicDecBlock(nn.Module): + """ResNet 2d basic Block for decoders. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: upsampling stride of the convs. + dropout_rate: dropout rate. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. 
+ + """ + expansion = 1 def __init__( @@ -180,7 +224,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, groups=1, @@ -235,8 +279,19 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.upsample is not None: + residual = self.upsample(residual) x = self.conv1(x) if self.norm_before: @@ -248,18 +303,16 @@ def forward(self, x): x = self.bn1(x) x = self.conv2(x) - - if self.norm_before: - x = self.bn2(x) - - if self.upsample is not None: - residual = self.upsample(residual) - - x += residual - x = self.act2(x) - if self.norm_after: + x = self.act2(x) x = self.bn2(x) + x += residual + else: + if self.norm_before: + x = self.bn2(x) + + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -268,12 +321,29 @@ def forward(self, x): class ResNet2dBNBlock(nn.Module): + """ResNet 2d bottleneck Block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size in bottleneck. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + expansion: expansion factor of the bottlneck channels to output channels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + def __init__( self, in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, groups=1, @@ -339,8 +409,19 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -359,17 +440,16 @@ def forward(self, x): x = self.bn2(x) x = self.conv3(x) - if self.norm_before: - x = self.bn3(x) - - if self.downsample is not None: - residual = self.downsample(residual) - - x += residual - x = self.act3(x) - if self.norm_after: + x = self.act3(x) x = self.bn3(x) + x += residual + else: + if self.norm_before: + x = self.bn3(x) + + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -378,12 +458,29 @@ def forward(self, x): class ResNet2dBNDecBlock(nn.Module): + """ResNet 2d bottleneck Block decoder. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size in bottleneck. + activation: Non-linear activation object, string of configuration dictionary. + stride: upsampling stride of the convs. + dropout_rate: dropout rate. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + expansion: expansion factor of the bottlneck channels to output channels. 
+ use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + def __init__( self, in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, groups=1, @@ -443,8 +540,19 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.upsample is not None: + residual = self.upsample(residual) x = self.conv1(x) if self.norm_before: @@ -463,17 +571,16 @@ def forward(self, x): x = self.bn2(x) x = self.conv3(x) - if self.norm_before: + if self.norm_after: + x = self.act3(x) x = self.bn3(x) + x += residual + else: + if self.norm_before: + x = self.bn3(x) - if self.upsample is not None: - residual = self.upsample(residual) - - x += residual - x = self.act3(x) - - if self.norm_after: - x = self.bn2(x) + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -482,6 +589,24 @@ def forward(self, x): class SEResNet2dBasicBlock(ResNet2dBasicBlock): + """Squeeze-excitation ResNet 2d basic Block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + drop_connect_rate: drop-connect rate for stochastic number of layers. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + se_r: squeeze-excitation compression ratio. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + expansion = 1 def __init__( @@ -489,7 +614,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, groups=1, @@ -516,8 +641,20 @@ def __init__( self.se_layer = SEBlock2d(channels, se_r, activation) - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). 
+ """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -529,19 +666,18 @@ def forward(self, x): x = self.bn1(x) x = self.conv2(x) - - if self.norm_before: - x = self.bn2(x) - - if self.downsample is not None: - residual = self.downsample(residual) - - x = self.se_layer(x) - x += residual - x = self.act2(x) - if self.norm_after: + x = self.act2(x) x = self.bn2(x) + x = self.se_layer(x, x_mask=x_mask) + x += residual + else: + if self.norm_before: + x = self.bn2(x) + + x = self.se_layer(x, x_mask=x_mask) + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -550,6 +686,24 @@ def forward(self, x): class SEResNet2dBasicDecBlock(ResNet2dBasicDecBlock): + """Squeeze-excitation ResNet 2d basic Block for decoders. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + drop_connect_rate: drop-connect rate for stochastic number of layers. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + se_r: squeeze-excitation compression ratio. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + expansion = 1 def __init__( @@ -557,7 +711,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, groups=1, @@ -588,8 +742,20 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.upsample is not None: + residual = self.upsample(residual) x = self.conv1(x) if self.norm_before: @@ -601,19 +767,18 @@ def forward(self, x): x = self.bn1(x) x = self.conv2(x) - - if self.norm_before: - x = self.bn2(x) - - if self.upsample is not None: - residual = self.upsample(residual) - - x = self.se_layer(x) - x += residual - x = self.act2(x) - if self.norm_after: + x = self.act2(x) x = self.bn2(x) + x = self.se_layer(x, x_mask=x_mask) + x += residual + else: + if self.norm_before: + x = self.bn2(x) + + x = self.se_layer(x, x_mask=x_mask) + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -622,12 +787,30 @@ def forward(self, x): class SEResNet2dBNBlock(ResNet2dBNBlock): + """Squeeze-excitation ResNet 2d bottleneck Block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + expansion: expansion factor of the bottlneck channels to output channels. + se_r: squeeze-excitation compression ratio. + use_norm: if True, it uses normalization layers, otherwise it does not. 
+ norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + def __init__( self, in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, groups=1, @@ -656,8 +839,20 @@ def __init__( self.se_layer = SEBlock2d(channels, se_r, activation) - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -676,18 +871,18 @@ def forward(self, x): x = self.bn2(x) x = self.conv3(x) - if self.norm_before: - x = self.bn3(x) - - if self.downsample is not None: - residual = self.downsample(residual) - - x = self.se_layer(x) - x += residual - x = self.act3(x) - if self.norm_after: + x = self.act3(x) x = self.bn3(x) + x = self.se_layer(x, x_mask=x_mask) + x += residual + else: + if self.norm_before: + x = self.bn3(x) + + x = self.se_layer(x, x_mask=x_mask) + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -696,12 +891,30 @@ def forward(self, x): class SEResNet2dBNDecBlock(ResNet2dBNDecBlock): + """Squeeze-excitation ResNet 2d bottleneck Block for decoders. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + expansion: expansion factor of the bottlneck channels to output channels. + se_r: squeeze-excitation compression ratio. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + def __init__( self, in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, groups=1, @@ -730,8 +943,20 @@ def __init__( self.se_layer = SEBlock2d(channels, se_r, activation) - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). 
+ """ residual = x + if self.upsample is not None: + residual = self.upsample(residual) x = self.conv1(x) if self.norm_before: @@ -750,18 +975,18 @@ def forward(self, x): x = self.bn2(x) x = self.conv3(x) - if self.norm_before: - x = self.bn3(x) - - if self.upsample is not None: - residual = self.upsample(residual) - - x = self.se_layer(x) - x += residual - x = self.act3(x) - if self.norm_after: + x = self.act3(x) x = self.bn3(x) + x = self.se_layer(x, x_mask=x_mask) + x += residual + else: + if self.norm_before: + x = self.bn3(x) + + x = self.se_layer(x, x_mask=x_mask) + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) diff --git a/hyperion/torch/layer_blocks/resnet_blocks.py b/hyperion/torch/layer_blocks/resnet_blocks.py index 439a440a..17b6ce25 100644 --- a/hyperion/torch/layer_blocks/resnet_blocks.py +++ b/hyperion/torch/layer_blocks/resnet_blocks.py @@ -2,10 +2,10 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - +import torch import torch.nn as nn -from torch.nn import Conv2d, BatchNorm2d, Dropout2d import torch.nn.functional as nnf +from torch.nn import BatchNorm2d, Conv2d, Dropout2d from ..layers import ActivationFactory as AF @@ -30,7 +30,6 @@ def _conv1x1(in_channels, out_channels, stride=1, bias=False): def _make_downsample(in_channels, out_channels, stride, norm_layer, norm_before): - if norm_before: return nn.Sequential( _conv1x1(in_channels, out_channels, stride, bias=False), @@ -40,6 +39,15 @@ def _make_downsample(in_channels, out_channels, stride, norm_layer, norm_before) return _conv1x1(in_channels, out_channels, stride, bias=True) +class FreqPosEnc(nn.Module): + def __init__(self, num_feats): + super().__init__() + self.pos_enc = nn.Parameter(torch.zeros((num_feats, 1))) + + def forward(self, x): + return x + self.pos_enc + + class ResNetInputBlock(nn.Module): """Input block for ResNet architecture @@ -67,7 +75,6 @@ def __init__( norm_before=True, do_maxpool=True, ): - super().__init__() padding = int((kernel_size - 1) / 2) @@ -96,7 +103,6 @@ def __init__( self.downsample_factor *= 2 def forward(self, x): - x = self.conv(x) if self.norm_before: x = self.bn(x) @@ -112,9 +118,25 @@ def forward(self, x): class ResNetBasicBlock(nn.Module): - expansion = 1 + """ResNet basic Block. + + Attributes: + in_channels: input channels. + channels: output channels. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + + dropout_rate: dropout rate. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + freq_pos_enc: use frequency wise positional encoder + num_feats: Number of features in dimension 2, needed if freq_pos_enc=True. 
- # __constants__ = ['downsample'] + """ + + expansion = 1 def __init__( self, @@ -127,8 +149,9 @@ def __init__( dilation=1, norm_layer=None, norm_before=True, + freq_pos_enc=False, + num_feats=None, ): - super().__init__() if norm_layer is None: norm_layer = nn.BatchNorm2d @@ -161,13 +184,30 @@ def __init__( self.context = dilation + stride self.downsample_factor = stride + self.pos_enc = None + if freq_pos_enc: + self.pos_enc = FreqPosEnc(num_feats*stride) @property def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) + + if self.pos_enc is not None: + x = self.pos_enc(x) x = self.conv1(x) if self.norm_before: @@ -182,15 +222,12 @@ def forward(self, x): if self.norm_before: x = self.bn2(x) - - if self.downsample is not None: - residual = self.downsample(residual) - - x += residual - x = self.act2(x) - - if not self.norm_before: + x += residual + x = self.act2(x) + else: + x = self.act2(x) x = self.bn2(x) + x += residual if self.dropout_rate > 0: x = self.dropout(x) @@ -199,6 +236,22 @@ def forward(self, x): class ResNetBNBlock(nn.Module): + """Res2Net bottleneck Block. + + Attributes: + in_channels: input channels. + channels: channels in bottleneck layer when width_factor=1. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + freq_pos_enc: use frequency wise positional encoder + num_feats: Number of features in dimension 2, needed if freq_pos_enc=True. + """ + expansion = 4 # __constants__ = ['downsample'] @@ -213,8 +266,9 @@ def __init__( dilation=1, norm_layer=None, norm_before=True, + freq_pos_enc=False, + num_feats=None, ): - super().__init__() self.in_channels = in_channels @@ -251,13 +305,30 @@ def __init__( self.context = dilation self.downsample_factor = stride + self.pos_enc = None + if freq_pos_enc: + self.pos_enc = FreqPosEnc(num_feats) @property def out_channels(self): return self.channels * self.expansion - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) + + if self.pos_enc is not None: + x = self.pos_enc(x) x = self.conv1(x) if self.norm_before: @@ -276,15 +347,12 @@ def forward(self, x): x = self.conv3(x) if self.norm_before: x = self.bn3(x) - - if self.downsample is not None: - residual = self.downsample(residual) - - x += residual - x = self.act3(x) - - if not self.norm_before: + x += residual + x = self.act3(x) + else: + x = self.act3(x) x = self.bn3(x) + x += residual if self.dropout_rate > 0: x = self.dropout(x) @@ -305,6 +373,18 @@ def forward(self, x): class ResNetEndpointBlock(nn.Module): + """ResNet endpoint basic block. 
This is used as output block when + the output combines feature maps from different resolution levels. + + Attributes: + in_channels: input channels. + out_channels: output channels. + scale: interpolation factor. + activation: Non-linear activation object, string of configuration dictionary. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + def __init__( self, in_channels, @@ -314,7 +394,6 @@ def __init__( norm_layer=None, norm_before=True, ): - super().__init__() if norm_layer is None: @@ -334,7 +413,16 @@ def __init__( if self.scale > 1: self.upsample = Interpolate(scale_factor=scale, mode="nearest") - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_height, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_height, out_width). + """ if self.in_channels != self.out_channels: x = self.conv(x) diff --git a/hyperion/torch/layer_blocks/se_blocks.py b/hyperion/torch/layer_blocks/se_blocks.py index 3d33f7d4..fd98db2e 100644 --- a/hyperion/torch/layer_blocks/se_blocks.py +++ b/hyperion/torch/layer_blocks/se_blocks.py @@ -5,13 +5,21 @@ import torch import torch.nn as nn -from torch.nn import Conv2d, Conv1d +from torch.nn import Conv1d, Conv2d from ..layers import ActivationFactory as AF -class SEBlock2D(nn.Module): - """From https://arxiv.org/abs/1709.01507""" +class SEBlock2d(nn.Module): + """Squeeze-excitation block 2d + from https://arxiv.org/abs/1709.01507. + + Attributes: + num_channels: input/output channels. + r: Squeeze-excitation compression ratio. + activation: Non-linear activation object, string of configuration dictionary. + + """ def __init__( self, num_channels, r=16, activation={"name": "relu", "inplace": True} @@ -26,16 +34,62 @@ def __init__( ) self.sigmoid = nn.Sigmoid() - def forward(self, x): - z = torch.mean(x, dim=(2, 3), keepdim=True) - scale = self.sigmoid(self.conv2(self.act(self.conv1(z)))) + def _standardize_mask(self, mask): + if mask.dim() == 2: + return mask.view(mask.size(0), 1, 1, mask.size(-1)) + + if mask.dim() == 3: + return mask.unsqueeze(1) + + return mask + + def compute_scale_logits(self, x, x_mask=None): + """Computes the scale logits before the sigmoid. + + Args: + x: input tensor with shape = (batch, channels, height, width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, channels, 1, 1). + """ + if x_mask is None: + z = torch.mean(x, dim=(2, 3), keepdim=True) + else: + x_mask = self._standardize_mask(x_mask) + total = torch.mean(x_mask, dim=(2, 3), keepdim=True) + z = torch.mean(x * x_mask, dim=(2, 3), keepdim=True) / total + + return self.conv2(self.act(self.conv1(z))) + + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, channels, height, width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, channels, height, width).
+ """ + scale_logits = self.compute_scale_logits(x, x_mask) + scale = self.sigmoid(scale_logits) y = scale * x return y -class TSEBlock2D(nn.Module): +class TSEBlock2d(nn.Module): """From https://arxiv.org/abs/1709.01507 - Modified to do pooling only in time dimension + Modified to do pooling only in time dimension. + + Attributes: + num_channels: input/output channels. + num_feats: Number of features in dimension 2. + r: Squeeze-excitation compression ratio. + activation: Non-linear activation object, string of configuration dictionary. + """ def __init__( @@ -62,10 +116,35 @@ def __init__( ) self.sigmoid = nn.Sigmoid() - def forward(self, x): + def _standardize_mask(self, mask): + if mask.dim() == 2: + return mask.view(mask.size(0), 1, 1, mask.size(-1)) + + if mask.dim() == 3: + return mask.unsqueeze(1) + + return mask + + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, channels, heigh, width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, channels, heigh, width). + """ num_feats = x.shape[2] num_channels = x.shape[1] - z = torch.mean(x, dim=-1, keepdim=True) + if x_mask is None: + z = torch.mean(x, dim=-1, keepdim=True) + else: + x_mask = self._standardize_mask(x_mask) + total = torch.mean(x_mask, dim=-1, keepdim=True) + z = torch.mean(x * x_mask, dim=-1, keepdim=True) / total + z = z.view(-1, self.num_channels_1d, 1, 1) scale = self.sigmoid(self.conv2(self.act(self.conv1(z)))) scale = scale.view(-1, num_channels, num_feats, 1) @@ -73,9 +152,89 @@ def forward(self, x): return y +class FwSEBlock2d(SEBlock2d): + """frequency-wise Squeeze-excitation block 2d + + Attributes: + num_feats: input/output channels. + r: Squeeze-excitation compression ratio. + activation: Non-linear activation object, string of configuration dictionary. + + """ + + def __init__(self, num_feats, r=16, activation={"name": "relu", "inplace": True}): + super().__init__(num_feats, r, activation) + + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, channels, heigh, width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time) + Returns: + Tensor with shape = (batch, channels, heigh, width). + """ + x = x.transpose(1, 2) + y = super().forward(x, x_mask) + y = y.transpose(1, 2).contiguous() + return y + + +class CFwSEBlock2d(nn.Module): + """2-d channel and frequency wise squeeze-excitation block + + Attributes: + num_channels: input/output channels. + num_feats: Number of features in dimension 2. + r: Squeeze-excitation compression ratio. + activation: Non-linear activation object, string of configuration dictionary. + + """ + + def __init__( + self, + num_channels, + num_feats, + r=16, + activation={"name": "relu", "inplace": True}, + ): + super().__init__() + self.cw_se = SEBlock2d(num_channels, r, activation) + # the bottlenet features will have at least dimension 4 + if num_feats // r < 4: + r = num_feats // 4 + + self.fw_se = SEBlock2d(num_feats, r, activation) + + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, channels, heigh, width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time) + Returns: + Tensor with shape = (batch, channels, heigh, width). 
+ """ + cw_scale_logits = self.cw_se.compute_scale_logits(x, x_mask) + fw_scale_logits = self.fw_se.compute_scale_logits( + x.transpose(1, 2), x_mask + ).transpose(1, 2) + scale_logits = cw_scale_logits + fw_scale_logits + scale = torch.sigmoid(scale_logits) + y = scale * x + return y + + class SEBlock1d(nn.Module): """1d Squeeze Excitation version of https://arxiv.org/abs/1709.01507 + + Attributes: + num_channels: input/output channels. + r: Squeeze-excitation compression ratio. + activation: Non-linear activation object, string of configuration dictionary. """ def __init__( @@ -91,13 +250,35 @@ def __init__( ) self.sigmoid = nn.Sigmoid() - def forward(self, x): - z = torch.mean(x, dim=2, keepdim=True) + def _standardize_mask(self, mask): + if mask.dim() == 2: + return mask.unsqueeze(1) + + return mask + + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, channels, time). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time) + + Returns: + Tensor with shape = (batch, channels, time). + """ + if x_mask is None: + z = torch.mean(x, dim=2, keepdim=True) + else: + x_mask = self._standardize_mask(x_mask) + total = torch.mean(x_mask, dim=-1, keepdim=True) + z = torch.mean(x * x_mask, dim=-1, keepdim=True) / total + scale = self.sigmoid(self.conv2(self.act(self.conv1(z)))) y = scale * x return y # aliases to mantein backwards compatibility -SEBlock2d = SEBlock2D -TSEBlock2d = TSEBlock2D +SEBlock2D = SEBlock2d +TSEBlock2D = TSEBlock2d diff --git a/hyperion/torch/layer_blocks/seresnet_blocks.py b/hyperion/torch/layer_blocks/seresnet_blocks.py index a5a7fecd..9c25055b 100644 --- a/hyperion/torch/layer_blocks/seresnet_blocks.py +++ b/hyperion/torch/layer_blocks/seresnet_blocks.py @@ -5,14 +5,33 @@ import torch import torch.nn as nn -from torch.nn import Conv2d, Linear, BatchNorm2d, Dropout2d +from torch.nn import BatchNorm2d, Conv2d, Dropout2d, Linear from ..layers import ActivationFactory as AF -from .se_blocks import SEBlock2D, TSEBlock2D from .resnet_blocks import ResNetBasicBlock, ResNetBNBlock +from .se_blocks import CFwSEBlock2d, FwSEBlock2d, SEBlock2d, TSEBlock2d class SEResNetBasicBlock(ResNetBasicBlock): + """Squeeze-excitation ResNet basic Block. + + Attributes: + in_channels: input channels. + channels: output channels. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + se_r: squeeze-excitation compression ratio. + se_type: type of squeeze excitation in [t-se, cw-se, fw-se, cfw-se] + freq_pos_enc: use frequency wise positional encoder. + num_feats: Number of features in dimension 2, needed if time_se=True. 
+ time_se: (legacy deprecated) If true, use t-se + """ + def __init__( self, in_channels, @@ -25,10 +44,11 @@ def __init__( norm_layer=None, norm_before=True, se_r=16, - time_se=False, + se_type="cw-se", + freq_pos_enc=False, num_feats=None, + time_se=False, ): - super().__init__( in_channels, channels, @@ -39,16 +59,38 @@ def __init__( dilation=dilation, norm_layer=norm_layer, norm_before=norm_before, + freq_pos_enc=freq_pos_enc, + num_feats=num_feats, ) if time_se: - self.se_layer = TSEBlock2D(channels, num_feats, se_r, activation) - else: - self.se_layer = SEBlock2D(channels, se_r, activation) - - def forward(self, x): + se_type = "t-se" + + if se_type == "t-se": + self.se_layer = TSEBlock2d(channels, num_feats, se_r, activation) + elif se_type == "cw-se": + self.se_layer = SEBlock2d(channels, se_r, activation) + elif se_type == "fw-se": + self.se_layer = FwSEBlock2d(num_feats, se_r, activation) + elif se_type == "cfw-se": + self.se_layer = CFwSEBlock2d(channels, num_feats, se_r, activation) + + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.pos_enc is not None: + x = self.pos_enc(x) + x = self.conv1(x) if self.norm_before: x = self.bn1(x) @@ -60,18 +102,19 @@ def forward(self, x): x = self.conv2(x) - if self.norm_before: - x = self.bn2(x) - if self.downsample is not None: residual = self.downsample(residual) - x = self.se_layer(x) - x += residual - x = self.act2(x) - - if not self.norm_before: + if self.norm_before: + x = self.bn2(x) + x = self.se_layer(x, x_mask=x_mask) + x += residual + x = self.act2(x) + else: + x = self.act2(x) x = self.bn2(x) + x = self.se_layer(x, x_mask=x_mask) + x += residual if self.dropout_rate > 0: x = self.dropout(x) @@ -80,6 +123,25 @@ def forward(self, x): class SEResNetBNBlock(ResNetBNBlock): + """Squeeze-excitation ResNet bottleneck Block. + + Attributes: + in_channels: input channels. + channels: channels in bottleneck layer when width_factor=1. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + se_r=None: squeeze-excitation compression ratio. + se_type: type of squeeze excitation in [t-se, cw-se, fw-se, cfw-se] + freq_pos_enc: use frequency wise positional encoder. + num_feats: Number of features in dimension 2, needed if time_se=True. 
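Both SE-ResNet blocks now select their SE layer from the same four `se_type` names, with the legacy `time_se` flag mapped to `t-se`. Below is a compact, hypothetical mirror of that dispatch for reference; the import path follows the diff headers and the `activation` default is assumed from the SE block constructors.

```python
from hyperion.torch.layer_blocks.se_blocks import (
    CFwSEBlock2d, FwSEBlock2d, SEBlock2d, TSEBlock2d)


def make_se_layer(se_type, channels, num_feats, se_r=16, time_se=False,
                  activation={"name": "relu", "inplace": True}):
    if time_se:  # legacy flag maps to the time-SE variant
        se_type = "t-se"
    if se_type == "t-se":
        return TSEBlock2d(channels, num_feats, se_r, activation)
    if se_type == "cw-se":
        return SEBlock2d(channels, se_r, activation)
    if se_type == "fw-se":
        return FwSEBlock2d(num_feats, se_r, activation)
    if se_type == "cfw-se":
        return CFwSEBlock2d(channels, num_feats, se_r, activation)
    raise ValueError(f"unknown se_type {se_type}")
```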
+ time_se: (legacy deprecated) If true, use t-se + """ + def __init__( self, in_channels, @@ -92,10 +154,11 @@ def __init__( norm_layer=None, norm_before=True, se_r=16, - time_se=False, + se_type="cw-se", + freq_pos_enc=False, num_feats=None, + time_se=False, ): - super().__init__( in_channels, channels, @@ -106,17 +169,40 @@ def __init__( dilation=dilation, norm_layer=norm_layer, norm_before=norm_before, + freq_pos_enc=freq_pos_enc, + num_feats=num_feats, ) if time_se: - self.se_layer = TSEBlock2D( - channels * self.expansion, num_feats, se_r, activation - ) - else: - self.se_layer = SEBlock2D(channels * self.expansion, se_r, activation) - - def forward(self, x): + se_type = "t-se" + + se_channels = channels * self.expansion + if se_type == "t-se": + self.se_layer = TSEBlock2d(se_channels, num_feats, se_r, activation) + elif se_type == "cw-se": + self.se_layer = SEBlock2d(se_channels, se_r, activation) + elif se_type == "fw-se": + self.se_layer = FwSEBlock2d(num_feats, se_r, activation) + elif se_type == "cfw-se": + self.se_layer = CFwSEBlock2d(se_channels, num_feats, se_r, activation) + + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) + + if self.pos_enc is not None: + x = self.pos_enc(x) x = self.conv1(x) if self.norm_before: @@ -135,16 +221,14 @@ def forward(self, x): x = self.conv3(x) if self.norm_before: x = self.bn3(x) - - if self.downsample is not None: - residual = self.downsample(residual) - - x = self.se_layer(x) - x += residual - x = self.act3(x) - - if not self.norm_before: + x = self.se_layer(x, x_mask=x_mask) + x += residual + x = self.act3(x) + else: + x = self.act3(x) x = self.bn3(x) + x = self.se_layer(x, x_mask=x_mask) + x += residual if self.dropout_rate > 0: x = self.dropout(x) diff --git a/hyperion/torch/layer_blocks/spine_blocks.py b/hyperion/torch/layer_blocks/spine_blocks.py index 21978192..bb7a454a 100644 --- a/hyperion/torch/layer_blocks/spine_blocks.py +++ b/hyperion/torch/layer_blocks/spine_blocks.py @@ -3,14 +3,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging + import torch.nn as nn -from torch.nn import Conv2d, BatchNorm2d, Dropout2d import torch.nn.functional as nnf +from torch.nn import BatchNorm2d, Conv2d, Dropout2d -from ..layers.subpixel_convs import SubPixelConv2d from ..layers import ActivationFactory as AF - -import logging +from ..layers.subpixel_convs import SubPixelConv2d class Interpolate(nn.Module): @@ -132,6 +132,16 @@ def __init__( self.act1 = AF.create(activation) def forward(self, x): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ x = self.conv1(x) if self.norm_before: x = self.bn1(x) @@ -200,6 +210,16 @@ def __init__( ) def forward(self, x): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). 
+ + Returns: + Tensor with shape = (batch, out_channels, out_height, out_width). + """ if self.do_endpoint_conv and self.in_channels != self.channels: x = self.conv1(x) if self.norm_before: @@ -254,6 +274,16 @@ def __init__( self.bn2 = norm_layer(out_channels) def forward(self, x): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_height, in_width). + + Returns: + Tensor with shape = (batch, out_channels, out_height, out_width). + """ x = self.conv1(x) if self.norm_before: x = self.bn1(x) diff --git a/hyperion/torch/layer_blocks/tdnn_blocks.py b/hyperion/torch/layer_blocks/tdnn_blocks.py index 8fcbb056..c1a21d52 100644 --- a/hyperion/torch/layer_blocks/tdnn_blocks.py +++ b/hyperion/torch/layer_blocks/tdnn_blocks.py @@ -4,13 +4,28 @@ """ import torch.nn as nn -from torch.nn import Conv1d, Linear, BatchNorm1d +from torch.nn import BatchNorm1d, Conv1d, Linear from ..layers import ActivationFactory as AF from ..layers import Dropout1d class TDNNBlock(nn.Module): + """Building block for TDNN. + + Args: + in_channels: input channels. + out_channels: output channels. + kernel_size: kernel size for the convolution. + dilation: kernel dilation. + activation: non-linear activation function object, string or config dict. + dropout_rate: dropout rate. + use_norm: if True, it uses layer normalization. + norm_layer: Normalization Layer constructor, if None it uses BatchNorm1d. + norm_before: if True, layer normalization is before the non-linearity, else + after the non-linearity. + """ + def __init__( self, in_channels, @@ -56,15 +71,25 @@ def __init__( ) def freeze(self): + """Freezes trainable parameters.""" for param in self.parameters(): param.requires_grad = False def unfreeze(self): + """Unfreezes trainable parameters.""" for param in self.parameters(): param.requires_grad = True def forward(self, x): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_time). + + Returns: + Tensor with shape = (batch, out_channels, out_time). + """ x = self.conv1(x) if self.norm_before:
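`TDNNBlock` is a plain 1d convolution followed by normalization and activation (or activation and then normalization when `norm_before=False`). A usage sketch; the argument names follow the docstring above, and whether the time length is preserved depends on the block's padding, which this diff does not show.

```python
import torch
# import path and argument names assumed from the diff above
from hyperion.torch.layer_blocks.tdnn_blocks import TDNNBlock

block = TDNNBlock(in_channels=80, out_channels=512, kernel_size=5, dilation=1)
x = torch.randn(4, 80, 300)  # (batch, in_channels, time)
y = block(x)                 # conv1d -> norm -> activation (norm_before=True)
print(y.shape)               # (4, 512, T_out); T_out == 300 only if the conv pads
```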
diff --git a/hyperion/torch/layer_blocks/transducer_joiner.py b/hyperion/torch/layer_blocks/transducer_joiner.py new file mode 100644 index 00000000..d2a7310d --- /dev/null +++ b/hyperion/torch/layer_blocks/transducer_joiner.py @@ -0,0 +1,70 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba, Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Optional, Tuple + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +import torch +import torch.nn as nn + + +class TransducerJoiner(nn.Module): + """ RNN-T Joiner network. + Implementation based on + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/transducer_stateless7/joiner.py + + Attributes: + enc_feats: encoder output feature dimension. + pred_feats: predictor output feature dimension. + hid_feats: hidden feature dimension of the joiner. + vocab_size: vocabulary size. + """ + + def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, + vocab_size: int): + super().__init__() + self.enc_feats = enc_feats + self.pred_feats = pred_feats + self.hid_feats = hid_feats + self.vocab_size = vocab_size + + self.enc_proj = nn.Linear(enc_feats, hid_feats) + self.pred_proj = nn.Linear(pred_feats, hid_feats) + self.output = nn.Linear(hid_feats, vocab_size) + + def get_config(self): + config = { + "joiner_type": "basic", + "hid_feats": self.hid_feats, + } + return config + + def forward(self, + enc_out: torch.Tensor, + pred_out: torch.Tensor, + project_input: bool = True) -> torch.Tensor: + """ + Args: + enc_out: output from the encoder with shape = (N, T, C) or (N, T, s_range, C) + pred_out: output from the predictor with shape = (N, U, C) or (N, T, s_range, C) + project_input: if True, it projects the encoder and predictor features + in the forward function, if False, it expects them to be projected outside. + Returns: + Symbols' logits of shape (N, T, U, vocab_size). + """ + assert enc_out.ndim == pred_out.ndim + assert enc_out.ndim in (3, 4) + + if enc_out.ndim == 3: + enc_out = enc_out.unsqueeze(2) # (N, T, 1, C) + pred_out = pred_out.unsqueeze(1) # (N, 1, U, C) + + if project_input: + x = self.enc_proj(enc_out) + self.pred_proj(pred_out) + else: + x = enc_out + pred_out + + x = torch.tanh(x) + logits = self.output(x) + return logits
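The joiner broadcasts encoder frames against predictor outputs: after projection, `(N, T, 1, C) + (N, 1, U, C)` gives one combined feature per (frame, label) pair, which tanh and the output layer turn into logits. A usage sketch; the import path is assumed from the new file's name.

```python
import torch
from hyperion.torch.layer_blocks.transducer_joiner import TransducerJoiner

N, T, U = 2, 50, 10
joiner = TransducerJoiner(enc_feats=512, pred_feats=512,
                          hid_feats=512, vocab_size=1000)
enc_out = torch.randn(N, T, 512)    # T encoder frames
pred_out = torch.randn(N, U, 512)   # U predictor positions
logits = joiner(enc_out, pred_out)  # project, broadcast-add, tanh, output
print(logits.shape)                 # torch.Size([2, 50, 10, 1000])
```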
+ """ + + def __init__(self, + vocab_size: int, + embed_dim: int, + num_layers: int, + hid_feats: int, + out_feats: Optional[int] = None, + embed_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + rnn_type: str = "lstm", + blank_id: int = 0): + super().__init__() + self.embedding = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=embed_dim, + padding_idx=blank_id, + ) + self.embed_dropout = nn.Dropout(embed_dropout_rate) + if rnn_type == "lstm": + self.rnn = nn.LSTM( + input_size=embed_dim, + hidden_size=hid_feats, + num_layers=num_layers, + batch_first=True, + dropout=rnn_dropout_rate, + ) + elif rnn_type == "gru": + self.rnn = nn.GRU( + input_size=embed_dim, + hidden_size=hid_feats, + num_layers=num_layers, + batch_first=True, + dropout=rnn_dropout_rate, + ) + else: + raise Exception(f"Unknown RNN type {rnn_type}") + + self.out_feats = out_feats + self.blank_id = blank_id + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.num_layers = num_layers + self.hid_feats = hid_feats + self.embed_dropout_rate = embed_dropout_rate + self.rnn_dropout_rate = rnn_dropout_rate + if out_feats is None: + out_feats = hid_feats + + self.out_feats = out_feats + if out_feats != hid_feats: + self.output_proj = nn.Linear(hid_feats, out_feats) + else: + self.output_proj = None + + def get_config(self): + config = { + "pred_type": "rnn", + "vocab_size": self.vocab_size, + "embed_dim": self.embed_dim, + "num_layers": self.num_layers, + "hid_feats": self.hid_feats, + "out_feats": self.out_feats, + "embed_dropout_rate": self.embed_dropout_rate, + "rnn_dropout_rate": self.rnn_dropout_rate, + "rnn_type": self.rnn_type, + "blank_id": self.blank_id, + } + return config + + def forward( + self, + y: torch.Tensor, + states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + """ + Args: + y: previous y_{ prepended. + states: tuple of tensors containing RNN layers states + Returns: + - rnn_output, a tensor of shape (N, U, C) + - (h, c), containing the states i for RNN layers with shape (num_layers, N, C). + """ + embed = self.embedding(y) + embed = self.embed_dropout(embed) + out, (h, c) = self.rnn(embed, states) + if self.output_proj: + out = self.output_proj(out) + + return out, (h, c) + + def change_config( + self, + override_dropouts=False, + embed_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + ): + logging.info("changing decoder config") + + if override_dropouts: + logging.info("overriding decoder dropouts") + self.rnn_dropout_rate = rnn_dropout_rate + self.rnn.p = self.rnn_dropout_rate + self.embed_dropout_rate = embed_dropout_rate + self.embed_dropout = nn.Dropout(self.embed_dropout_rate) + + +class TransducerConvPredictor(nn.Module): + """ RNN-T prediction network based on Convolutions + Implmentation based on: + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7/decoder.py + + Attributes: + vocab_size: Number of tokens of the modeling unit including blank. + embed_dim: Dimension of the input embedding. + blank_id: The ID of the blank symbol. + out_feats: Output dimension of the predictor. + embed_dropout_rate: Dropout rate for the embedding layer. 
+ """ + + def __init__( + self, + vocab_size: int, + embed_dim: int, + out_feats: Optional[int] = None, + context_size: int = 2, + embed_dropout_rate: float = 0.0, + hid_act: str = "relu", + blank_id: int = 0, + ): + super().__init__() + self.embedding = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=embed_dim, + padding_idx=blank_id, + ) + self.embed_dropout = nn.Dropout(embed_dropout_rate) + assert context_size >= 1, context_size + if context_size > 1: + self.conv = nn.Conv1d( + in_channels=embed_dim, + out_channels=embed_dim, + kernel_size=context_size, + padding=0, + groups=out_feats // 4, + bias=False, + ) + + self.blank_id = blank_id + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.embed_dropout_rate = embed_dropout_rate + self.context_size = context_size + self.hid_act = AF.create(hid_act) + + if out_feats is None: + out_feats = embed_dim + + self.out_feats = out_feats + if out_feats != embed_dim: + self.output_proj = nn.Linear(embed_dim, out_feats) + else: + self.output_proj = None + + def get_config(self): + hid_act = AF.get_config(self.hid_act) + config = { + "pred_type": "conv", + "vocab_size": self.vocab_size, + "embed_dim": self.embed_dim, + "out_feats": self.out_feats, + "context_size": self.context_size, + "embed_dropout_rate": self.embed_dropout_rate, + "blank_id": self.blank_id, + "hid_act": hid_act, + } + return config + + def forward( + self, + y: torch.Tensor, + states: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor]]: + """ + Args: + y: + A 2-D tensor of shape (N, U). + # need_pad: + # True to left pad the input. Should be True during training. + # False to not pad the input. Should be False during inference. + Returns: + Return a tensor of shape (N, U, decoder_dim). + """ + y = y.to(torch.int64) + embed = self.embedding(y) + if self.context_size > 1: + embed = embed.transpose(1, 2) + if states is None: + embed = nn.functional.pad(embed, + pad=(self.context_size - 1, 0)) + else: + embed = torch.cat((states[0], embed), dim=-1) + + out = self.conv(embed).transpose(1, 2) + + out = self.hid_act(out) + if self.output_proj: + out = self.output_proj(out) + + return out, (embed[:, :, -self.context_size + 1:], ) + + # # this stuff about clamp() is a temporary fix for a mismatch + # # at utterance start, we use negative ids in beam_search.py + # if torch.jit.is_tracing(): + # # This is for exporting to PNNX via ONNX + # embedding_out = self.embedding(y) + # else: + # embedding_out = self.embedding(y.clamp(min=0)) * (y >= 0).unsqueeze(-1) + # if self.context_size > 1: + # embedding_out = embedding_out.permute(0, 2, 1) + # if need_pad is True: + # embedding_out = F.pad(embedding_out, pad=(self.context_size - 1, 0)) + # else: + # # During inference time, there is no need to do extra padding + # # as we only need one output + # assert embedding_out.size(-1) == self.context_size + # embedding_out = self.conv(embedding_out) + # embedding_out = embedding_out.permute(0, 2, 1) + # embedding_out = F.relu(embedding_out) + # return embedding_out + + def change_config( + self, + override_dropouts=False, + embed_dropout_rate: float = 0.0, + ): + logging.info("changing predictor config") + + if override_dropouts: + logging.info("overriding predictor dropouts") + self.embed_dropout_rate = embed_dropout_rate + self.embed_dropout = nn.Dropout(self.embed_dropout_rate) diff --git a/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py b/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py deleted file mode 100644 index 
c841a056..00000000 --- a/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py +++ /dev/null @@ -1,54 +0,0 @@ -""" - Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import torch -import torch.nn as nn - - -class TransformerConv2dSubsampler(nn.Module): - """Convolutional 2D subsampling (to 1/4 length) Tor transformer - - Attributes: - in_feats: input feature dimension - out_feats: Transformer d_model - hid_act: activation layer object - pos_enc: positional encoder layer - time_dim: indicates which is the time dimension in the input tensor - """ - - def __init__(self, in_feats, out_feats, hid_act, pos_enc, time_dim=1): - super().__init__() - self.time_dim = time_dim - self.conv = nn.Sequential( - nn.Conv2d(1, out_feats, 3, 2, padding=(0, 1)), - hid_act, - nn.Conv2d(out_feats, out_feats, 3, 2, padding=(0, 1)), - hid_act, - ) - self.out = nn.Sequential( - nn.Linear(out_feats * (((in_feats - 1) // 2 - 1) // 2), out_feats), pos_enc - ) - - def forward(self, x, mask): - """Forward function. - - Args: - x: input tensor with size=(batch, time, num_feats) - mask: mask to indicate valid time steps for x (batch, time1, time2) - - Returns: - Tensor with output features - Tensor with subsampled mask - """ - if self.time_dim == 1: - x = x.transpose(1, 2) - - x = x.unsqueeze(1) # (b, c, f, t) - x = self.conv(x) - b, c, f, t = x.size() - x = self.out(x.contiguous().view(b, c * f, t).transpose(1, 2)) - if mask is None: - return x, None - return x, mask[:, :, :-2:2][:, :, :-2:2] diff --git a/hyperion/torch/layer_blocks/transformer_encoder_v1.py b/hyperion/torch/layer_blocks/transformer_encoder_v1.py index c8eaaa1b..cfb843b6 100644 --- a/hyperion/torch/layer_blocks/transformer_encoder_v1.py +++ b/hyperion/torch/layer_blocks/transformer_encoder_v1.py @@ -67,9 +67,9 @@ def __init__( self.self_attn = self_attn if isinstance(feed_forward, str): - self.feed_forward = self._make_ff( - feed_forward, num_feats, d_ff, ff_kernel_size, ff_act, ff_dropout_rate - ) + self.feed_forward = self._make_ff(feed_forward, num_feats, d_ff, + ff_kernel_size, ff_act, + ff_dropout_rate) else: self.feed_forward = feed_forward @@ -122,11 +122,15 @@ def _make_att( d_k, causal_pos_enc, dropout_rate, - time_dim=1, ) return ScaledDotProdAttV1( - num_feats, num_feats, num_heads, d_k, d_k, dropout_rate, time_dim=1 + num_feats, + num_feats, + num_heads, + d_k, + d_k, + dropout_rate, ) if att_type == "local-scaled-dot-prod-v1": @@ -140,7 +144,6 @@ def _make_att( context, causal_pos_enc, dropout_rate, - time_dim=1, ) return LocalScaledDotProdAttV1( @@ -151,11 +154,11 @@ def _make_att( d_k, context, dropout_rate, - time_dim=1, ) @staticmethod - def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, dropout_rate): + def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, + dropout_rate): """Creates position-wise feed forward block from ff_type string Args: @@ -171,19 +174,27 @@ def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, dropout_rat """ if ff_type == "linear": - return PositionwiseFeedForward( - num_feats, hid_feats, activation, dropout_rate, time_dim=1 - ) + return PositionwiseFeedForward(num_feats, + hid_feats, + activation, + dropout_rate, + time_dim=1) if ff_type == "conv1dx2": - return Conv1dx2( - num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1 - ) + return Conv1dx2(num_feats, + hid_feats, + kernel_size, + activation, + dropout_rate, + time_dim=1) if ff_type == "conv1d-linear": 
- return Conv1dLinear( - num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1 - ) + return Conv1dLinear(num_feats, + hid_feats, + kernel_size, + activation, + dropout_rate, + time_dim=1) def forward(self, x, pos_emb=None, mask=None): """Forward pass function diff --git a/hyperion/torch/layer_blocks/transformer_feedforward.py b/hyperion/torch/layer_blocks/transformer_feedforward.py index 900500ff..7d2e8c1b 100644 --- a/hyperion/torch/layer_blocks/transformer_feedforward.py +++ b/hyperion/torch/layer_blocks/transformer_feedforward.py @@ -40,10 +40,10 @@ def forward(self, x): x: input size=(batch, time, num_feats) Returns: - tensor size=(batch, time, num_feats) + Tensor size=(batch, time, num_feats) """ if self.time_dim != 1: - x = x.transpose(1, time_dim) + x = x.transpose(1, self.time_dim) x = self.activation(self.w_1(x)) if self.dropout_rate > 0: @@ -51,7 +51,7 @@ def forward(self, x): x = self.w_2(x) if self.time_dim != 1: - x = x.transpose(1, time_dim) + x = x.transpose(1, self.time_dim) return x @@ -73,7 +73,13 @@ class Conv1dx2(nn.Module): """ def __init__( - self, num_channels, hid_channels, kernel_size, dropout_rate=0, time_dim=-1 + self, + num_channels, + hid_channels, + kernel_size, + activation="relu6", + dropout_rate=0, + time_dim=-1, ): super().__init__() @@ -133,7 +139,13 @@ class Conv1dLinear(nn.Module): """ def __init__( - self, num_channels, hid_channels, kernel_size, dropout_rate=0, time_dim=-1 + self, + num_channels, + hid_channels, + kernel_size, + activation="relu6", + dropout_rate=0, + time_dim=-1, ): super().__init__() self.w_1 = nn.Conv1d( @@ -157,7 +169,7 @@ def forward(self, x): x: input tensors with size=(batch, time, num_channels) or size=(batch, num_channels, time). Returns: - output tensor same size as input + Output tensor same size as input """ if self.time_dim != -1: x.transpose(-1, self.time_dim) diff --git a/hyperion/torch/layer_blocks/transformer_input.py b/hyperion/torch/layer_blocks/transformer_input.py new file mode 100644 index 00000000..e55071b9 --- /dev/null +++ b/hyperion/torch/layer_blocks/transformer_input.py @@ -0,0 +1,152 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import math + +import torch +import torch.nn as nn + +from ..layers import ActivationFactory as AF + + +class TransformerConv2dSubsampler(nn.Module): + """Convolutional 2D subsampling (to 1/stride length) for transformer + + Attributes: + in_feats: input feature dimension + out_feats: Transformer d_model + hid_act: activation layer object or name + stride: total stride of the subsampler + pos_enc: positional encoder layer + time_dim: indicates which is the time dimension in the input tensor + """ + + def __init__( + self, in_feats, out_feats, hid_act, stride=4, pos_enc=None, time_dim=1 + ): + super().__init__() + self.time_dim = time_dim + hid_act = AF.create(hid_act) + self.stride = stride + if stride == 4: + stride_1 = 2 + stride_2 = 2 + hid_feats = out_feats * (((in_feats - 1) // 2 - 1) // 2) + elif stride == 2: + stride_1 = 2 + stride_2 = 1 + hid_feats = out_feats * ((in_feats - 1) // 2 - 2) + elif stride == 1: + stride_1 = 1 + stride_2 = 1 + hid_feats = out_feats * (in_feats - 4) + else: + raise NotImplementedError( + f"Valid TransformerConv2dSubsampler stride==1,2,4 != {stride}" + ) + + self.conv = nn.Sequential( + nn.Conv2d(1, out_feats, 3, stride_1, padding=(0, 1)), + hid_act, + nn.Conv2d(out_feats, out_feats, 3, stride_2, padding=(0, 1)), + hid_act, + ) + + linear =
nn.Linear(hid_feats, out_feats) + if pos_enc is None: + self.out = linear + else: + self.out = nn.Sequential(linear, pos_enc) + + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with size=(batch, time, in_feats) + x_mask: mask to indicate valid time steps for x (batch, time1, time2) + + Returns: + Tensor with output features with shape = (batch, time//stride, out_feats) + Tensor with the mask subsampled by stride. + """ + if self.time_dim == 1: + x = x.transpose(1, 2) + + x = x.unsqueeze(1) # (b, c, f, t) + x = self.conv(x) + b, c, f, t = x.size() + x = self.out(x.contiguous().view(b, c * f, t).transpose(1, 2)) + if x_mask is None: + return x, None + + return x, x_mask[..., :: self.stride] + + +class TransformerConv1dSubsampler(nn.Module): + """Convolutional 1D subsampling (to 1/stride length) for transformer + + Attributes: + in_feats: input feature dimension + out_feats: Transformer d_model + hid_act: activation layer object or name + stride: total stride of the subsampler + pos_enc: positional encoder layer + time_dim: indicates which is the time dimension in the input tensor + """ + + def __init__( + self, in_feats, out_feats, hid_act, stride=4, pos_enc=None, time_dim=1 + ): + super().__init__() + self.time_dim = time_dim + hid_act = AF.create(hid_act) + self.stride = stride + if stride == 4: + stride_1 = 2 + stride_2 = 2 + elif stride == 2: + stride_1 = 2 + stride_2 = 1 + elif stride == 1: + stride_1 = 1 + stride_2 = 1 + else: + raise NotImplementedError( + f"Valid TransformerConv1dSubsampler stride==1,2,4 != {stride}" + ) + + self.conv = nn.Sequential( + nn.Conv1d(in_feats, out_feats, 3, stride_1, padding=1), + hid_act, + nn.Conv1d(out_feats, out_feats, 3, stride_2, padding=1), + hid_act, + ) + + linear = nn.Linear(out_feats, out_feats) + if pos_enc is None: + self.out = linear + else: + self.out = nn.Sequential(linear, pos_enc) + + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with size=(batch, time, in_feats) + x_mask: mask to indicate valid time steps for x (batch, time1, time2) + + Returns: + Tensor with output features with shape = (batch, time//stride, out_feats) + Tensor with the mask subsampled by stride.
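+ + Example (illustrative sketch with hypothetical sizes): with in_feats=80 + and stride=4, an input x of shape (8, 100, 80) produces an output of + shape (8, 25, out_feats), and the mask is subsampled as x_mask[:, :, ::4].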
+ """ + if self.time_dim == 1: + x = x.transpose(1, 2) + + x = self.conv(x) + x = self.out(x.transpose(1, 2)) + if x_mask is None: + return x, None + + return x, x_mask[:, :, :: self.stride] diff --git a/hyperion/torch/layers/__init__.py b/hyperion/torch/layers/__init__.py index 45ce75f8..d53646ed 100644 --- a/hyperion/torch/layers/__init__.py +++ b/hyperion/torch/layers/__init__.py @@ -3,29 +3,37 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .dropout import Dropout1d, DropConnect2d, DropConnect1d -from .global_pool import * - from .activation_factory import ActivationFactory -from .norm_layer_factory import NormLayer2dFactory, NormLayer1dFactory -from .pool_factory import GlobalPool1dFactory - -from .margin_losses import CosLossOutput, ArcLossOutput, SubCenterArcLossOutput - -from .audio_feats import * -from .audio_feats_factory import AudioFeatsFactory -from .spec_augment import AxisMasker, SpecWarper, SpecAugment -from .mvn import MeanVarianceNorm - from .attention import ( - ScaledDotProdAttV1, + LocalScaledDotProdAttRelPosEncV1, LocalScaledDotProdAttV1, ScaledDotProdAttRelPosEncV1, - LocalScaledDotProdAttRelPosEncV1, + ScaledDotProdAttV1, ) -from .pos_encoder import PosEncoder, RelPosEncoder, NoPosEncoder - -from .subpixel_convs import SubPixelConv1d, SubPixelConv2d, ICNR1d, ICNR2d -from .interpolate import Interpolate - +from .audio_feats import * +from .audio_feats_factory import AudioFeatsFactory from .calibrators import LinBinCalibrator +from .dropout import DropConnect1d, DropConnect2d, Dropout1d +from .feat_fuser_factory import FeatFuserFactory +from .feat_fusers import ( + CatFeatFuser, + LastFeatFuser, + LinearFeatFuser, + WeightedAvgFeatFuser, +) +from .global_pool import * +from .interpolate import Interpolate +from .lora import LoRAFactory +from .margin_losses import ArcLossOutput, CosLossOutput, SubCenterArcLossOutput +from .mvn import MeanVarianceNorm +from .norm_layer_factory import NormLayer1dFactory, NormLayer2dFactory +from .pool_factory import GlobalPool1dFactory +from .pos_encoder import ( + ConvPosEncoder, + NoPosEncoder, + PosEncoder, + PosEncoderBase, + RelPosEncoder, +) +from .spec_augment import AxisMasker, SpecAugment, SpecWarper +from .subpixel_convs import ICNR1d, ICNR2d, SubPixelConv1d, SubPixelConv2d diff --git a/hyperion/torch/layers/activation_factory.py b/hyperion/torch/layers/activation_factory.py index 7bc09827..e656eff5 100644 --- a/hyperion/torch/layers/activation_factory.py +++ b/hyperion/torch/layers/activation_factory.py @@ -5,7 +5,8 @@ # import torch.nn as nn -from .swish import Swish + +from .swish import DoubleSwish, DoubleSwish6, Swish, Swish6 act_dict = { "elu": nn.ELU, @@ -32,6 +33,10 @@ "logsoftmax": nn.LogSoftmax, "alogsoftmax": nn.AdaptiveLogSoftmaxWithLoss, "swish": Swish, + "double_swish": DoubleSwish, + "swish6": Swish6, + "double_swish6": DoubleSwish6, + "gelu": nn.GELU, } @@ -41,12 +46,13 @@ def create(activation, **kwargs): """Creates a non-linear activation object Args: - activation: str with activation type, - dictionary with name field indicating the activation type, and extra activation arguments + activation: String with activation type, + dictionary with name field indicating the activation type, + and extra activation arguments None, then it returns None, Activation constructor - **kwargs: extra arguments for activation constructor + **kwargs: Extra arguments for activation constructor Return: Non-linear activation object @@ -88,7 +94,6 @@ def create_from_str(activation_name, **kwargs): except: # 
activation didn't have inplace option del kwargs["inplace"] - pass return act_dict[activation_name](**kwargs) @@ -184,3 +189,12 @@ def get_config(activation): } if isinstance(activation, Swish): return {"name": "swish"} + if isinstance(activation, DoubleSwish): + return {"name": "double_swish"} + if isinstance(activation, Swish6): + return {"name": "swish6"} + if isinstance(activation, DoubleSwish6): + return {"name": "double_swish6"} + + if isinstance(activation, nn.GELU): + return {"name": "gelu"} diff --git a/hyperion/torch/layers/attention.py b/hyperion/torch/layers/attention.py index 7b4f5c06..3e53cec9 100644 --- a/hyperion/torch/layers/attention.py +++ b/hyperion/torch/layers/attention.py @@ -20,20 +20,23 @@ class ScaledDotProdAttV1(nn.Module): d_k: key/query projection dimension d_v: value projection dimension dropout_rate: dropout rate - time_dim: time dimension in the input, default=1 meaning input - dimensions are (batch, time, in_feats) """ def __init__( - self, in_feats, out_feats, num_heads, d_k, d_v, dropout_rate=0, time_dim=1 + self, + in_feats, + out_feats, + num_heads, + d_k, + d_v, + dropout_rate=0, ): super().__init__() - # We assume d_v always equals d_k + # We assume d_q always equals d_k self.d_v = d_v self.d_k = d_k self.num_heads = num_heads self.dropout_rate = dropout_rate - self.time_dim = time_dim self.linear_q = nn.Linear(in_feats, num_heads * d_k) self.linear_k = nn.Linear(in_feats, num_heads * d_k) self.linear_v = nn.Linear(in_feats, num_heads * d_v) @@ -54,7 +57,7 @@ def __repr__(self): return self.__str__() def __str__(self): - s = "{}(in_feats={}, out_feats={}, num_heads={}, d_k={}, d_v={}, dropout_rate={}, time_dim={})".format( + s = "{}(in_feats={}, out_feats={}, num_heads={}, d_k={}, d_v={}, dropout_rate={})".format( self.__class__.__name__, self.in_feats, self.out_feats, @@ -62,17 +65,11 @@ def __str__(self): self.d_k, self.d_v, self.dropout_rate, - self.time_dim, ) return s def _compute_qkv(self, query, key, value): batch_size = value.size(0) - if self.time_dim != 1: - query = query.transpose(1, self.time_dim) - key = key.transpose(1, self.time_dim) - value = value.transpose(1, self.time_dim) - q = self.linear_q(query).view(batch_size, -1, self.num_heads, self.d_k) k = self.linear_k(key).view(batch_size, -1, self.num_heads, self.d_k) v = self.linear_v(value).view(batch_size, -1, self.num_heads, self.d_v) @@ -85,8 +82,7 @@ def _compute_qkv(self, query, key, value): def _compute_softmax(self, scores, mask): if mask is not None: mask = mask.unsqueeze(1).eq( - 0 - ) # (batch, 1, time1, time2) or (batch, 1, time) + 0) # (batch, 1, time1, time2) or (batch, 1, time) if scores.dtype == torch.half: min_value = -65504 else: @@ -95,14 +91,14 @@ def _compute_softmax(self, scores, mask): if mask.dim() == 4: scores = scores.masked_fill(mask, min_value) return torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0 - ) # (batch, head, time1, time2) + mask, 0.0) # (batch, head, time1, time2) else: - mask1 = mask.unsqueze(2) + mask1 = mask.unsqueeze(2) mask2 = mask.unsqueeze(-1) scores = scores.masked_fill(mask1, min_value) scores = scores.masked_fill(mask2, min_value) - return torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + return torch.softmax(scores, + dim=-1) # (batch, head, time1, time2) return torch.softmax(scores, dim=-1) # (batch, head, time1, time2) @@ -114,15 +110,13 @@ def _apply_attn(self, v): p_attn = self.attn x = torch.matmul(p_attn, v) # (batch, head, time1, d_k) - x = ( - x.transpose(1, 2) - .contiguous() - .view(batch_size, -1, 
self.num_heads * self.d_v) - ) # (batch, time1, d_model) + x = (x.transpose(1, 2).contiguous().view(batch_size, -1, + self.num_heads * self.d_v) + ) # (batch, time1, d_model) return self.linear_out(x) # (batch, time1, d_model) - ___compute_softmax = _compute_softmax - ___apply_attn = _apply_attn + _base_compute_softmax = _compute_softmax + _base_apply_attn = _apply_attn def forward(self, query, key, value, mask=None): """Computes 'Scaled Dot Product Attention'. @@ -141,10 +135,9 @@ def forward(self, query, key, value, mask=None): """ q, k, v = self._compute_qkv(query, key, value) scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt( - self.d_k - ) # (batch, head, time1, time2) - self.attn = self.___compute_softmax(scores, mask) - return self.___apply_attn(v) + self.d_k) # (batch, head, time1, time2) + self.attn = self._base_compute_softmax(scores, mask) + return self._base_apply_attn(v) class LocalScaledDotProdAttV1(ScaledDotProdAttV1): @@ -160,8 +153,6 @@ class LocalScaledDotProdAttV1(ScaledDotProdAttV1): d_v: value projection dimension context: maximum attention temporal context. dropout_rate: dropout rate - time_dim: time dimension in the input, default=1 meaning input - dimensions are (batch, time, in_feats) """ def __init__( @@ -173,85 +164,39 @@ def __init__( d_v, context=25, dropout_rate=0, - time_dim=1, ): """Construct an MultiHeadedAttention object.""" - super().__init__( - in_feats, out_feats, num_heads, d_k, d_v, dropout_rate, time_dim - ) + super().__init__(in_feats, out_feats, num_heads, d_k, d_v, + dropout_rate) self.context = context def __repr__(self): return self.__str__() def __str__(self): - s = ( - "{}(in_feats={}, out_feats={}, num_heads={}, d_k={}, d_v={}, " - "context={}, dropout_rate={}, time_dim={})".format( - self.__class__.__name__, - self.in_feats, - self.out_feats, - self.num_heads, - self.d_k, - self.d_v, - self.context, - self.dropout_rate, - self.time_dim, - ) - ) + s = ("{}(in_feats={}, out_feats={}, num_heads={}, d_k={}, d_v={}, " + "context={}, dropout_rate={})".format( + self.__class__.__name__, + self.in_feats, + self.out_feats, + self.num_heads, + self.d_k, + self.d_v, + self.context, + self.dropout_rate, + )) return s - def _compute_qkv00(self, query, key, value): - batch_size = query.size(0) - t1 = query.size(self.time_dim) - t2 = key.size(self.time_dim) - if self.time_dim != 1: - query = query.transpose(1, self.time_dim) - key = key.transpose(1, self.time_dim) - value = value.transpose(1, self.time_dim) - - context_k = self.context - num_blocks = math.ceil(t2 / context_k) # (t2 + context_k//2)//context_k - context_q = math.ceil(t1 / num_blocks) - num_blocks_q = math.ceil(t1 / context_q) # (t1 + context_q//2)//context_q - assert ( - num_blocks == num_blocks_q - ), "num_blocks_k({})!=num_blocks_q({}), context_k={}, context_q={}, t1={}, t2={}".format( - num_blocks, num_blocks_q, context_k, context_q, t1, t2 - ) - pad1 = context_q * num_blocks - t1 - pad2 = context_k * num_blocks - t2 - # print('1',query.shape,key.shape,value.shape,pad1,pad2, context_q, context_k) - if pad1 > 0: - query = nn.functional.pad(query, (0, 0, 0, pad1)) - - if pad2 > 0: - key = nn.functional.pad(key, (0, 0, 0, pad2)) - value = nn.functional.pad(value, (0, 0, 0, pad2)) - - # print('2',query.shape,key.shape,value.shape) - q0 = self.linear_q(query) # (batch, time1, head*d_k) - k0 = self.linear_k(key) # (batch, time2, head*d_k) - v0 = self.linear_v(value) # (batch, time2, head*d_v) - - return q0, k0, v0, context_q, context_k, num_blocks - def _compute_qkv0(self, query, key, 
value): batch_size = query.size(0) - t1 = query.size(self.time_dim) - t2 = key.size(self.time_dim) - if self.time_dim != 1: - query = query.transpose(1, self.time_dim) - key = key.transpose(1, self.time_dim) - value = value.transpose(1, self.time_dim) - - num_blocks = round(t2 / self.context) - # print(num_blocks, t2, self.context) + t1 = query.size(1) + t2 = key.size(1) + + num_blocks = max(1, round(t2 / self.context)) context_k = math.ceil(t2 / num_blocks) context_q = math.ceil(t1 / num_blocks) pad1 = context_q * num_blocks - t1 pad2 = context_k * num_blocks - t2 - # print('1',query.shape,key.shape,value.shape,pad1,pad2, context_q, context_k) if pad1 > 0: query = nn.functional.pad(query, (0, 0, 0, pad1)) @@ -259,17 +204,16 @@ def _compute_qkv0(self, query, key, value): key = nn.functional.pad(key, (0, 0, 0, pad2)) value = nn.functional.pad(value, (0, 0, 0, pad2)) - # print('2',query.shape,key.shape,value.shape) q0 = self.linear_q(query) # (batch, time1, head*d_k) k0 = self.linear_k(key) # (batch, time2, head*d_k) v0 = self.linear_v(value) # (batch, time2, head*d_v) return q0, k0, v0, context_q, context_k, num_blocks - def _compute_scores( - self, q0, k0, num_blocks, context_q, context_k, q_left_shift, k_left_shift - ): - + def _compute_scores(self, q0, k0, num_blocks, context_q, context_k, + q_left_shift, k_left_shift): + # q0 (batch, time1, head*d_k) + # k0 (batch, time2, head*d_k) batch_size = q0.size(0) if q_left_shift > 0: # we are computing the shifted block-diag score matrix @@ -278,22 +222,14 @@ def _compute_scores( q0 = q0[:, q_left_shift:-q_right_shift] k0 = k0[:, k_left_shift:-k_right_shift] - q = ( - q0.view(batch_size, -1, self.num_heads, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) - ) + q = (q0.view(batch_size, -1, self.num_heads, + self.d_k).transpose(1, 2).contiguous().view( + batch_size, self.num_heads, num_blocks, -1, self.d_k)) # (batch, head, blocks, time1, d_k) - k = ( - k0.view(batch_size, -1, self.num_heads, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) - ) + k = (k0.view(batch_size, -1, self.num_heads, + self.d_k).transpose(1, 2).contiguous().view( + batch_size, self.num_heads, num_blocks, -1, self.d_k)) # (batch, head, blocks time2, d_k) - # print('4',q.shape,k.shape) - return torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) @staticmethod @@ -331,7 +267,7 @@ def _softmax(scores1, scores2, shift1, shift2, t1, t2): context2 = scores1.size(4) # set elements in scores2 that overlap with elements in scores1 to -inf - scores2[:, :, :, : context1 - shift1, : context2 - shift2] = min_val + scores2[:, :, :, :context1 - shift1, :context2 - shift2] = min_val scores2[:, :, :, shift1:, shift2:] = min_val # set the padding time steps that we had to add to make integer block-number to -inf @@ -371,9 +307,9 @@ def _softmax(scores1, scores2, shift1, shift2, t1, t2): scores2 = scores2.view(batch_size, num_heads, -1, context2) # print('aa', scores1.shape, scores2.shape) # pad scores2 to have the same size as scores1 - scores2 = nn.functional.pad( - scores2, (0, 0, shift1, context1 - shift1), mode="constant", value=min_val - ) + scores2 = nn.functional.pad(scores2, (0, 0, shift1, context1 - shift1), + mode="constant", + value=min_val) # print('bb', scores1.shape, scores2.shape) # concat scores1, scores2 and do softmax in time2 dimension # (batch, heads, blocks*time1, 2*time2) @@ -381,17 +317,13 @@ def _softmax(scores1, scores2, shift1, shift2, t1, t2): 
# now we separate back probs into probs1, and probs2 # probs1 - probs1 = ( - probs[:, :, :, :context2] - .contiguous() - .view(batch_size, num_heads, num_blocks, -1, context2) - ) + probs1 = (probs[:, :, :, :context2].contiguous().view( + batch_size, num_heads, num_blocks, -1, context2)) # probs2 - probs2 = ( - probs[:, :, shift1 : -(context1 - shift1), context2:] - .contiguous() - .view(batch_size, num_heads, num_blocks - 1, -1, context2) - ) + probs2 = (probs[:, :, shift1:-(context1 - shift1), + context2:].contiguous().view(batch_size, num_heads, + num_blocks - 1, -1, + context2)) return probs1, probs2 @@ -406,9 +338,9 @@ def _mask_scores_1d(self, scores, mask, shift1, shift2): context1 = scores.size(3) context2 = scores.size(4) mask_blocks = torch.ones_like(scores, dtype=mask.dtype) - mask_single_block = torch.zeros( - (batch_size, context1, context2), dtype=mask.dtype - ) + mask_single_block = torch.zeros((batch_size, context1, context2), + dtype=mask.dtype, + device=mask.device) t1_start = shift1 t2_start = shift2 @@ -416,9 +348,11 @@ t1_end = t1_start + context1 t2_end = t2_start + context2 mask_single_block.fill_(False) - mask_single_block.masked_fill_(mask[:, 0, t1_start:t1_end], True) - mask_single_block.masked_fill_(mask[:, :, t2_start:t2_end], True) - mask_blocks[:, block] = mask_single_block + mask_single_block.masked_fill_(mask[:, t1_start:t1_end, None], + True) + mask_single_block.masked_fill_(mask[:, None, t2_start:t2_end], + True) + mask_blocks[:, :, block] = mask_single_block.unsqueeze(1) t1_start += context1 t2_start += context2 @@ -437,23 +371,24 @@ def _mask_scores_2d(self, scores, mask, shift1, shift2): mask_blocks = torch.ones_like(scores, dtype=mask.dtype) t1_start = shift1 t2_start = shift2 + mask = mask.unsqueeze(1) for block in range(num_blocks): t1_end = min(t1_start + context1, mask.size(1)) t2_end = min(t2_start + context2, mask.size(2)) - mask_blocks[:, block, : (t1_end - t1_start), : (t2_end - t2_start)] = mask[ - :, t1_start:t1_end, t2_start:t2_end - ] + mask_blocks[:, :, block, :(t1_end - t1_start), :( + t2_end - t2_start)] = mask[:, :, t1_start:t1_end, + t2_start:t2_end] t1_start += context1 t2_start += context2 return scores.masked_fill(mask_blocks, min_value) - def _compute_softmax( - self, scores1, scores2, mask, q_left_shift, k_left_shift, t1, t2 - ): + def _compute_softmax(self, scores1, scores2, mask, q_left_shift, + k_left_shift, t1, t2): + if mask is not None: # put to -inf scores in points where mask==0 - if mask.dim() == 4: + if mask.dim() == 3: # case when mask is 2d matrix per batch element mask = mask.eq(0) # (batch, time1, time2) @@ -461,27 +396,27 @@ scores1 = self._mask_scores_2d(scores1, mask, 0, 0) # second, we mask shifted block diagonal blocks - scores2 = self._mask_scores_2d( - scores2, mask, q_left_shift, k_left_shift - ) + scores2 = self._mask_scores_2d(scores2, mask, q_left_shift, + k_left_shift) - else: + elif mask.dim() == 2: # case when mask is 1d vector per batch element, # meaning that time1 and time2 are the same, so mask is symmetric + pad2 = scores1.size(2) * scores1.size(3) - mask.size(-1) mask = nn.functional.pad(mask, (0, pad2)) - mask = mask.squeeze(1).eq(0) # (batch, 1, time) + mask = mask.eq(0) # (batch, time) # first, we mask block diagonal blocks scores1 = self._mask_scores_1d(scores1, mask, 0, 0) # second, we mask shifted block diagonal blocks - scores2 = self._mask_scores_1d( - scores2, mask, q_left_shift, k_left_shift - ) + scores2 =
self._mask_scores_1d(scores2, mask, q_left_shift, + k_left_shift) + else: + raise ValueError() - self.attn1, self.attn2 = self._softmax( - scores1, scores2, q_left_shift, k_left_shift, t1, t2 - ) + self.attn1, self.attn2 = self._softmax(scores1, scores2, q_left_shift, + k_left_shift, t1, t2) def _apply_attn(self, v0, t1): if self.dropout_rate > 0: @@ -500,51 +435,43 @@ def _apply_attn(self, v0, t1): q_right_shift = context_q - q_left_shift k_right_shift = context_k - k_left_shift - v = ( - v0.view(batch_size, -1, self.num_heads, self.d_v) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) - ) + v = (v0.view(batch_size, -1, self.num_heads, + self.d_v).transpose(1, 2).contiguous().view( + batch_size, self.num_heads, num_blocks, -1, self.d_k)) # (batch, heads, blocks, time2, d_v) # print('8',p_attn1.shape,p_attn2.shape, v.shape) # (batch, head, blocks, time1, time2) x (batch, head, blocks, time2, d_v) x = torch.matmul(p_attn1, v) # (batch, heads, blocks, time1, d_k) # print('9',x.shape) - x = ( - x.view(batch_size, self.num_heads, -1, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, -1, self.num_heads * self.d_v) - ) + x = (x.view(batch_size, self.num_heads, -1, + self.d_k).transpose(1, 2).contiguous().view( + batch_size, -1, self.num_heads * self.d_v)) # (batch, time1, d_model) # print('10',x.shape) - v = ( - v0[:, k_left_shift:-k_right_shift] - .view(batch_size, -1, self.num_heads, self.d_v) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks - 1, -1, self.d_v) - ) + v = (v0[:, k_left_shift:-k_right_shift].view( + batch_size, -1, self.num_heads, + self.d_v).transpose(1, + 2).contiguous().view(batch_size, + self.num_heads, + num_blocks - 1, -1, + self.d_v)) # (batch, blocks-1, head, time2, d_v) # print('11',p_attn1.shape,p_attn2.shape, v.shape) # (batch, blocks-1, head, time1, time2) x (batch, blocks-1, head, time2, d_v) x2 = torch.matmul(p_attn2, v) # (batch, heads, blocks-1, time1, d_k) # print('12',x2.shape) - x2 = ( - x2.view(batch_size, self.num_heads, -1, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, -1, self.num_heads * self.d_v) - ) + x2 = (x2.view(batch_size, self.num_heads, -1, self.d_k).transpose( + 1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v)) # (batch, time1, d_model) # print('12',x2.shape) - x[:, q_left_shift:-q_right_shift:] = x[:, q_left_shift:-q_right_shift:] + x2 + x[:, + q_left_shift:-q_right_shift:] = x[:, + q_left_shift:-q_right_shift:] + x2 x = x[:, :t1] return self.linear_out(x) # (batch, time1, d_model) - def forward1(self, query, key, value, mask): + def forward(self, query, key, value, mask): """Computes 'Local Scaled Dot Product Attention'. 
Args: @@ -560,170 +487,91 @@ def forward1(self, query, key, value, mask): Attention weigthed average of the values with size=(batch, time1, out_feats) """ batch_size = query.size(0) - t1 = query.size(self.time_dim) - t2 = key.size(self.time_dim) - if t2 <= self.context: + t1 = query.size(1) + t2 = key.size(1) + if t2 <= 2 * self.context: return super().forward(query, key, value, mask) q0, k0, v0, context_q, context_k, num_blocks = self._compute_qkv0( - query, key, value - ) - # q0 size=(batch, time1, head * d_k) - # k0 size=(batch, time2, head * d_k) - # v0 size=(batch, time2, head * d_v) + query, key, value) + # q0 size=(batch, time1, head*d_k) + # k0 size=(batch, time2, head*d_k) + # v0 size=(batch, time2, head*d_v) # compute block diagonal affinity matrix - # # print('3',q0.shape,k0.shape,v0.shape) - # q = q0.view( - # batch_size, -1, self.num_heads, self.d_k).transpose( - # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks, -1, self.d_k) - # # (batch, head, blocks, time1, d_k) - # k = k0.view( - # batch_size, -1, self.num_heads, self.d_k).transpose( - # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks, -1, self.d_k) - # # (batch, head, blocks time2, d_k) - # # print('4',q.shape,k.shape) - - # scores1 = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - scores1 = self._compute_scores(q0, k0, num_blocks, context_q, context_k, 0, 0) + scores1 = self._compute_scores(q0, k0, num_blocks, context_q, + context_k, 0, 0) # (batch, head, blocks context_q, context_k) - # print('5',scores1.shape) # compute shifted block diagonal affinity matrix q_left_shift = context_q // 2 k_left_shift = context_k // 2 - # q_right_shift = context_q - q_left_shift - # k_right_shift = context_k - k_left_shift - # q = q0[:,q_left_shift:-q_right_shift].view( - # batch_size, -1, self.num_heads, self.d_k).transpose( - # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks-1, -1, self.d_k) - # # (batch, blocks-1, head, time1, d_k) - # k = k0[:,k_left_shift:-k_right_shift].view( - # batch_size, -1, self.num_heads, self.d_k).transpose( - # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks-1, -1, self.d_k) - # # (batch, blocks-1, head, d_k) - # # print('6',q.shape,k.shape) - - # scores2 = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - scores2 = self._compute_scores( - q0, k0, num_blocks - 1, context_q, context_k, q_left_shift, k_left_shift - ) + scores2 = self._compute_scores(q0, k0, num_blocks - 1, context_q, + context_k, q_left_shift, k_left_shift) # (batch, head, blocks-1 context_q, context_k) - # print('7',scores2.shape) # combine both block diagonal affinity matrix to do the softmax - # if mask is not None: - # # put to -inf scores in points where mask==0 - # if mask.dim() == 4: - # # case when mask is 2d matrix per batch element - # mask = mask.eq(0) # (batch, time1, time2) - - # # first, we mask block diagonal blocks - # scores1 = self._mask_scores_2d(scores1, mask, 0, 0) - - # # second, we mask shifted block diagonal blocks - # scores2 = self._mask_scores_2d(scores2, mask, q_left_shift, k_left_shift) + self._compute_softmax(scores1, scores2, mask, q_left_shift, + k_left_shift, t1, t2) + return self._apply_attn(v0, t1) - # else: - # # case when mask is 1d vector per batch element, - # # meaning that time1 and time2 are the same, so mask is symmetric - # mask = nn.functional.pad(mask, (0, pad2)) - # mask = mask.squeeze(1).eq(0) # (batch, 1, time) - # # first, we mask block diagonal blocks - # scores1 = 
self._mask_scores_1d(scores1, mask, 0, 0) +class BlockScaledDotProdAttV1(ScaledDotProdAttV1): + """Block Scaled dot product multihead attention layer + It calculates self-attention with block diagonal mask - # # second, we mask shifted block diagonal blocks - # scores2 = self._mask_scores_1d(scores2, mask, q_left_shift, k_left_shift) + Attributes: + in_feats: input feature dimension + out_feats: output feature dimension + num_heads: number of heads + d_k: key/query projection dimension + d_v: value projection dimension + context: maximum attention temporal context. + dropout_rate: dropout rate + """ - # self.attn1, self.attn2 = self._softmax( - # scores1, scores2, q_left_shift, k_left_shift, t1, t2) + def __init__( + self, + in_feats, + out_feats, + num_heads, + d_k, + d_v, + context=25, + dropout_rate=0, + ): + """Construct an MultiHeadedAttention object.""" + super().__init__(in_feats, out_feats, num_heads, d_k, d_v, + dropout_rate) + self.context = context - self._compute_softmax( - scores1, scores2, mask, q_left_shift, k_left_shift, t1, t2 - ) - return self._apply_attn(v0, t1) + def __repr__(self): + return self.__str__() - # if self.dropout_rate > 0: - # p_attn1 = self.dropout(self.attn1) - # p_attn2 = self.dropout(self.attn2) - # else: - # p_attn1 = self.attn1 - # p_attn2 = self.attn2 - - # v = v0.view( - # batch_size, -1, self.num_heads, self.d_v).transpose( - # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks, -1, self.d_k) - # # (batch, heads, blocks, time2, d_v) - # # print('8',p_attn1.shape,p_attn2.shape, v.shape) - # # (batch, blocks, head, time1, time2) x (batch, blocks, head, time2, d_v) - # x = torch.matmul(p_attn1, v) # (batch, heads, blocks, time1, d_k) - # # print('9',x.shape) - # x = x.view(batch_size, self.num_heads, -1, self.d_k).transpose( - # 1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v) - # # (batch, time1, d_model) - # # print('10',x.shape) - - # v = v0[:,k_left_shift:-k_right_shift].view( - # batch_size, -1, self.num_heads, self.d_v).transpose( - # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks-1, -1, self.d_v) - # # (batch, blocks-1, head, time2, d_v) - # # print('11',p_attn1.shape,p_attn2.shape, v.shape) - # # (batch, blocks-1, head, time1, time2) x (batch, blocks-1, head, time2, d_v) - # x2 = torch.matmul(p_attn2, v) # (batch, heads, blocks-1, time1, d_k) - # # print('12',x2.shape) - # x2 = x2.view(batch_size, self.num_heads, -1, self.d_k).transpose( - # 1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v) - # # (batch, time1, d_model) - # # print('12',x2.shape) - # x[:,q_left_shift:-q_right_shift:] = x[:,q_left_shift:-q_right_shift:] + x2 - # x = x[:,:t1] - # return self.linear_out(x) # (batch, time1, d_model) - - def forward2(self, query, key, value, mask): - """Computes 'Local Scaled Dot Product Attention'. + def __str__(self): + s = ("{}(in_feats={}, out_feats={}, num_heads={}, d_k={}, d_v={}, " + "context={}, dropout_rate={})".format( + self.__class__.__name__, + self.in_feats, + self.out_feats, + self.num_heads, + self.d_k, + self.d_v, + self.context, + self.dropout_rate, + )) + return s - Args: - query: query with size=(batch, time1, in_feats), - where time1 is the output time dimension - key: key with size=(batch, time2, in_feats) - where time1 is the input time dimension - value: value with size=(batch, time2, in_feats) - mask: optional mask with size=(batch, time1, time2), - to zero attention between some time steps. 
- or (batch, time) if time1=time2 - Returns: - Attention weigthed average of the values with size=(batch, time1, out_feats) - """ + def _compute_qkv0(self, query, key, value): batch_size = query.size(0) - t1 = query.size(self.time_dim) - t2 = key.size(self.time_dim) - if t2 <= self.context: - return super().forward(query, key, value, mask) - - if self.time_dim != 1: - query = query.transpose(1, self.time_dim) - key = key.transpose(1, self.time_dim) - value = value.transpose(1, self.time_dim) + t1 = query.size(1) + t2 = key.size(1) - context_k = self.context - num_blocks = math.ceil(t2 / context_k) # (t2 + context_k//2)//context_k + num_blocks = max(1, t2 // self.context) + context_k = math.ceil(t2 / num_blocks) context_q = math.ceil(t1 / num_blocks) - num_blocks_q = math.ceil(t1 / context_q) # (t1 + context_q//2)//context_q - assert ( - num_blocks == num_blocks_q - ), "num_blocks_k({})!=num_blocks_q({}), context_k={}, context_q={}, t1={}, t2={}".format( - num_blocks, num_blocks_q, context_k, context_q, t1, t2 - ) pad1 = context_q * num_blocks - t1 pad2 = context_k * num_blocks - t2 - # print('1',query.shape,key.shape,value.shape,pad1,pad2, context_q, context_k) if pad1 > 0: query = nn.functional.pad(query, (0, 0, 0, pad1)) @@ -731,152 +579,185 @@ def forward2(self, query, key, value, mask): key = nn.functional.pad(key, (0, 0, 0, pad2)) value = nn.functional.pad(value, (0, 0, 0, pad2)) - # print('2',query.shape,key.shape,value.shape) q0 = self.linear_q(query) # (batch, time1, head*d_k) k0 = self.linear_k(key) # (batch, time2, head*d_k) v0 = self.linear_v(value) # (batch, time2, head*d_v) - # # q0, k0, v0, context_q, context_k, num_blocks = self._compute_qkv0( - # # query, key, value) - # # # q0 size=(batch, time1, head*d_k) - # # # k0 size=(batch, time2, head*d_k) - # # # v0 size=(batch, time2, head*d_v) + return q0, k0, v0, context_q, context_k, num_blocks - # compute block diagonal affinity matrix - # # print('3',q0.shape,k0.shape,v0.shape) - q = ( - q0.view(batch_size, -1, self.num_heads, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) - ) + def _compute_scores(self, q0, k0, num_blocks, context_q, context_k): + # q0 (batch, time1, head*d_k) + # k0 (batch, time2, head*d_k) + batch_size = q0.size(0) + q = (q0.view(batch_size, -1, self.num_heads, + self.d_k).transpose(1, 2).contiguous().view( + batch_size, self.num_heads, num_blocks, -1, self.d_k)) # (batch, head, blocks, time1, d_k) - k = ( - k0.view(batch_size, -1, self.num_heads, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) - ) + k = (k0.view(batch_size, -1, self.num_heads, + self.d_k).transpose(1, 2).contiguous().view( + batch_size, self.num_heads, num_blocks, -1, self.d_k)) # (batch, head, blocks time2, d_k) - # # print('4',q.shape,k.shape) - scores1 = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - # # scores1 = self._compute_scores( - # # q0, k0, num_blocks, context_q, context_k, 0, 0) - # (batch, head, blocks context_q, context_k) - # print('5',scores1.shape) + return torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - # compute shifted block diagonal affinity matrix - q_left_shift = context_q // 2 - k_left_shift = context_k // 2 - q_right_shift = context_q - q_left_shift - k_right_shift = context_k - k_left_shift - q = ( - q0[:, q_left_shift:-q_right_shift] - .view(batch_size, -1, self.num_heads, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks - 1, 
-1, self.d_k) - ) - # (batch, blocks-1, head, time1, d_k) - k = ( - k0[:, k_left_shift:-k_right_shift] - .view(batch_size, -1, self.num_heads, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks - 1, -1, self.d_k) - ) - # # (batch, blocks-1, head, d_k) - # # print('6',q.shape,k.shape) + @staticmethod + def _softmax(scores, t1, t2): + """Computes softmax for block diagonal attention maps - scores2 = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - # scores2 = self._compute_scores( - # q0, k0, num_blocks-1, context_q, context_k, - # q_left_shift, k_left_shift) - # (batch, head, blocks-1 context_q, context_k) - # print('7',scores2.shape) + Args: + scores: attention scores from block-diagonal score matrix + with size=(batch, heads, blocks, t1, t2) + t1: length of time dimension 1 (output time dimension) + t2: length of time dimension 2 (input time dimension), with self-att t1=t2. - # combine both block diagonal affinity matrix to do the softmax - # if mask is not None: - # # put to -inf scores in points where mask==0 - # if mask.dim() == 4: - # # case when mask is 2d matrix per batch element - # mask = mask.eq(0) # (batch, time1, time2) + Returns: + probs: posterior attention scores for the block-diagonal att. matrix + with size=(batch, heads, blocks, t1, t2) + + """ + if scores.dtype == torch.half: + min_val = -65504 + else: + min_val = -1e20 + + batch_size = scores.size(0) + num_heads = scores.size(1) + num_blocks = scores.size(2) + context1 = scores.size(3) + context2 = scores.size(4) + + # set to -inf the padding time steps that we had to add + # to get an integer number of blocks - # # first, we mask block diagonal blocks + dt1 = max(0, scores.size(2) * scores.size(3) - t1) + if dt1 > 0: + scores[:, :, -1, -dt1:, :] = min_val - # # second, we mask shifted block diagonal blocks - # scores2 = self._mask_scores_2d(scores2, mask, q_left_shift, k_left_shift) + dt2 = max(0, scores.size(2) * scores.size(4) - t2) + if dt2 > 0: + scores[:, :, -1, :, -dt2:] = min_val - # else: - # # case when mask is 1d vector per batch element, - # # meaning that time1 and time2 are the same, so mask is symmetric - # mask = nn.functional.pad(mask, (0, pad2)) - # mask = mask.squeeze(1).eq(0) # (batch, 1, time) + # flatten blocks and time1 dimensions + scores = scores.view(batch_size, num_heads, -1, context2) - # # first, we mask block diagonal blocks - # scores1 = self._mask_scores_1d(scores1, mask, 0, 0) + # (batch, heads, blocks*time1, time2) + probs = torch.softmax(scores, dim=-1).contiguous().view( + batch_size, num_heads, num_blocks, -1, context2) - # # second, we mask shifted block diagonal blocks - # scores2 = self._mask_scores_1d(scores2, mask, q_left_shift, k_left_shift) + return probs - self.attn1, self.attn2 = self._softmax( - scores1, scores2, q_left_shift, k_left_shift, t1, t2) + def _mask_scores_1d(self, scores, mask): + if scores.dtype == torch.half: + min_value = -65504 + else: + min_value = -1e20 + + batch_size = scores.size(0) + num_blocks = scores.size(2) + context1 = scores.size(3) + context2 = scores.size(4) + mask_blocks = torch.ones_like(scores, dtype=mask.dtype) + mask_single_block = torch.zeros((batch_size, context1, context2), + dtype=mask.dtype, + device=mask.device) + + t1_start = 0 + t2_start = 0 + for block
in range(num_blocks): + t1_end = t1_start + context1 + t2_end = t2_start + context2 + mask_single_block.fill_(False) + mask_single_block.masked_fill_(mask[:, t1_start:t1_end, None], + True) + mask_single_block.masked_fill_(mask[:, None, t2_start:t2_end], + True) + mask_blocks[:, :, block] = mask_single_block.unsqueeze(1) + t1_start += context1 + t2_start += context2 + + return scores.masked_fill(mask_blocks, min_value) + + def _mask_scores_2d(self, scores, mask): + if scores.dtype == torch.half: + min_value = -65504 + else: + min_value = -1e20 + + batch_size = scores.size(0) + num_blocks = scores.size(2) + context1 = scores.size(3) + context2 = scores.size(4) + mask_blocks = torch.ones_like(scores, dtype=mask.dtype) + t1_start = 0 + t2_start = 0 + mask = mask.unsqueeze(1) + for block in range(num_blocks): + t1_end = min(t1_start + context1, mask.size(1)) + t2_end = min(t2_start + context2, mask.size(2)) + mask_blocks[:, :, block, :(t1_end - t1_start), :( + t2_end - t2_start)] = mask[:, :, t1_start:t1_end, + t2_start:t2_end] + t1_start += context1 + t2_start += context2 + + return scores.masked_fill(mask_blocks, min_value) + + def _compute_softmax(self, scores, mask, t1, t2): + + if mask is not None: + # put to -inf scores in points where mask==0 + if mask.dim() == 3: + # case when mask is 2d matrix per batch element + mask = mask.eq(0) # (batch, time1, time2) + + # first, we mask block diagonal blocks + scores = self._mask_scores_2d(scores, mask) + + elif mask.dim() == 2: + # case when mask is 1d vector per batch element, + # meaning that time1 and time2 are the same, so mask is symmetric + pad2 = scores.size(2) * scores.size(3) - mask.size(-1) + mask = nn.functional.pad(mask, (0, pad2)) + mask = mask.eq(0) # (batch, time) + + # first, we mask block diagonal blocks + scores = self._mask_scores_1d(scores, mask) + + else: + raise ValueError() - # # self._compute_softmax(scores1, scores2, mask, - # # q_left_shift, k_left_shift, t1, t2) - # # return self._apply_attn(v0, t1) + self.attn = self._softmax(scores, t1, t2) + def _apply_attn(self, v0, t1): if self.dropout_rate > 0: - p_attn1 = self.dropout(self.attn1) - p_attn2 = self.dropout(self.attn2) + p_attn = self.dropout(self.attn) else: - p_attn1 = self.attn1 - p_attn2 = self.attn2 + p_attn = self.attn - v = ( - v0.view(batch_size, -1, self.num_heads, self.d_v) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) - ) + batch_size = p_attn.size(0) + num_blocks = p_attn.size(2) + context_q = p_attn.size(3) + context_k = p_attn.size(4) + q_left_shift = context_q // 2 + k_left_shift = context_k // 2 + q_right_shift = context_q - q_left_shift + k_right_shift = context_k - k_left_shift + + v = (v0.view(batch_size, -1, self.num_heads, + self.d_v).transpose(1, 2).contiguous().view( + batch_size, self.num_heads, num_blocks, -1, self.d_k)) # (batch, heads, blocks, time2, d_v) - # print('8',p_attn1.shape,p_attn2.shape, v.shape) - # (batch, blocks, head, time1, time2) x (batch, blocks, head, time2, d_v) - x = torch.matmul(p_attn1, v) # (batch, heads, blocks, time1, d_k) - # print('9',x.shape) - x = x.view(batch_size, self.num_heads, -1, self.d_k).transpose( - 1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v) - # (batch, time1, d_model) - # print('10',x.shape) - - v = v0[:,k_left_shift:-k_right_shift].view( - batch_size, -1, self.num_heads, self.d_v).transpose( - 1, 2).contiguous().view( - batch_size, self.num_heads, num_blocks-1, -1, self.d_v) - # (batch, blocks-1,
head, time2, d_v) - # print('11',p_attn1.shape,p_attn2.shape, v.shape) - # (batch, blocks-1, head, time1, time2) x (batch, blocks-1, head, time2, d_v) - x2 = torch.matmul(p_attn2, v) # (batch, heads, blocks-1, time1, d_k) - # print('12',x2.shape) - x2 = ( - x2.view(batch_size, self.num_heads, -1, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, -1, self.num_heads * self.d_v) - ) + # (batch, head, blocks, time1, time2) x (batch, head, blocks, time2, d_v) + x = torch.matmul(p_attn, v) # (batch, heads, blocks, time1, d_k) + x = (x.view(batch_size, self.num_heads, -1, + self.d_k).transpose(1, 2).contiguous().view( + batch_size, -1, self.num_heads * self.d_v)) # (batch, time1, d_model) - # print('12',x2.shape) - x[:, q_left_shift:-q_right_shift:] = x[:, q_left_shift:-q_right_shift:] + x2 + x = x[:, :t1] return self.linear_out(x) # (batch, time1, d_model) @@ -896,35 +777,24 @@ def forward(self, query, key, value, mask): Attention weigthed average of the values with size=(batch, time1, out_feats) """ batch_size = query.size(0) - t1 = query.size(self.time_dim) - t2 = key.size(self.time_dim) + t1 = query.size(1) + t2 = key.size(1) - if t2 <= 2 * self.context: + if t2 < 2 * self.context: return super().forward(query, key, value, mask) q0, k0, v0, context_q, context_k, num_blocks = self._compute_qkv0( - query, key, value - ) + query, key, value) # q0 size=(batch, time1, head*d_k) # k0 size=(batch, time2, head*d_k) # v0 size=(batch, time2, head*d_v) # compute block diagonal affinity matrix - scores1 = self._compute_scores(q0, k0, num_blocks, context_q, context_k, 0, 0) + scores = self._compute_scores(q0, k0, num_blocks, context_q, context_k) # (batch, head, blocks context_q, context_k) - # compute shifted block diagonal affinity matrix - q_left_shift = context_q // 2 - k_left_shift = context_k // 2 - scores2 = self._compute_scores( - q0, k0, num_blocks - 1, context_q, context_k, q_left_shift, k_left_shift - ) - # (batch, head, blocks-1 context_q, context_k) - # combine both block diagonal affinity matrix to do the softmax - self._compute_softmax( - scores1, scores2, mask, q_left_shift, k_left_shift, t1, t2 - ) + self._compute_softmax(scores, mask, t1, t2) return self._apply_attn(v0, t1) @@ -941,8 +811,6 @@ class ScaledDotProdAttRelPosEncV1(ScaledDotProdAttV1): d_v: value projection dimension causal_pos_enc: positional encoder is 0 for attending future frames. 
dropout_rate: dropout rate - time_dim: time dimension in the input, default=1 meaning input - dimensions are (batch, time, in_feats) """ def __init__( @@ -954,7 +822,6 @@ def __init__( d_v, causal_pos_enc=False, dropout_rate=0, - time_dim=1, ): super().__init__( in_feats, @@ -963,7 +830,6 @@ def __init__( d_k, d_v, dropout_rate=dropout_rate, - time_dim=time_dim, ) self.linear_pos = nn.Linear(in_feats, num_heads * d_k) @@ -991,19 +857,17 @@ def _apply_tril(self, x): 1 1 1 1 ] """ diag = x.size(3) - x.size(2) - if ( - self._tril is None - or self._tril.size(2) < x.size(2) - or self._tril.size(3) < x.size(3) - or self._tril_diag != diag - ): + if (self._tril is None or self._tril.size(2) < x.size(2) + or self._tril.size(3) < x.size(3) or self._tril_diag != diag): # in these cases we need to recompute the lower triangular mask - ones = torch.ones((x.size(2), x.size(3)), dtype=x.dtype, device=x.device) + ones = torch.ones((x.size(2), x.size(3)), + dtype=x.dtype, + device=x.device) self._tril = torch.tril(ones, diag)[None, None, :, :] self._tril_diag = diag tril = self._tril else: - tril = self._tril[:, :, : x.size(2), : x.size(3)] + tril = self._tril[:, :, :x.size(2), :x.size(3)] return x * tril @@ -1018,19 +882,17 @@ def _apply_triu(self, x): """ # we add 1 to put the diagonal to 0 so we don't count the R_0 embedding twice diag = x.size(3) - x.size(2) + 1 - if ( - self._triu is None - or self._triu.size(2) < x.size(2) - or self._triu.size(3) < x.size(3) - or self._triu_diag != diag - ): + if (self._triu is None or self._triu.size(2) < x.size(2) + or self._triu.size(3) < x.size(3) or self._triu_diag != diag): # in these cases we need to recompute the lower triangular mask - ones = torch.ones((x.size(2), x.size(3)), dtype=x.dtype, device=x.device) + ones = torch.ones((x.size(2), x.size(3)), + dtype=x.dtype, + device=x.device) self._triu = torch.triu(ones, diag)[None, None, :, :] self._triu_diag = diag triu = self._triu else: - triu = self._triu[:, :, -x.size(2) :, -x.size(3) :] + triu = self._triu[:, :, -x.size(2):, -x.size(3):] return x * triu @@ -1094,7 +956,8 @@ def forward(self, query, key, value, pos_emb=None, mask=None): q, k, v = self._compute_qkv(query, key, value) pos_batch_size = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(pos_batch_size, -1, self.num_heads, self.d_k) + p = self.linear_pos(pos_emb).view(pos_batch_size, -1, self.num_heads, + self.d_k) p = p.transpose(1, 2) # (batch, head, time2, d_k) q = q.transpose(1, 2) # (batch, time1, head, d_k) @@ -1102,13 +965,14 @@ def forward(self, query, key, value, pos_emb=None, mask=None): q_plus_v = (q + self.v).transpose(1, 2) # (batch, head, time1, d_k) # compute A(a) + A(c) in Sec3.3, 2nd Eq. - AC = torch.matmul(q_plus_u, k.transpose(-2, -1)) # (batch, head, time1, time2) + AC = torch.matmul(q_plus_u, + k.transpose(-2, -1)) # (batch, head, time1, time2) # compute A(b) + A(d) in Sec3.3, 2nd Eq. 
for the causal part # This is the sum of Btilde and Dtilde in the Appendix of the paper - BDtilde = torch.matmul( - q_plus_v, p.transpose(-2, -1) - ) # (batch, head, time1, time2) + BDtilde = torch.matmul(q_plus_v, + p.transpose(-2, + -1)) # (batch, head, time1, time2) # apply left shift as indicated in the Appendix to geth B+D BD = self._left_shift(BDtilde) @@ -1118,19 +982,15 @@ def forward(self, query, key, value, pos_emb=None, mask=None): # we assume that t2 >= t1 dt = key.size(1) - query.size(1) pos_emb_noncausal = pos_emb[:, dt:].flip( - dims=(1,) - ) # we flip to get R_0, ..., R_{L-1} - pos_emb_noncausal[ - :, :, 0::2 - ] *= -1 # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)} + dims=(1, )) # we flip to get R_0, ..., R_{L-1} + pos_emb_noncausal[:, :, 0:: + 2] *= -1 # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)} assert pos_emb[0, -2, 0] == -pos_emb_noncausal[0, 1, 0] p = self.linear_pos(pos_emb_noncausal).view( - pos_batch_size, -1, self.num_heads, self.d_k - ) + pos_batch_size, -1, self.num_heads, self.d_k) p = p.transpose(1, 2) # (batch, head, time2-dt, d_k) - BDtilde = torch.matmul( - q_plus_v, p.transpose(-2, -1) - ) # (batch, head, time1, time2-dt) + BDtilde = torch.matmul(q_plus_v, p.transpose( + -2, -1)) # (batch, head, time1, time2-dt) BD_noncausal = self._right_shift(BDtilde) BD[:, :, :, dt:] += BD_noncausal @@ -1157,8 +1017,6 @@ class LocalScaledDotProdAttRelPosEncV1(LocalScaledDotProdAttV1): context: maximum attention temporal context. causal_pos_enc: positional encoder is 0 for attending future frames. dropout_rate: dropout rate - time_dim: time dimension in the input, default=1 meaning input - dimensions are (batch, time, in_feats) """ def __init__( @@ -1171,7 +1029,6 @@ def __init__( context=25, causal_pos_enc=False, dropout_rate=0, - time_dim=1, ): super().__init__( in_feats, @@ -1181,7 +1038,6 @@ def __init__( d_v, context, dropout_rate=dropout_rate, - time_dim=time_dim, ) self.linear_pos = nn.Linear(in_feats, num_heads * d_k) @@ -1209,19 +1065,17 @@ def _apply_tril(self, x): 1 1 1 1 ] """ diag = x.size(4) - x.size(3) - if ( - self._tril is None - or self._tril.size(3) < x.size(3) - or self._tril.size(4) < x.size(4) - or self._tril_diag != diag - ): + if (self._tril is None or self._tril.size(3) < x.size(3) + or self._tril.size(4) < x.size(4) or self._tril_diag != diag): # in these cases we need to recompute the lower triangular mask - ones = torch.ones((x.size(3), x.size(4)), dtype=x.dtype, device=x.device) + ones = torch.ones((x.size(3), x.size(4)), + dtype=x.dtype, + device=x.device) self._tril = torch.tril(ones, diag)[None, None, None, :, :] self._tril_diag = diag tril = self._tril else: - tril = self._tril[:, :, :, : x.size(3), : x.size(4)] + tril = self._tril[:, :, :, :x.size(3), :x.size(4)] return x * tril @@ -1236,19 +1090,17 @@ def _apply_triu(self, x): """ # we add 1 to put the diagonal to 0 so we don't count the R_0 embedding twice diag = x.size(4) - x.size(3) + 1 - if ( - self._triu is None - or self._triu.size(3) < x.size(3) - or self._triu.size(4) < x.size(4) - or self._triu_diag != diag - ): + if (self._triu is None or self._triu.size(3) < x.size(3) + or self._triu.size(4) < x.size(4) or self._triu_diag != diag): # in these cases we need to recompute the lower triangular mask - ones = torch.ones((x.size(3), x.size(4)), dtype=x.dtype, device=x.device) + ones = torch.ones((x.size(3), x.size(4)), + dtype=x.dtype, + device=x.device) self._triu = torch.triu(ones, diag)[None, None, None, :, :] self._triu_diag = diag triu = 
self._triu else: - triu = self._triu[:, :, :, -x.size(3) :, -x.size(4) :] + triu = self._triu[:, :, :, -x.size(3):, -x.size(4):] return x * triu @@ -1319,27 +1171,52 @@ def forward(self, query, key, value, pos_emb=None, mask=None): Attention weigthed average of the value with size=(batch, time1, out_feats) """ batch_size = query.size(0) - t1 = query.size(self.time_dim) - t2 = key.size(self.time_dim) - q0, k0, v0, context_q, context_k, num_blocks = self._compute_qkv0( - query, key, value - ) - # q0 size=(batch, time1, head*d_k) - # k0 size=(batch, time2, head*d_k) - # v0 size=(batch, time2, head*d_v) - - q_plus_u0 = q0 + self.u.view(-1, q0.size(-1)) # (batch, time1, head*d_k) - - # q = q.transpose(1, 2) # (batch, time1, head, d_k) - # q_plus_u = (q + self.u).transpose(1, 2) #(batch, head, time1, d_k) - # q_plus_v = (q + self.v).transpose(1, 2) #(batch, head, time1, d_k) + t1 = query.size(1) + t2 = key.size(1) + if round(t2 / self.context) > 1: + return self._forward_nblocks(query, key, value, pos_emb, mask) + else: + return self._forward_1block(query, key, value, pos_emb, mask) - # compute A(a) + A(c) in Sec3.3, 2nd Eq. block diagonals - # 1) compute block diagonal affinity matrix - AC1 = self._compute_scores( - q_plus_u0, k0, num_blocks, context_q, context_k, 0, 0 - ) - # (batch, head, blocks, context_q, context_k) + def _forward_nblocks(self, query, key, value, pos_emb=None, mask=None): + """Computes 'Scaled Dot Product Attention' for the case that we have + more than 1block in the block diagonal attention matrix. + + Args: + query: query with size=(batch, time1, in_feats), + where time1 is the output time dimension + key: key with size=(batch, time2, in_feats) + where time1 is the input time dimension + value: value with size=(batch, time2, in_feats) + pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0 + mask: optional mask with size=(batch, time1, time2), + to zero attention between some time steps + or size=(batch, time) to make time1=time2 + Returns: + Attention weigthed average of the value with size=(batch, time1, out_feats) + """ + batch_size = query.size(0) + t1 = query.size(1) + t2 = key.size(1) + + q0, k0, v0, context_q, context_k, num_blocks = self._compute_qkv0( + query, key, value) + # q0 size=(batch, time1, head*d_k) + # k0 size=(batch, time2, head*d_k) + # v0 size=(batch, time2, head*d_v) + + q_plus_u0 = q0 + self.u.view(-1, + q0.size(-1)) # (batch, time1, head*d_k) + + # q = q.transpose(1, 2) # (batch, time1, head, d_k) + # q_plus_u = (q + self.u).transpose(1, 2) #(batch, head, time1, d_k) + # q_plus_v = (q + self.v).transpose(1, 2) #(batch, head, time1, d_k) + + # compute A(a) + A(c) in Sec3.3, 2nd Eq. 
block diagonals + # 1) compute block diagonal affinity matrix + AC1 = self._compute_scores(q_plus_u0, k0, num_blocks, context_q, + context_k, 0, 0) + # (batch, head, blocks, context_q, context_k) # 2) compute shifted block diagonal matrix q_left_shift = context_q // 2 @@ -1358,28 +1235,27 @@ def forward(self, query, key, value, pos_emb=None, mask=None): pos_emb = pos_emb[:, -context_k:] # (1, context_k, d_model) pos_batch_size = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(pos_batch_size, -1, self.num_heads, self.d_k) + p = self.linear_pos(pos_emb).view(pos_batch_size, -1, self.num_heads, + self.d_k) p = p.transpose(1, 2) # (1, head, context_k, d_k) - q = q0.view( - batch_size, -1, self.num_heads, self.d_k - ) # (batch, time1, head, d_k) + q = q0.view(batch_size, -1, self.num_heads, + self.d_k) # (batch, time1, head, d_k) q_plus_v = (q + self.v).transpose(1, 2) # (batch, head, time1, d_k) # compute A(b) + A(d) in Sec3.3, 2nd Eq. for the causal part # This is the sum of Btilde and Dtilde in the Appendix of the paper BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) / math.sqrt( - self.d_k - ) # (batch, head, time1, context_k) + self.d_k) # (batch, head, time1, context_k) # apply left shift as indicated in the Appendix to geth B+D # 1) block-diagonal part of BD: BD1 BD1 = self._left_shift( - BDtilde, context_q, 0 - ) # (batch, head, blocks, context_q, context_k) + BDtilde, context_q, + 0) # (batch, head, blocks, context_q, context_k) # 2) shifted block diagonal part of BD: BD2 BD2 = self._left_shift( - BDtilde, context_q, q_left_shift - ) # (batch, head, blocks-1, context_q, context_k) + BDtilde, context_q, + q_left_shift) # (batch, head, blocks-1, context_q, context_k) # print('BD\n',BD1[0,0,0,:10,:10]) # print(BD2[0,0,0,:10,:10]) @@ -1390,22 +1266,18 @@ def forward(self, query, key, value, pos_emb=None, mask=None): # we assume that t2 >= t1, and therefore context_k >= context_q dt = context_k - context_q pos_emb_noncausal = pos_emb[:, dt:].flip( - dims=(1,) - ) # we flip to get R_0, ..., R_{L-1} - pos_emb_noncausal[ - :, :, 0::2 - ] *= -1 # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)} + dims=(1, )) # we flip to get R_0, ..., R_{L-1} + pos_emb_noncausal[:, :, 0:: + 2] *= -1 # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)} assert pos_emb[0, -2, 0] == -pos_emb_noncausal[0, 1, 0] p = self.linear_pos(pos_emb_noncausal).view( - pos_batch_size, -1, self.num_heads, self.d_k - ) + pos_batch_size, -1, self.num_heads, self.d_k) p = p.transpose(1, 2) # (batch, head, context_k-dt, d_k) BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) / math.sqrt( - self.d_k - ) # (batch, head, time1, context_k-dt) + self.d_k) # (batch, head, time1, context_k-dt) BD_noncausal1 = self._right_shift( - BDtilde, context_q, 0 - ) # (batch, head, blocks, context_q, context_k-dt) + BDtilde, context_q, + 0) # (batch, head, blocks, context_q, context_k-dt) BD_noncausal2 = self._right_shift( BDtilde, context_q, q_left_shift ) # (batch, head, blocks-1, context_q, context_k-dt) @@ -1421,7 +1293,394 @@ def forward(self, query, key, value, pos_emb=None, mask=None): # add AC and BD for block-diag s scores1 = AC1 + BD1 # (batch, head, blocks, context_q, context_k) scores2 = AC2 + BD2 # (batch, head, blocks-1, context_q, context_k) - self._compute_softmax( - scores1, scores2, mask, q_left_shift, k_left_shift, t1, t2 + self._compute_softmax(scores1, scores2, mask, q_left_shift, + k_left_shift, t1, t2) + return self._apply_attn(v0, t1) + + def _forward_1block(self, query, key, value, 
pos_emb=None, mask=None): + """Computes 'Scaled Dot Product Attention' for the case that + there is only one block in the block-diagonal attention matrix. + + Args: + query: query with size=(batch, time1, in_feats), + where time1 is the output time dimension + key: key with size=(batch, time2, in_feats) + where time1 is the input time dimension + value: value with size=(batch, time2, in_feats) + pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0 + mask: optional mask with size=(batch, time1, time2), + to zero attention between some time steps + or size=(batch, time) to make time1=time2 + Returns: + Attention weigthed average of the value with size=(batch, time1, out_feats) + """ + batch_size = value.size(0) + q, k, v = self._compute_qkv(query, key, value) + context_q = query.size(1) + + pos_batch_size = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(pos_batch_size, -1, self.num_heads, + self.d_k) + p = p.transpose(1, 2) # (batch, head, time2, d_k) + + q = q.transpose(1, 2) # (batch, time1, head, d_k) + q_plus_u = (q + self.u).transpose(1, 2) # (batch, head, time1, d_k) + q_plus_v = (q + self.v).transpose(1, 2) # (batch, head, time1, d_k) + + # compute A(a) + A(c) in Sec3.3, 2nd Eq. + AC = torch.matmul(q_plus_u, k.transpose(-2, -1)) + # AC = (batch, head, time1, time2) + + # compute A(b) + A(d) in Sec3.3, 2nd Eq. for the causal part + # This is the sum of Btilde and Dtilde in the Appendix of the paper + BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) + # BDtilde = (batch, head, time1, time2) + # apply left shift as indicated in the Appendix to geth B+D + BD = self._left_shift(BDtilde, context_q, 0).squeeze(2) + + if not self.causal_pos_enc: + # compute A(b) + A(d) for the non-causal part, + # this is not included in the paper because it doesn't allow to attent to future postions + # we assume that t2 >= t1 + dt = key.size(1) - query.size(1) + pos_emb_noncausal = pos_emb[:, dt:].flip( + dims=(1, )) # we flip to get R_0, ..., R_{L-1} + pos_emb_noncausal[:, :, 0:: + 2] *= -1 # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)} + assert pos_emb[0, -2, 0] == -pos_emb_noncausal[0, 1, 0] + p = self.linear_pos(pos_emb_noncausal).view( + pos_batch_size, -1, self.num_heads, self.d_k) + p = p.transpose(1, 2) # (batch, head, time2-dt, d_k) + BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) + # BDtilde = (batch, head, time1, time2-dt) + BD_noncausal = self._right_shift(BDtilde, context_q, 0).squeeze(2) + BD[:, :, :, dt:] += BD_noncausal + + # add and normalize + scores = (AC + BD) / math.sqrt(self.d_k) # (batch, head, time1, time2) + self.attn = self._base_compute_softmax(scores, mask) + return self._base_apply_attn(v) + + +class BlockScaledDotProdAttRelPosEncV1(BlockScaledDotProdAttV1): + """Block Scaled dot product multihead attention layer + It calculates self-attention with block diagonal mask + + It uses relative positional encoders as defined in + https://arxiv.org/pdf/1901.02860.pdf + + Attributes: + in_feats: input feature dimension + out_feats: output feature dimension + num_heads: number of heads + d_k: key/query projection dimension + d_v: value projection dimension + context: maximum attention temporal context. + causal_pos_enc: positional encoder is 0 for attending future frames. 
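
Note for reviewers: the `_left_shift`/`_apply_tril` pair used throughout these classes is the standard Transformer-XL re-alignment trick. A minimal, self-contained sketch of the plain 4-d (non-blocked) variant, with illustrative shapes and names that are not the toolkit's exact API:

```python
import torch

def rel_shift(scores):
    # scores: (batch, head, time1, time2) products (q + v)^T W R_j, with the
    # relative embeddings ordered R_{time2-1}, ..., R_0 along the last axis.
    b, h, t1, t2 = scores.shape
    x = torch.nn.functional.pad(scores, (1, 0))  # prepend one zero column
    x = x.view(b, h, t2 + 1, t1)                 # fold so that rows rotate left
    x = x[:, :, 1:].view(b, h, t1, t2)           # drop the pad, restore the shape
    # keep only causal offsets i - j >= 0, as _apply_tril does above
    mask = torch.tril(torch.ones(t1, t2, dtype=x.dtype, device=x.device), t2 - t1)
    return x * mask

print(rel_shift(torch.randn(2, 4, 3, 4)).shape)  # torch.Size([2, 4, 3, 4])
```

With t1=3, t2=4 this reproduces exactly the before/after matrices shown in the `_left_shift` docstring above.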
+ dropout_rate: dropout rate + """ + + def __init__( + self, + in_feats, + out_feats, + num_heads, + d_k, + d_v, + context=25, + causal_pos_enc=False, + dropout_rate=0, + ): + super().__init__( + in_feats, + out_feats, + num_heads, + d_k, + d_v, + context, + dropout_rate=dropout_rate, ) + + self.linear_pos = nn.Linear(in_feats, num_heads * d_k) + # u, v in paper, Sec 3.3, 2nd eq. + self.u = nn.Parameter(torch.Tensor(num_heads, d_k)) + self.v = nn.Parameter(torch.Tensor(num_heads, d_k)) + # we use same init as in espnet + nn.init.xavier_uniform_(self.u) + nn.init.xavier_uniform_(self.v) + + self.causal_pos_enc = causal_pos_enc + + self._tril = None + self._tril_diag = 0 + self._triu = None + self._triu_diag = 0 + + def _apply_tril(self, x): + """Applies lower triangular mask to (Q + v^T) W R_{i-j} attention matrix + to keep causal attention points, i.e., i-j >= 0 + E.g., + if t1=3, t2=4 this will apply a mask + [1 1 0 0; + 1 1 1 0; + 1 1 1 1 ] + """ + diag = x.size(4) - x.size(3) + if (self._tril is None or self._tril.size(3) < x.size(3) + or self._tril.size(4) < x.size(4) or self._tril_diag != diag): + # in these cases we need to recompute the lower triangular mask + ones = torch.ones((x.size(3), x.size(4)), + dtype=x.dtype, + device=x.device) + self._tril = torch.tril(ones, diag)[None, None, None, :, :] + self._tril_diag = diag + tril = self._tril + else: + tril = self._tril[:, :, :, :x.size(3), :x.size(4)] + + return x * tril + + def _apply_triu(self, x): + """Applies upper triangular mask to (Q + v^T) W R_{i-j} attention matrix + to keep non-causal attention points, i.e., i-j < 0 + E.g., + if t1=3, t2=4 this will apply a mask + [0 0 1 1; + 0 0 0 1; + 0 0 0 0 ] + """ + # we add 1 to put the diagonal to 0 so we don't count the R_0 embedding twice + diag = x.size(4) - x.size(3) + 1 + if (self._triu is None or self._triu.size(3) < x.size(3) + or self._triu.size(4) < x.size(4) or self._triu_diag != diag): + # in these cases we need to recompute the lower triangular mask + ones = torch.ones((x.size(3), x.size(4)), + dtype=x.dtype, + device=x.device) + self._triu = torch.triu(ones, diag)[None, None, None, :, :] + self._triu_diag = diag + triu = self._triu + else: + triu = self._triu[:, :, :, -x.size(3):, -x.size(4):] + + return x * triu + + def _left_shift(self, x, context): + """Applies left shifts to the rows of x + to get scores with relative pos encodings R_{i-j} + i-j >=0, causal attention + + E.g. + [q0 R3, q0 R2, q0 R1, q0 R0; + q1 R3, q1 R2, q1 R1, q1 R0; + q2 R3, q2 R2, q2 R1, q2 R0] + + becomes: + [q0 R1, q0 R0, 0 , 0 ; + q1 R2, q1 R1, q1 R0, 0 ; + q2 R3, q2 R2, q2 R1, q2 R0] + """ + x = x.view(x.size(0), x.size(1), -1, context, x.size(-1)) + x_pad = nn.functional.pad(x, (1, 0), mode="constant", value=0) + x_pad = x_pad.view(*x.size()[:3], x.size(4) + 1, x.size(3)) + x = x_pad[:, :, :, 1:].view_as(x) + return self._apply_tril(x) + + def _right_shift(self, x, context): + """Applies right shifts to the rows of x + to get scores with relative pos encodings R_{i-j} + i-j < 0, non-causal attention + + E.g. 
+ [q0 R_0, q0 R_{-1}, q0 R_{-2}; + q1 R_0, q1 R_{-1}, q1 R_{-2}; + q2 R_0, q1 R_{-1}, q2 R_{-2}] + + becomes: + [ 0, q0 R_{-1}, q0 R_{-2}; + 0, 0 , q1 R_{-1}; + 0, 0 , 0 ] + """ + x = x.view(x.size(0), x.size(1), -1, context, x.size(-1)) + x_pad = nn.functional.pad(x, (0, 1), mode="constant", value=0) + x_pad = x_pad.view(*x.size()[:3], x.size(4) + 1, x.size(3)) + x = x_pad[:, :, :, :-1].view_as(x) + return self._apply_triu(x) + + def forward(self, query, key, value, pos_emb=None, mask=None): + """Computes 'Scaled Dot Product Attention'. + + Args: + query: query with size=(batch, time1, in_feats), + where time1 is the output time dimension + key: key with size=(batch, time2, in_feats) + where time1 is the input time dimension + value: value with size=(batch, time2, in_feats) + pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0 + mask: optional mask with size=(batch, time1, time2), + to zero attention between some time steps + or size=(batch, time) to make time1=time2 + Returns: + Attention weigthed average of the value with size=(batch, time1, out_feats) + """ + batch_size = query.size(0) + t1 = query.size(1) + t2 = key.size(1) + if t2 // self.context > 1: + return self._forward_nblocks(query, key, value, pos_emb, mask) + else: + return self._forward_1block(query, key, value, pos_emb, mask) + + def _forward_nblocks(self, query, key, value, pos_emb=None, mask=None): + """Computes 'Scaled Dot Product Attention' for the case that we have + more than 1block in the block diagonal attention matrix. + + Args: + query: query with size=(batch, time1, in_feats), + where time1 is the output time dimension + key: key with size=(batch, time2, in_feats) + where time1 is the input time dimension + value: value with size=(batch, time2, in_feats) + pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0 + mask: optional mask with size=(batch, time1, time2), + to zero attention between some time steps + or size=(batch, time) to make time1=time2 + Returns: + Attention weigthed average of the value with size=(batch, time1, out_feats) + """ + batch_size = query.size(0) + t1 = query.size(1) + t2 = key.size(1) + + q0, k0, v0, context_q, context_k, num_blocks = self._compute_qkv0( + query, key, value) + # q0 size=(batch, time1, head*d_k) + # k0 size=(batch, time2, head*d_k) + # v0 size=(batch, time2, head*d_v) + + q_plus_u0 = q0 + self.u.view(-1, + q0.size(-1)) # (batch, time1, head*d_k) + + # q = q.transpose(1, 2) # (batch, time1, head, d_k) + # q_plus_u = (q + self.u).transpose(1, 2) #(batch, head, time1, d_k) + # q_plus_v = (q + self.v).transpose(1, 2) #(batch, head, time1, d_k) + + # compute A(a) + A(c) in Sec3.3, 2nd Eq. block diagonals + # 1) compute block diagonal affinity matrix + AC1 = self._compute_scores(q_plus_u0, k0, num_blocks, context_q, + context_k) + # (batch, head, blocks, context_q, context_k) + + # AC = torch.matmul(q_plus_u, k.transpose(-2, -1)) # (batch, head, time1, time2) + + pos_emb = pos_emb[:, -context_k:] # (1, context_k, d_model) + pos_batch_size = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(pos_batch_size, -1, self.num_heads, + self.d_k) + p = p.transpose(1, 2) # (1, head, context_k, d_k) + + q = q0.view(batch_size, -1, self.num_heads, + self.d_k) # (batch, time1, head, d_k) + q_plus_v = (q + self.v).transpose(1, 2) # (batch, head, time1, d_k) + + # compute A(b) + A(d) in Sec3.3, 2nd Eq. 
for the causal part + # This is the sum of Btilde and Dtilde in the Appendix of the paper + BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) / math.sqrt( + self.d_k) + # BDtilde = (batch, head, time1, context_k) + # apply left shift as indicated in the Appendix to geth B+D + # 1) block-diagonal part of BD: BD1 + BD1 = self._left_shift(BDtilde, context_q) + # BD = (batch, head, blocks, context_q, context_k) + # print('BD\n',BD1[0,0,0,:10,:10]) + + if not self.causal_pos_enc: + # compute A(b) + A(d) for the non-causal part, + # this is not included in the paper because it doesn't allow to attent to future postions + # we assume that t2 >= t1, and therefore context_k >= context_q + dt = context_k - context_q + pos_emb_noncausal = pos_emb[:, dt:].flip( + dims=(1, )) # we flip to get R_0, ..., R_{L-1} + pos_emb_noncausal[:, :, 0:: + 2] *= -1 # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)} + assert pos_emb[0, -2, 0] == -pos_emb_noncausal[0, 1, 0] + p = self.linear_pos(pos_emb_noncausal).view( + pos_batch_size, -1, self.num_heads, self.d_k) + p = p.transpose(1, 2) # (batch, head, context_k-dt, d_k) + BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) / math.sqrt( + self.d_k) # (batch, head, time1, context_k-dt) + BD_noncausal1 = self._right_shift(BDtilde, context_q) + # BD = (batch, head, blocks, context_q, context_k-dt) + # print(BD_noncausal1[0,0,0,:10,:10]) + BD1[:, :, :, :, dt:] += BD_noncausal1 + + # print(BD1[0,0,0,:10,:10]) + + # add AC and BD for block-diag s + scores = AC1 + BD1 # (batch, head, blocks, context_q, context_k) + self._compute_softmax(scores, mask, t1, t2) return self._apply_attn(v0, t1) + + def _forward_1block(self, query, key, value, pos_emb=None, mask=None): + """Computes 'Scaled Dot Product Attention' for the case that + there is only one block in the block-diagonal attention matrix. + + Args: + query: query with size=(batch, time1, in_feats), + where time1 is the output time dimension + key: key with size=(batch, time2, in_feats) + where time1 is the input time dimension + value: value with size=(batch, time2, in_feats) + pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0 + mask: optional mask with size=(batch, time1, time2), + to zero attention between some time steps + or size=(batch, time) to make time1=time2 + Returns: + Attention weigthed average of the value with size=(batch, time1, out_feats) + """ + batch_size = value.size(0) + q, k, v = self._compute_qkv(query, key, value) + context_q = query.size(1) + + pos_batch_size = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(pos_batch_size, -1, self.num_heads, + self.d_k) + p = p.transpose(1, 2) # (batch, head, time2, d_k) + + q = q.transpose(1, 2) # (batch, time1, head, d_k) + q_plus_u = (q + self.u).transpose(1, 2) # (batch, head, time1, d_k) + q_plus_v = (q + self.v).transpose(1, 2) # (batch, head, time1, d_k) + + # compute A(a) + A(c) in Sec3.3, 2nd Eq. + AC = torch.matmul(q_plus_u, k.transpose(-2, -1)) + # AC = (batch, head, time1, time2) + + # compute A(b) + A(d) in Sec3.3, 2nd Eq. 
for the causal part + # This is the sum of Btilde and Dtilde in the Appendix of the paper + BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) + # BDtilde = (batch, head, time1, time2) + # apply left shift as indicated in the Appendix to geth B+D + BD = self._left_shift(BDtilde, context_q).squeeze(2) + + if not self.causal_pos_enc: + # compute A(b) + A(d) for the non-causal part, + # this is not included in the paper because it doesn't allow to attent to future postions + # we assume that t2 >= t1 + dt = key.size(1) - query.size(1) + pos_emb_noncausal = pos_emb[:, dt:].flip( + dims=(1, )) # we flip to get R_0, ..., R_{L-1} + pos_emb_noncausal[:, :, 0:: + 2] *= -1 # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)} + assert pos_emb[0, -2, 0] == -pos_emb_noncausal[0, 1, 0] + p = self.linear_pos(pos_emb_noncausal).view( + pos_batch_size, -1, self.num_heads, self.d_k) + p = p.transpose(1, 2) # (batch, head, time2-dt, d_k) + BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) + # BDtilde = (batch, head, time1, time2-dt) + BD_noncausal = self._right_shift(BDtilde, context_q).squeeze(2) + BD[:, :, :, dt:] += BD_noncausal + + # add and normalize + scores = (AC + BD) / math.sqrt(self.d_k) # (batch, head, time1, time2) + self.attn = self._base_compute_softmax(scores, mask) + return self._base_apply_attn(v) diff --git a/hyperion/torch/layers/audio_feats.py b/hyperion/torch/layers/audio_feats.py index d435ebbd..ed26b576 100644 --- a/hyperion/torch/layers/audio_feats.py +++ b/hyperion/torch/layers/audio_feats.py @@ -2,16 +2,13 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -# -import math import logging - -from ...utils.misc import str2bool +import math import torch -import torch.nn as nn import torch.cuda.amp as amp +import torch.nn as nn try: from torch.fft import rfft as torch_rfft @@ -24,7 +21,7 @@ _pow_spectrogram = lambda x: x.pow(2).sum(-1) _spectrogram = lambda x: x.pow(2).sum(-1).sqrt() -from ...feats.filter_banks import FilterBankFactory as FBF +from ...np.feats.filter_banks import FilterBankFactory as FBF # window types HAMMING = "hamming" @@ -68,21 +65,23 @@ def _get_feature_window_function(window_type, window_size, blackman_coeff=0.42): def _get_strided_batch(waveform, window_length, window_shift, snip_edges, center=False): - r"""Given a waveform (1D tensor of size ``num_samples``), it returns a 2D tensor (m, ``window_size``) - representing how the window is shifted along the waveform. Each row is a frame. + """Given a waveform (2D tensor of size (batch, num_samples), + it returns a 3D tensor (batch, m, window_size) + representing how the window is shifted along the waveform. Each row is a frame. Args: - waveform (torch.Tensor): Tensor of size ``num_samples`` - window_size (int): Frame length - window_shift (int): Frame shift - snip_edges (bool): If True, end effects will be handled by outputting only frames that completely fit - in the file, and the number of frames depends on the frame_length. If False, the number of frames - depends only on the frame_shift, and we reflect the data at the ends. - center (bool): If true, if puts the center of the frame at t*window_shift, starting at t=0, - If overwrides snip_edges and set it to False + waveform: Tensor of size (batch, num_samples). + window_size: Frame length in samples. + window_shift: Frame shift in samples. 
+ snip_edges: If True, end effects will be handled by outputting only frames + that completely fit in the file, and the number of frames depends + on the frame_length. If False, the number of frames depends only + on the frame_shift, and we reflect the data at the ends. + center (bool): If true, if puts the center of the frame at t*window_shift, + starting at t=0, it overwrides snip_edges and set it to False Returns: - torch.Tensor: 3D tensor of size (m, ``window_size``) where each row is a frame + 3D tensor of size (batch, m, ``window_size``) where each row is a frame """ assert waveform.dim() == 2 batch_size = waveform.size(0) @@ -123,7 +122,7 @@ def _get_strided_batch(waveform, window_length, window_shift, snip_edges, center def _get_log_energy(x, energy_floor): - r"""Returns the log energy of size (m) for a strided_input (m,*)""" + r"""Returns the log energy of size (batch, m) for a strided_input (batch, m,*)""" log_energy = (x.pow(2).sum(-1) + 1e-15).log() # size (m) if energy_floor > 0.0: log_energy = torch.max( @@ -135,6 +134,13 @@ def _get_log_energy(x, energy_floor): class Wav2Win(nn.Module): + """Class that takes a batch of waveforms and returns windowed frames + with a given frame-shift and frame-length. + + Attributes: + + """ + def __init__( self, fs=16000, @@ -151,7 +157,6 @@ def __init__( raw_energy=True, return_log_energy=False, ): - super().__init__() self.fs = fs self.frame_length = frame_length @@ -204,7 +209,6 @@ def __str__(self): return s def forward(self, x): - # Add dither if self.dither != 0.0: n = torch.randn(x.shape, device=x.device) @@ -237,7 +241,7 @@ def forward(self, x): if self.return_log_energy and not self.raw_energy: signal_log_energy = _get_log_energy( - strided_input, self.energy_floor + x_strided, self.energy_floor ) # size (batch, m) # Pad columns with zero until we reach size (batch, num_frames, pad_length) @@ -254,6 +258,37 @@ def forward(self, x): class Wav2FFT(nn.Module): + """Computes FFT from waveforms. + + Attributes: + fs: Waveform data sample frequency (must match the waveform + file, if specified there) (default = 16000) + frame_length: Frame length in milliseconds (default = 25) + frame_shift: Frame shift in milliseconds (default = 10) + fft_length: Length of FFT (default = 512) + remove_dc_offset: Subtract mean from waveform on each frame (default = True) + preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97) + window_type: Type of window ["hamming"|"hanning"|"povey"|"rectangular"| + "blackmann"] (default = 'povey') + use_fft_mag: If false, it uses |X(f)|^2, if true, it uses |X(f)|, + (default = False) + dither: Dithering constant (0.0 means no dither) (default = 1) + snip_edges: If true, end effects will be handled by outputting only + frames that completely fit in the file, and the number of + frames depends on the frame-length. + If false, the number of frames depends only on the + frame-shift, and we reflect the data at the ends. 
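
As background for the framing rewrite above: for the `snip_edges=True` case, the strided view is equivalent to `Tensor.unfold` (a sketch; the sample rate and frame sizes are illustrative, and the real function also handles reflection padding and centering):

```python
import torch

wave = torch.randn(2, 16000)            # (batch, num_samples): 1 s at 16 kHz
window_length, window_shift = 400, 160  # 25 ms frames with a 10 ms shift
# snip_edges=True keeps only frames that fit entirely inside the signal
frames = wave.unfold(-1, window_length, window_shift)
print(frames.shape)                     # torch.Size([2, 98, 400])
```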
+ (default = True) + center: If true, if puts the center of the frame at t*window_shift, starting at t=0, + If overwrides snip_edges and set it to False + energy_floor: Floor on energy (absolute, not relative) in MFCC computation + (default = 0) + raw_energy: If true, compute energy before preemphasis and + windowing (default = True) + use_energy: Use energy (not C0) in MFCC computation (default = True) + + """ + def __init__( self, fs=16000, @@ -270,13 +305,12 @@ def __init__( raw_energy=True, use_energy=True, ): - super().__init__() N = int(math.floor(frame_length * fs / 1000)) if N > fft_length: k = math.ceil(math.log(N) / math.log(2)) - self.fft_length = int(2 ** k) + self.fft_length = int(2**k) self.wav2win = Wav2Win( fs, @@ -326,14 +360,19 @@ def dither(self): return self.wav2win.dither def forward(self, x): + """Computes the comples Fourier transform. + Args: + x: waveform tensor with shape = (batch, num_samples). + + Returns: + FFT tensor with shape = (batch, num_frames, fft_length//2+1) + """ x_strided = self.wav2win(x) if self.use_energy: x_strided, log_e = x_strided - # X = torch.rfft(x_strided, 1, normalized=False, onesided=True) X = _rfft(x_strided) - if self.use_energy: X[:, 0, :, 0] = log_e @@ -341,6 +380,37 @@ def forward(self, x): class Wav2Spec(Wav2FFT): + """Computes Spectrograms from waveforms. + + Attributes: + fs: Waveform data sample frequency (must match the waveform + file, if specified there) (default = 16000) + frame_length: Frame length in milliseconds (default = 25) + frame_shift: Frame shift in milliseconds (default = 10) + fft_length: Length of FFT (default = 512) + remove_dc_offset: Subtract mean from waveform on each frame (default = True) + preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97) + window_type: Type of window ["hamming"|"hanning"|"povey"|"rectangular"| + "blackmann"] (default = 'povey') + use_fft_mag: If false, it uses |X(f)|^2, if true, it uses |X(f)|, + (default = False) + dither: Dithering constant (0.0 means no dither) (default = 1) + snip_edges: If true, end effects will be handled by outputting only + frames that completely fit in the file, and the number of + frames depends on the frame-length. + If false, the number of frames depends only on the + frame-shift, and we reflect the data at the ends. + (default = True) + center: If true, if puts the center of the frame at t*window_shift, starting at t=0, + If overwrides snip_edges and set it to False + energy_floor: Floor on energy (absolute, not relative) in MFCC computation + (default = 0) + raw_energy: If true, compute energy before preemphasis and + windowing (default = True) + use_energy: Use energy (not C0) in MFCC computation (default = True) + + """ + def __init__( self, fs=16000, @@ -358,7 +428,6 @@ def __init__( raw_energy=True, use_energy=True, ): - super().__init__( fs, frame_length, @@ -382,18 +451,21 @@ def __init__( self._to_spec = _pow_spectrogram def forward(self, x): + """Computes the Spectrogram. + + Args: + x: waveform tensor with shape = (batch, num_samples). 
+ + Returns: + Spectrogram tensor with shape = (batch, num_frames, fft_length//2+1) + """ x_strided = self.wav2win(x) if self.use_energy: x_strided, log_e = x_strided - # X = torch.rfft(x_strided, 1, normalized=False, onesided=True) X = _rfft(x_strided) pow_spec = self._to_spec(X) - # pow_spec = X.pow(2).sum(-1) - # if self.use_fft_mag: - # pow_spec = pow_spec.sqrt() - if self.use_energy: pow_spec[:, 0] = log_e @@ -401,6 +473,37 @@ def forward(self, x): class Wav2LogSpec(Wav2FFT): + """Computes log-spectrograms from waveforms. + + Attributes: + fs: Waveform data sample frequency (must match the waveform + file, if specified there) (default = 16000) + frame_length: Frame length in milliseconds (default = 25) + frame_shift: Frame shift in milliseconds (default = 10) + fft_length: Length of FFT (default = 512) + remove_dc_offset: Subtract mean from waveform on each frame (default = True) + preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97) + window_type: Type of window ["hamming"|"hanning"|"povey"|"rectangular"| + "blackmann"] (default = 'povey') + use_fft_mag: If false, it uses |X(f)|^2, if true, it uses |X(f)|, + (default = False) + dither: Dithering constant (0.0 means no dither) (default = 1) + snip_edges: If true, end effects will be handled by outputting only + frames that completely fit in the file, and the number of + frames depends on the frame-length. + If false, the number of frames depends only on the + frame-shift, and we reflect the data at the ends. + (default = True) + center: If true, if puts the center of the frame at t*window_shift, starting at t=0, + If overwrides snip_edges and set it to False + energy_floor: Floor on energy (absolute, not relative) in MFCC computation + (default = 0) + raw_energy: If true, compute energy before preemphasis and + windowing (default = True) + use_energy: Use energy (not C0) in MFCC computation (default = True) + + """ + def __init__( self, fs=16000, @@ -418,7 +521,6 @@ def __init__( raw_energy=True, use_energy=True, ): - super().__init__( fs, frame_length, @@ -442,21 +544,21 @@ def __init__( self._to_spec = _pow_spectrogram def forward(self, x): + """Computes the log-spectrogram. + + Args: + x: waveform tensor with shape = (batch, num_samples). + Returns: + Spectrogram tensor with shape = (batch, num_frames, fft_length//2+1) + """ x_strided = self.wav2win(x) if self.use_energy: x_strided, log_e = x_strided - # X = torch.rfft(x_strided, 1, normalized=False, onesided=True) X = _rfft(x_strided) pow_spec = self._to_spec(X) - - # pow_spec = X.pow(2).sum(-1) - # if self.use_fft_mag: - # pow_spec = pow_spec.sqrt() - pow_spec = (pow_spec + 1e-15).log() - if self.use_energy: pow_spec[:, 0] = log_e @@ -464,6 +566,46 @@ def forward(self, x): class Wav2LogFilterBank(Wav2FFT): + """Computes log-filter-bank from waveforms. 
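
For reference, the power/magnitude/log variants computed by `Wav2Spec` and `Wav2LogSpec` reduce to the following (a sketch using the `torch.fft` API; shapes assume `fft_length=512`):

```python
import torch

frames = torch.randn(2, 98, 512)          # windowed frames, fft_length = 512
X = torch.fft.rfft(frames)                # (2, 98, 257), complex
pow_spec = X.real.pow(2) + X.imag.pow(2)  # |X(f)|^2  (use_fft_mag=False)
mag_spec = pow_spec.sqrt()                # |X(f)|    (use_fft_mag=True)
log_spec = (pow_spec + 1e-15).log()       # same 1e-15 floor as Wav2LogSpec above
```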
+ + Attributes: + fs: Waveform data sample frequency (must match the waveform + file, if specified there) (default = 16000) + frame_length: Frame length in milliseconds (default = 25) + frame_shift: Frame shift in milliseconds (default = 10) + fft_length: Length of FFT (default = 512) + remove_dc_offset: Subtract mean from waveform on each frame (default = True) + preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97) + window_type: Type of window ["hamming"|"hanning"|"povey"|"rectangular"| + "blackmann"] (default = 'povey') + use_fft_mag: If false, it uses |X(f)|^2, if true, it uses |X(f)|, + (default = False) + dither: Dithering constant (0.0 means no dither) (default = 1) + fb_type: Filter-bank type in ["mel_kaldi", "mel_etsi", + "mel_librosa", "mel_librosa_htk", "linear"] + (default = 'mel_kaldi') + low_freq: Low cutoff frequency for mel bins (default = 20) + high_freq: High cutoff frequency for mel bins, if < 0, + offset from Nyquist (default = 0) + num_filters: Number of triangular mel-frequency bins (default = 23) + norm_filters: Normalize filters coeff to sum up to 1, if librosa + it uses Stanley norm (default = False) + snip_edges: If true, end effects will be handled by outputting only + frames that completely fit in the file, and the number of + frames depends on the frame-length. + If false, the number of frames depends only on the + frame-shift, and we reflect the data at the ends. + (default = True) + center: If true, if puts the center of the frame at t*window_shift, starting at t=0, + If overwrides snip_edges and set it to False + energy_floor: Floor on energy (absolute, not relative) in MFCC computation + (default = 0) + raw_energy: If true, compute energy before preemphasis and + windowing (default = True) + use_energy: Use energy (not C0) in MFCC computation (default = True) + + """ + def __init__( self, fs=16000, @@ -486,7 +628,6 @@ def __init__( raw_energy=True, use_energy=True, ): - super().__init__( fs, frame_length, @@ -528,29 +669,24 @@ def __init__( self._to_spec = _pow_spectrogram def forward(self, x): + """Computes the log-filter-banks. + Args: + x: waveform tensor with shape = (batch, num_samples). + + Returns: + Filter-bank tensor with shape = (batch, num_frames, num_filters) + """ x_strided = self.wav2win(x) if self.use_energy: x_strided, log_e = x_strided - # X = torch.rfft(x_strided, 1, normalized=False, onesided=True) X = _rfft(x_strided) - # logging.info('X={} {}'.format(X, X.type())) - # logging.info('X={}'.format(X.type())) pow_spec = self._to_spec(X) - # pow_spec = X.pow(2).sum(-1) - # # logging.info('p={} {} nan={}'.format(pow_spec, pow_spec.type(), torch.sum(torch.isnan(pow_spec)))) - # # logging.info('p={}'.format(pow_spec.type())) - # if self.use_fft_mag: - # pow_spec = pow_spec.sqrt() - with amp.autocast(enabled=False): pow_spec = torch.matmul(pow_spec.float(), self._fb.float()) - # logging.info('fb={} {}'.format(pow_spec, pow_spec.type())) - # logging.info('fb={}'.format(pow_spec.type())) + pow_spec = (pow_spec + 1e-10).log() - # logging.info('lfb={} {}'.format(pow_spec, pow_spec.type())) - # logging.info('lfb={}'.format(pow_spec.type())) if self.use_energy: pow_spec = torch.cat((log_e.unsqueeze(-1), pow_spec), dim=-1) @@ -558,6 +694,49 @@ def forward(self, x): class Wav2MFCC(Wav2FFT): + """Computes MFCC from waveforms. 
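
The filter-bank projection in `Wav2LogFilterBank.forward` runs in float32 under `amp.autocast(enabled=False)`; numerically it is just a matmul plus a floored log (shapes and the random `fb` matrix are illustrative, not a real mel matrix):

```python
import torch

pow_spec = torch.rand(2, 98, 257)      # (batch, frames, fft_length//2+1)
fb = torch.rand(257, 80)               # assumed (fft_bins, num_filters) mel weights
log_fb = (pow_spec @ fb + 1e-10).log() # (2, 98, 80), same 1e-10 floor as above
```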
+ + Attributes: + fs: Waveform data sample frequency (must match the waveform + file, if specified there) (default = 16000) + frame_length: Frame length in milliseconds (default = 25) + frame_shift: Frame shift in milliseconds (default = 10) + fft_length: Length of FFT (default = 512) + remove_dc_offset: Subtract mean from waveform on each frame (default = True) + preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97) + window_type: Type of window ["hamming"|"hanning"|"povey"|"rectangular"| + "blackmann"] (default = 'povey') + use_fft_mag: If false, it uses |X(f)|^2, if true, it uses |X(f)|, + (default = False) + dither: Dithering constant (0.0 means no dither) (default = 1) + fb_type: Filter-bank type in ["mel_kaldi", "mel_etsi", + "mel_librosa", "mel_librosa_htk", "linear"] + (default = 'mel_kaldi') + low_freq: Low cutoff frequency for mel bins (default = 20) + high_freq: High cutoff frequency for mel bins, if < 0, + offset from Nyquist (default = 0) + num_filters: Number of triangular mel-frequency bins (default = 23) + norm_filters: Normalize filters coeff to sum up to 1, if librosa + it uses Stanley norm (default = False) + num_ceps: Number of cepstra in MFCC computation (including C0) + (default = 13) + snip_edges: If true, end effects will be handled by outputting only + frames that completely fit in the file, and the number of + frames depends on the frame-length. + If false, the number of frames depends only on the + frame-shift, and we reflect the data at the ends. + (default = True) + center: If true, if puts the center of the frame at t*window_shift, starting at t=0, + If overwrides snip_edges and set it to False + cepstral_lifter: Constant that controls scaling of MFCCs (default = 22) + energy_floor: Floor on energy (absolute, not relative) in MFCC computation + (default = 0) + raw_energy: If true, compute energy before preemphasis and + windowing (default = True) + use_energy: Use energy (not C0) in MFCC computation (default = True) + + """ + def __init__( self, fs=16000, @@ -582,7 +761,6 @@ def __init__( raw_energy=True, use_energy=True, ): - super().__init__( fs, frame_length, @@ -650,6 +828,15 @@ def make_lifter(N, Q): @staticmethod def make_dct_matrix(num_ceps, num_filters): + """Calculates the DCT Matrix. + + Args: + num_ceps: Number of cepstral coeffs. + num_filters: Number of filters. + + Returns + DCT matrix (num_ceps, num_filters) + """ n = torch.arange(float(num_filters)).unsqueeze(1) k = torch.arange(float(num_ceps)) dct = torch.cos( @@ -660,23 +847,25 @@ def make_dct_matrix(num_ceps, num_filters): return dct def forward(self, x): + """Computes the MFCC. + + Args: + x: Waveform tensor with shape = (batch, num_samples). 
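
The body of `make_dct_matrix` is split across hunks here; for reference, a self-contained DCT-II sketch in the Kaldi convention (my reconstruction from the visible lines, not necessarily the exact committed code):

```python
import math
import torch

def make_dct_matrix(num_ceps, num_filters):
    n = torch.arange(float(num_filters)).unsqueeze(1)       # filter index
    k = torch.arange(float(num_ceps))                       # cepstral index
    dct = torch.cos(math.pi / num_filters * (n + 0.5) * k)  # DCT-II basis
    dct[:, 0] *= 1.0 / math.sqrt(2.0)                       # orthonormal scaling
    dct *= math.sqrt(2.0 / num_filters)
    return dct                                              # (num_filters, num_ceps)

log_fb = torch.randn(2, 98, 80)
mfcc = log_fb @ make_dct_matrix(13, 80)                     # (2, 98, 13)
```

Note the returned matrix is laid out as (num_filters, num_ceps) so it can right-multiply the log filter banks, as `torch.matmul(pow_spec, self._dct)` does above.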
+ + Returns: + MFCC tensor with shape = (batch, num_frames, num_ceps) + """ x_strided = self.wav2win(x) if self.use_energy: x_strided, log_e = x_strided - # X = torch.rfft(x_strided, 1, normalized=False, onesided=True) X = _rfft(x_strided) pow_spec = self._to_spec(X) - # pow_spec = X.pow(2).sum(-1) - # if self.use_fft_mag: - # pow_spec = pow_spec.sqrt() - with amp.autocast(enabled=False): pow_spec = torch.matmul(pow_spec.float(), self._fb.float()) pow_spec = (pow_spec + 1e-10).log() - mfcc = torch.matmul(pow_spec, self._dct) if self.cepstral_lifter > 0: mfcc *= self._lifter @@ -691,6 +880,31 @@ class Wav2KanBayashiLogFilterBank(Wav2LogFilterBank): """Class to replicate log-filter-banks used in Kan Bayashi's ParallelWaveGAN repository: https://github.com/kan-bayashi/ParallelWaveGAN + + Attributes: + fs: Waveform data sample frequency (must match the waveform + file, if specified there) (default = 16000) + frame_length: Frame length in milliseconds + frame_shift: Frame shift in milliseconds + fft_length: Length of FFT (default = 512) + remove_dc_offset: Subtract mean from waveform on each frame (default = True) + window_type: Type of window ["hamming"|"hanning"|"povey"|"rectangular"| + "blackmann"] (default = 'povey') + fb_type: Filter-bank type in ["mel_kaldi", "mel_etsi", + "mel_librosa", "mel_librosa_htk", "linear"] + (default = 'mel_kaldi') + low_freq: Low cutoff frequency for mel bins (default = 20) + high_freq: High cutoff frequency for mel bins, if < 0, + offset from Nyquist (default = 0) + num_filters: Number of triangular mel-frequency bins (default = 23) + snip_edges: If true, end effects will be handled by outputting only + frames that completely fit in the file, and the number of + frames depends on the frame-length. + If false, the number of frames depends only on the + frame-shift, and we reflect the data at the ends. + (default = True) + center: If true, if puts the center of the frame at t*window_shift, starting at t=0, + If overwrides snip_edges and set it to False """ def __init__( @@ -707,7 +921,6 @@ def __init__( snip_edges=False, center=True, ): - super().__init__( fs=fs, frame_length=frame_length, @@ -732,6 +945,14 @@ def __init__( self.scale = 1.0 / math.log(10) def forward(self, x): + """Computes the Log filter banks using Kan Bayashi configuration. + + Args: + x: Waveform tensor with shape = (batch, num_samples). + + Returns: + Filter-bank tensor with shape = (batch, num_frames, num_samples) + """ return self.scale * super().forward(x) @@ -746,7 +967,6 @@ def __init__( num_filters=23, norm_filters=False, ): - super().__init__() self.fs = fs self.fft_length = fft_length @@ -770,6 +990,14 @@ def __init__( ) def forward(self, x): + """Computes the Log filter banks from spectrograms. + + Args: + x: Waveform tensor with shape = (batch, num_samples). 
+ + Returns: + Filter-bank tensor with shape = (batch, num_frames, num_filters) + """ with amp.autocast(enabled=False): pow_spec = torch.matmul(x.float(), self._fb.float()) pow_spec = (pow_spec + 1e-10).log() diff --git a/hyperion/torch/layers/audio_feats_factory.py b/hyperion/torch/layers/audio_feats_factory.py index ac463f07..1694e84e 100644 --- a/hyperion/torch/layers/audio_feats_factory.py +++ b/hyperion/torch/layers/audio_feats_factory.py @@ -2,11 +2,11 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser import re -from ...utils.misc import str2bool -from ...feats.filter_banks import FilterBankFactory as FBF +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...np.feats.filter_banks import FilterBankFactory as FBF from .audio_feats import * FFT = "fft" @@ -19,7 +19,11 @@ FEAT_TYPES = [FFT, SPEC, LOG_SPEC, LOG_FB, MFCC, KAN_BAYASHI] -class AudioFeatsFactory(object): +class AudioFeatsFactory: + """Factory class to create acoustic features layers like + FFT, Spectrogram, log-Spectrogram, log-filter-bank, MFCC. + """ + @staticmethod def create( audio_feat, @@ -45,6 +49,53 @@ def create( raw_energy=True, use_energy=True, ): + """ + Method that creates acoustic features layers like + FFT, Spectrogram, log-Spectrogram, log-filter-bank, MFCC. + + Args: + audio_feat: Type of feature extractor in ["fft", "spec", "log_spec", + "logfb", "mfcc", "kanbayashi_logfb"]. "kanbayashi_logfb" + should produce features compatible with WaveGAN repository. + sample_frequency: Waveform data sample frequency (must match the waveform + file, if specified there) (default = 16000) + frame_length: Frame length in milliseconds (default = 25) + frame_shift: Frame shift in milliseconds (default = 10) + fft_length: Length of FFT (default = 512) + remove_dc_offset: Subtract mean from waveform on each frame (default = True) + preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97) + window_type: Type of window ["hamming"|"hanning"|"povey"|"rectangular"| + "blackmann"] (default = 'povey') + use_fft_mag: If false, it uses |X(f)|^2, if true, it uses |X(f)|, + (default = False) + dither: Dithering constant (0.0 means no dither) (default = 1) + fb_type: Filter-bank type in ["mel_kaldi", "mel_etsi", + "mel_librosa", "mel_librosa_htk", "linear"] + (default = 'mel_kaldi') + low_freq: Low cutoff frequency for mel bins (default = 20) + high_freq: High cutoff frequency for mel bins, if < 0, + offset from Nyquist (default = 0) + num_filters: Number of triangular mel-frequency bins (default = 23) + norm_filters: Normalize filters coeff to sum up to 1, if librosa + it uses Stanley norm (default = False) + num_ceps: Number of cepstra in MFCC computation (including C0) + (default = 13) + snip_edges: If true, end effects will be handled by outputting only + frames that completely fit in the file, and the number of + frames depends on the frame-length. + If false, the number of frames depends only on the + frame-shift, and we reflect the data at the ends. 
+                  (default = True)
+            center: If true, it puts the center of the frame at t*window_shift,
+                  starting at t=0. It overrides snip_edges and sets it to False
+            cepstral_lifter: Constant that controls scaling of MFCCs (default = 22)
+            energy_floor: Floor on energy (absolute, not relative) in MFCC computation
+                  (default = 0)
+            raw_energy: If true, compute energy before preemphasis and
+                  windowing (default = True)
+            use_energy: Use energy (not C0) in MFCC computation (default = True)
+
+        """
 
         if audio_feat == FFT:
             return Wav2FFT(
@@ -161,15 +212,17 @@ def create(
                 snip_edges=snip_edges,
             )
 
+        raise ValueError(f"unknown feature type {audio_feat}")
+
     @staticmethod
     def filter_args(**kwargs):
-        """Filters MFCC args from arguments dictionary.
+        """Filters feature extractor args from arguments dictionary.
 
         Args:
           kwargs: Arguments dictionary.
 
         Returns:
-          Dictionary with MFCC options.
+          Dictionary with feature extractor options.
         """
         valid_args = (
             "sample_frequency",
@@ -189,7 +242,7 @@ def filter_args(**kwargs):
             "norm_filters",
             "num_ceps",
             "snip_edges",
-            "energy_floor",
+            "center",
+            "energy_floor",
             "raw_energy",
             "use_energy",
             "cepstral_lifter",
@@ -201,7 +254,7 @@ def filter_args(**kwargs):
 
     @staticmethod
     def add_class_args(parser, prefix=None):
-        """Adds MFCC options to parser.
+        """Adds feature extractor options to parser.
 
         Args:
           parser: Arguments parser
@@ -232,7 +285,7 @@ def add_class_args(parser, prefix=None):
         parser.add_argument(
             "--remove-dc-offset",
             default=True,
-            type=str2bool,
+            action=ActionYesNo,
             help="Subtract mean from waveform on each frame",
         )
 
@@ -263,7 +316,7 @@ def add_class_args(parser, prefix=None):
         parser.add_argument(
             "--dither",
             type=float,
-            default=1,
+            default=1.0 / 2**15,
             help="Dithering constant (0.0 means no dither)",
         )
 
@@ -279,7 +332,7 @@ def add_class_args(parser, prefix=None):
         parser.add_argument(
             "--snip-edges",
             default=True,
-            type=str2bool,
+            action=ActionYesNo,
             help=(
                 "If true, end effects will be handled by outputting only "
                 "frames that completely fit in the file, and the number of "
@@ -292,7 +345,7 @@ def add_class_args(parser, prefix=None):
         parser.add_argument(
             "--center",
             default=False,
-            type=str2bool,
+            action=ActionYesNo,
             help=(
                 "If true, puts the center of the frame at t*frame_shift, "
                 "it overrides snip-edges and sets it to false"
             ),
         )
 
@@ -309,13 +362,13 @@ def add_class_args(parser, prefix=None):
         parser.add_argument(
             "--raw-energy",
             default=True,
-            type=str2bool,
+            action=ActionYesNo,
             help="If true, compute energy before preemphasis and windowing",
         )
         parser.add_argument(
             "--use-energy",
             default=True,
-            type=str2bool,
+            action=ActionYesNo,
             help="Use energy (not C0) in MFCC computation",
         )
 
@@ -328,15 +381,14 @@ def add_class_args(parser, prefix=None):
 
         parser.add_argument(
             "--audio-feat",
-            default="cepstrum",
+            default="logfb",
             choices=FEAT_TYPES,
             help=(
-                "It can return intermediate result: fft, spec, log_spec, " "logfb, mfcc"
+                "It can return intermediate result: fft, spec, log_spec, logfb, mfcc"
             ),
         )
 
         if prefix is not None:
             outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
-        # help='acoustic features options')
 
     add_argparse_args = add_class_args
diff --git a/hyperion/torch/layers/calibrators.py b/hyperion/torch/layers/calibrators.py
index 4b38a858..51d363b8 100644
--- a/hyperion/torch/layers/calibrators.py
+++ b/hyperion/torch/layers/calibrators.py
@@ -8,10 +8,26 @@
 
 class LinBinCalibrator(nn.Module):
+    """Linear score calibrator.
+    Applies a scale and bias to a tensor.
+
+    Attributes:
+      a: Scale
+      b: Bias
+    """
+
     def __init__(self, a, b):
         super().__init__()
         self.a = a
         self.b = b
 
     def forward(self, x):
+        """Applies scale and bias to a tensor.
+
+        Args:
+          x: Input tensor.
+
+        Returns:
+          Calibrated tensor.
+        """
         return self.a * x + self.b
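
Usage of the calibrator is just an affine map on scores; for example (the `a` and `b` values below are made up, they would normally come from a calibration backend):

```python
import torch
import torch.nn as nn

class LinBinCalibrator(nn.Module):  # as defined in the hunk above
    def __init__(self, a, b):
        super().__init__()
        self.a = a
        self.b = b

    def forward(self, x):
        return self.a * x + self.b

cal = LinBinCalibrator(a=2.5, b=-1.0)          # assumed scale/offset
print(cal(torch.tensor([0.10, 0.45, 0.80])))   # 2.5 * x - 1.0
```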
diff --git a/hyperion/torch/layers/dropout.py b/hyperion/torch/layers/dropout.py
index 6765baa5..22bff733 100644
--- a/hyperion/torch/layers/dropout.py
+++ b/hyperion/torch/layers/dropout.py
@@ -10,7 +10,21 @@
 
 class Dropout1d(Dropout2d):
+    """Dropout for tensors with a 1d spatial (time) dimension (3d tensors).
+
+    Attributes:
+      p: Drop probability.
+    """
+
     def forward(self, inputs):
+        """Applies dropout 1d.
+
+        Args:
+          inputs: Input tensor with shape = (batch, C, time).
+
+        Returns:
+          Tensor with shape = (batch, C, time).
+        """
         x = torch.unsqueeze(inputs, dim=-2)
         x = F.dropout2d(x, self.p, self.training, self.inplace)
         return torch.squeeze(x, dim=-2)
@@ -24,6 +38,15 @@ def __str__(self):
 
 class DropConnect2d(nn.Module):
+    """DropConnect for tensors with 2d spatial dimensions (4d tensors).
+    It drops the full feature map. It is used to create residual networks
+    with stochastic depth.
+
+    Attributes:
+      p: Probability of dropping the feature map.
+
+    """
+
     def __init__(self, p=0.2):
         super().__init__()
         self.p = p
@@ -36,6 +59,14 @@ def __str__(self):
         return s
 
     def forward(self, inputs):
+        """Applies drop-connect.
+
+        Args:
+          inputs: Input tensor with shape = (batch, C, H, W).
+
+        Returns:
+          Tensor with shape = (batch, C, H, W).
+        """
         if not self.training:
             return inputs
 
@@ -51,6 +82,15 @@ def forward(self, inputs):
 
 class DropConnect1d(nn.Module):
+    """DropConnect for tensors with 1d spatial dimensions (3d tensors).
+    It drops the full feature map. It is used to create residual networks
+    with stochastic depth.
+
+    Attributes:
+      p: Probability of dropping the feature map.
+
+    """
+
     def __init__(self, p=0.2):
         super().__init__()
         self.p = p
@@ -63,6 +103,14 @@ def __str__(self):
         return s
 
     def forward(self, inputs):
+        """Applies drop-connect.
+
+        Args:
+          inputs: Input tensor with shape = (batch, C, time).
+
+        Returns:
+          Tensor with shape = (batch, C, time).
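
Behaviorally, both DropConnect layers reduce to one Bernoulli draw per example that keeps or drops the whole residual branch (a sketch of the semantics, not the class code; `p=0.2` is the default above):

```python
import torch

def drop_connect(x, p=0.2, training=True):
    # drop the entire feature map of each example with probability p
    if not training:
        return x
    keep = 1.0 - p
    mask = (torch.rand(x.size(0), *([1] * (x.dim() - 1)), device=x.device) < keep)
    return x / keep * mask.to(x.dtype)  # rescale so E[output] == input
```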
+ """ if not self.training: return inputs diff --git a/hyperion/torch/layers/feat_fuser_factory.py b/hyperion/torch/layers/feat_fuser_factory.py new file mode 100644 index 00000000..edc4d933 --- /dev/null +++ b/hyperion/torch/layers/feat_fuser_factory.py @@ -0,0 +1,101 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from typing import Optional + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...utils.misc import filter_func_args +from .feat_fusers import ( + CatFeatFuser, + LastFeatFuser, + LinearFeatFuser, + WeightedAvgFeatFuser, +) + +LAST_FUSER = "last" +WAVG_FUSER = "weighted-avg" +LINEAR_FUSER = "linear" +CAT_FUSER = "cat" + +FUSER_TYPES = [LAST_FUSER, WAVG_FUSER, LINEAR_FUSER, CAT_FUSER] + + +class FeatFuserFactory: + """Factory class to create feature fusers for Wav2Vec style hidden features.""" + + @staticmethod + def create( + fuser_type: str = WAVG_FUSER, + num_feats: Optional[int] = None, + feat_dim: Optional[int] = None, + proj_dim: Optional[int] = None, + proj_bias: bool = True, + ): + if fuser_type == WAVG_FUSER: + return WeightedAvgFeatFuser( + num_feats, feat_dim=feat_dim, proj_dim=proj_dim, proj_bias=proj_bias + ) + elif fuser_type == LAST_FUSER: + return LastFeatFuser( + feat_dim=feat_dim, proj_dim=proj_dim, proj_bias=proj_bias + ) + elif fuser_type == LINEAR_FUSER: + return LinearFeatFuser( + num_feats, feat_dim=feat_dim, proj_dim=proj_dim, proj_bias=proj_bias + ) + elif fuser_type == CAT_FUSER: + return CatFeatFuser( + num_feats, feat_dim=feat_dim, proj_dim=proj_dim, proj_bias=proj_bias + ) + else: + raise ValueError(f"unknown feature fuser type {fuser_type}") + + @staticmethod + def filter_args(**kwargs): + """Filters arguments correspondin to Feature Fuser + from args dictionary + + Args: + kwargs: args dictionary + + Returns: + args dictionary + """ + args = filter_func_args(FeatFuserFactory.create, kwargs) + return args + + @staticmethod + def add_class_args(parser, prefix=None): + """Adds feature extractor options to parser. + + Args: + parser: Arguments parser + prefix: Options prefix. 
+ """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--fuser-type", + default=WAVG_FUSER, + choices=FUSER_TYPES, + help=f"One of {FUSER_TYPES}", + ) + parser.add_argument( + "--proj-dim", + default=None, + type=int, + help="project features after fusion to proj_dim", + ) + parser.add_argument( + "--proj-bias", + default=True, + action=ActionYesNo, + help="linear projection has bias", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/layers/feat_fusers.py b/hyperion/torch/layers/feat_fusers.py new file mode 100644 index 00000000..44c72ffb --- /dev/null +++ b/hyperion/torch/layers/feat_fusers.py @@ -0,0 +1,86 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import math + +import torch +import torch.nn as nn + + +class FeatFuser(nn.Module): + def __init__(self): + super().__init__() + + +class _ProjFeatFuser(FeatFuser): + def __init__(self, feat_dim=None, proj_dim=None, proj_bias=True): + super().__init__() + self.feat_dim = feat_dim + self.proj_dim = proj_dim + self.feat_proj = None + if feat_dim is not None and proj_dim is not None: + self.feat_proj = nn.Linear(feat_dim, proj_dim, bias=proj_bias) + + +class LastFeatFuser(_ProjFeatFuser): + def __init__(self, feat_dim=None, proj_dim=None, proj_bias=True): + super().__init__(feat_dim, proj_dim, proj_bias) + + def forward(self, feats): + feats = feats[-1] + if self.feat_proj is not None: + feats = self.feat_proj(feats) + + return feats + + +class WeightedAvgFeatFuser(_ProjFeatFuser): + def __init__(self, num_feats, feat_dim=None, proj_dim=None, proj_bias=True): + super().__init__(feat_dim, proj_dim, proj_bias) + self.num_feats = num_feats + self.feat_fuser = nn.Parameter(torch.zeros(num_feats)) + + def forward(self, feats): + feats = torch.stack(feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(feats * norm_weights, dim=-1) + if self.feat_proj is not None: + feats = self.feat_proj(feats) + + return feats + + +class LinearFeatFuser(_ProjFeatFuser): + def __init__(self, num_feats, feat_dim=None, proj_dim=None, proj_bias=True): + super().__init__(feat_dim, proj_dim, proj_bias) + self.num_feats = num_feats + self.feat_fuser = nn.Linear(num_feats, 1, bias=False) + self.feat_fuser.weight.data = torch.ones(1, num_feats) / num_feats + + def forward(self, feats): + feats = torch.stack(feats, dim=-1) + feats = self.feat_fuser(feats).squeeze(dim=-1) + if self.feat_proj is not None: + feats = self.feat_proj(feats) + + return feats + + +class CatFeatFuser(FeatFuser): + def __init__(self, num_feats, feat_dim, proj_dim=None, proj_bias=True): + super().__init__() + self.num_feats = num_feats + self.feat_dim = feat_dim + if proj_dim is None: + proj_dim = feat_dim + self.proj_dim = proj_dim + self.proj_bias = proj_bias + self.feat_fuser = nn.Linear(num_feats * feat_dim, proj_dim, bias=proj_bias) + + def forward(self, feats): + feats = torch.cat(feats, dim=-1) + feats = self.feat_fuser(feats) + return feats diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index 5a2e960c..f4174e3d 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -2,15 +2,19 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + 
import logging import math -import numpy as np +import numpy as np import torch import torch.nn as nn import torch.nn.functional as nnf +from ..utils import seq_lengths_to_mask + SQRT_EPS = 1e-5 +N_EPS = 1e-6 def _conv1(in_channels, out_channels, bias=False): @@ -19,19 +23,35 @@ def _conv1(in_channels, out_channels, bias=False): class _GlobalPool1d(nn.Module): + """Abstract base class Global pooling in 1d + + Attributes: + dim: Pooling dimension + keepdim: If True, it keeps the same number of dimensions after pooling + + """ + def __init__(self, dim=-1, keepdim=False): super().__init__() self.dim = dim self.keepdim = keepdim self.size_multiplier = 1 - def _standarize_weights(self, weights, ndims): + def _standardize_weights(self, x, x_lengths=None, weights=None): + """standardizes the weights to have the proper shape to be + multiplied by the input data. + """ + if weights is None: + time_dim = self.dim if self.dim >= 0 else x.dim() + self.dim + return seq_lengths_to_mask( + x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=time_dim + ) - if weights.dim() == ndims: + if weights.dim() == x.dim(): return weights assert weights.dim() == 2 - shape = ndims * [1] + shape = x.dim() * [1] shape[0] = weights.shape[0] shape[self.dim] = weights.shape[1] return weights.view(tuple(shape)) @@ -44,7 +64,6 @@ def forward_slidwin(self, x, win_length, win_shift): raise NotImplementedError() def _slidwin_pad(self, x, win_length, win_shift, snip_edges): - if snip_edges: num_frames = int( math.floor((x.size(-1) - win_length + win_shift) / win_shift) @@ -68,21 +87,30 @@ class GlobalAvgPool1d(_GlobalPool1d): """Global average pooling in 1d Attributes: - dim: pooling dimension - keepdim: it True keeps the same number of dimensions after pooling + dim: Pooling dimension + keepdim: if True, it keeps the same number of dimensions after pooling """ def __init__(self, dim=-1, keepdim=False): super().__init__(dim, keepdim) - def forward(self, x, weights=None): + def forward(self, x, x_lengths=None, weights=None): + """Applies pooling to the input. + + Args: + x: Input tensor. + x_lengths: Lengths of the input sequences in the pooling dimension. + x_lengths is only used if weights is not given. + weights: Weights for weighted pooling with shape=(batch, max_length) + or (batch,..., max_length,...) 
with shape matching the one + of the input tensor + """ + weights = self._standardize_weights(x, x_lengths, weights) if weights is None: y = torch.mean(x, dim=self.dim, keepdim=self.keepdim) return y - weights = self._standarize_weights(weights, x.dim()) - xbar = torch.mean(weights * x, dim=self.dim, keepdim=self.keepdim) wbar = torch.mean(weights, dim=self.dim, keepdim=self.keepdim) return xbar / wbar @@ -116,7 +144,6 @@ def _post_slidwin(self, m_x, x_shape): return m_x def _forward_slidwin_int(self, x, win_length, win_shift, snip_edges): - c_x, out_shape = self._pre_slidwin(x, win_length, win_shift, snip_edges) m_x = (c_x[:, win_shift:] - c_x[:, :-win_shift]) / win_length @@ -146,8 +173,8 @@ class GlobalMeanStdPool1d(_GlobalPool1d): """Global mean + standard deviation pooling in 1d Attributes: - dim: pooling dimension - keepdim: it True keeps the same number of dimensions after pooling + dim: Pooling dimension + keepdim: If True, it keeps the same number of dimensions after pooling """ @@ -155,7 +182,18 @@ def __init__(self, dim=-1, keepdim=False): super().__init__(dim, keepdim) self.size_multiplier = 2 - def forward(self, x, weights=None): + def forward(self, x, x_lengths=None, weights=None): + """Applies pooling to the input. + + Args: + x: Input tensor. + x_lengths: Lengths of the input sequences in the pooling dimension. + x_lengths is only used if weights is not given. + weights: Weights for weighted pooling with shape=(batch, max_length) + or (batch,..., max_length,...) with shape matching the one + of the input tensor + """ + weights = self._standardize_weights(x, x_lengths, weights) if weights is None: mu = torch.mean(x, dim=self.dim, keepdim=True) delta = x - mu @@ -164,7 +202,7 @@ def forward(self, x, weights=None): # this can produce slightly negative variance when relu6 saturates in all time steps # add 1e-5 for stability s = torch.sqrt( - torch.mean(delta ** 2, dim=self.dim, keepdim=False).clamp(min=SQRT_EPS) + torch.mean(delta**2, dim=self.dim, keepdim=False).clamp(min=SQRT_EPS) ) mus = torch.cat((mu, s), dim=1) @@ -173,12 +211,11 @@ def forward(self, x, weights=None): return mus - weights = self._standarize_weights(weights, x.dim()) xbar = torch.mean(weights * x, dim=self.dim, keepdim=True) wbar = torch.mean(weights, dim=self.dim, keepdim=True) mu = xbar / wbar delta = x - mu - var = torch.mean(weights * delta ** 2, dim=self.dim, keepdim=True) / wbar + var = torch.mean(weights * delta**2, dim=self.dim, keepdim=True) / wbar s = torch.sqrt(var.clamp(min=SQRT_EPS)) mu = mu.squeeze(self.dim) s = s.squeeze(self.dim) @@ -218,19 +255,18 @@ def _forward_slidwin_int(self, x, win_length, win_shift, snip_edges): c_x = torch.cumsum(x, dim=-1).view(-1, x.shape[-1]) m_x = (c_x[:, win_shift:] - c_x[:, :-win_shift]) / win_length - c_x = torch.cumsum(x ** 2, dim=-1).view(-1, x.shape[-1]) + c_x = torch.cumsum(x**2, dim=-1).view(-1, x.shape[-1]) m_x2 = (c_x[:, win_shift:] - c_x[:, :-win_shift]) / win_length - s_x = torch.sqrt(m_x2 - m_x ** 2).clamp(min=SQRT_EPS) + s_x = torch.sqrt(m_x2 - m_x**2).clamp(min=SQRT_EPS) mus = self._post_slidwin(m_x, s_x, out_shape) return mus def _forward_slidwin_float(self, x, win_length, win_shift, snip_edges): - x, out_shape = self._pre_slidwin(x, win_length, win_shift, snip_edges) num_frames = out_shape[-1] c_x = torch.cumsum(x, dim=-1).view(-1, x.shape[-1]) - c_x2 = torch.cumsum(x ** 2, dim=-1).view(-1, x.shape[-1]) + c_x2 = torch.cumsum(x**2, dim=-1).view(-1, x.shape[-1]) # xx = x.view(-1, x.shape[-1]) # print(xx.shape[1]) @@ -274,7 +310,7 @@ def 
_forward_slidwin_float(self, x, win_length, win_shift, snip_edges): k += win_shift - var_x = (m_x2 - m_x ** 2).clamp(min=SQRT_EPS) + var_x = (m_x2 - m_x**2).clamp(min=SQRT_EPS) s_x = torch.sqrt(var_x) # idx = torch.isnan(s_x) #.any(dim=1) # if torch.sum(idx) > 0: @@ -342,8 +378,8 @@ class GlobalMeanLogVarPool1d(_GlobalPool1d): """Global mean + log-variance pooling in 1d Attributes: - dim: pooling dimension - keepdim: it True keeps the same number of dimensions after pooling + dim: Pooling dimension + keepdim: If True, it keeps the same number of dimensions after pooling """ @@ -351,19 +387,28 @@ def __init__(self, dim=-1, keepdim=False): super().__init__(dim, keepdim) self.size_multiplier = 2 - def forward(self, x, weights=None): + def forward(self, x, x_lengths=None, weights=None): + """Applies pooling to the input. + + Args: + x: Input tensor. + x_lengths: Lengths of the input sequences in the pooling dimension. + x_lengths is only used if weights is not given. + weights: Weights for weighted pooling with shape=(batch, max_length) + or (batch,..., max_length,...) with shape matching the one + of the input tensor + """ + weights = self._standardize_weights(x, x_lengths, weights) if weights is None: mu = torch.mean(x, dim=self.dim, keepdim=self.keepdim) - x2bar = torch.mean(x ** 2, dim=self.dim, keepdim=self.keepdim) + x2bar = torch.mean(x**2, dim=self.dim, keepdim=self.keepdim) logvar = torch.log(x2bar - mu * mu + 1e-5) # for stability in case var=0 return torch.cat((mu, logvar), dim=-1) - weights = self._standarize_weights(weights, x.dim()) - xbar = torch.mean(weights * x, dim=self.dim, keepdim=self.keepdim) wbar = torch.mean(weights, dim=self.dim, keepdim=self.keepdim) mu = xbar / wbar - x2bar = torch.mean(weights * x ** 2, dim=self.dim, keepdim=self.keepdim) / wbar + x2bar = torch.mean(weights * x**2, dim=self.dim, keepdim=self.keepdim) / wbar var = (x2bar - mu * mu).clamp(min=1e-5) logvar = torch.log(var) @@ -371,15 +416,16 @@ def forward(self, x, weights=None): class LDEPool1d(_GlobalPool1d): - """Learnable dictionary encoder pooling in 1d + """Learnable dictionary encoder pooling in 1d. + It only works for 3d tensors. Attributes: - in_feats: input feature dimension - num_comp: number of cluster components - dist_pow: power for distance metric - use_bias: use bias parameter when computing posterior responsibility - dim: pooling dimension - keepdim: it True keeps the same number of dimensions after pooling + in_feats: Input feature dimension. + num_comp: Number of cluster components. + dist_pow: Power for distance metric. + use_bias: Use bias parameter when computing posterior responsibility. + dim: Pooling dimension. + keepdim: if True, it keeps the same number of dimensions after pooling. """ @@ -399,7 +445,7 @@ def __init__( if dist_pow == 1: self.dist_f = lambda x: torch.norm(x, p=2, dim=-1) else: - self.dist_f = lambda x: torch.sum(x ** 2, dim=-1) + self.dist_f = lambda x: torch.sum(x**2, dim=-1) self.size_multiplier = num_comp @@ -426,34 +472,58 @@ def __str__(self): ) return s - def forward(self, x, weights=None): + def _standardize_weights(self, x, x_lengths=None, weights=None): + """standardizes the weights to have shape (batch, max_length).""" + if weights is None: + return seq_lengths_to_mask( + x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=1 + ) + + if weights.dim() == x.dim(): + return weights.traspose(1, self.dim) + + assert weights.dim() == 2 + return weights + + def forward(self, x, x_lengths=None, weights=None): + """Applies pooling to the input. 
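The `x_lengths` path added to the pooling layers is equivalent to passing an explicit binary mask as `weights`: when `weights` is `None`, `_standardize_weights` builds the mask from the lengths with `seq_lengths_to_mask`, and the weighted statistics are computed as `mean(w*x)/mean(w)`. A small sketch of that equivalence (assuming `seq_lengths_to_mask` yields the standard 0/1 padding mask):

```python
import torch
from hyperion.torch.layers.global_pool import GlobalMeanStdPool1d

batch, feat_dim, max_len = 4, 80, 200
x = torch.randn(batch, feat_dim, max_len)      # time is the pooling dim (-1)
x_lengths = torch.tensor([200, 150, 120, 90])

pool = GlobalMeanStdPool1d(dim=-1)
mus1 = pool(x, x_lengths=x_lengths)            # (batch, 2 * feat_dim)

# the same thing with an explicit (batch, max_len) binary mask
weights = (torch.arange(max_len)[None, :] < x_lengths[:, None]).float()
mus2 = pool(x, weights=weights)

torch.testing.assert_close(mus1, mus2)
```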
+ + Args: + x: Input tensor of shape=(batch, time, feat_dim) or (batch, feat_dim, time). + x_lengths: Lengths of the input sequences in the pooling dimension. + x_lengths is only used if weights is not given. + weights: Weights for weighted pooling with shape=(batch, max_length) + or (batch,..., max_length,...) with shape matching the one + of the input tensor. + """ + weights = self._standardize_weights(x, x_lengths, weights) if self.dim != 1 or self.dim != -2: - x = x.transpose(1, self.dim) + x = x.transpose(1, self.dim) # (batch, time, feat_dim) - x = torch.unsqueeze(x, dim=2) - delta = x - self.mu - dist = self.dist_f(delta) + x = torch.unsqueeze(x, dim=2) # (batch, time, 1, feat_dim) + delta = x - self.mu # (batch, time, num_comp, feat_dim) + dist = self.dist_f(delta) # (batch, time, num_comp) - llk = -self.prec ** 2 * dist + self.bias - r = nnf.softmax(llk, dim=-1) + llk = -self.prec**2 * dist + self.bias + r = nnf.softmax(llk, dim=-1) # (batch, time, num_comp) if weights is not None: r *= weights - r = torch.unsqueeze(r, dim=-1) - N = torch.sum(r, dim=1) + 1e-9 - F = torch.sum(r * delta, dim=1) - pool = F / N + r = torch.unsqueeze(r, dim=-1) # (batch, time, num_comp, 1) + N = torch.sum(r, dim=1) + N_EPS # (batch, num_comp, 1) + F = torch.sum(r * delta, dim=1) # (batch, num_comp, feat_dim) + pool = F / N # (batch, num_comp, feat_dim) pool = pool.contiguous().view(-1, self.num_comp * self.in_feats) + # (batch, num_comp * feat_dim) if self.keepdim: if self.dim == 1 or self.dim == -2: - pool.unsqueeze_(1) + pool = pool.unsqueeze(1) else: - pool.unsqueeze_(-1) + pool = pool.unsqueeze(-1) return pool def get_config(self): - config = { "in_feats": self.in_feats, "num_comp": self.num_comp, @@ -466,6 +536,23 @@ def get_config(self): class ScaledDotProdAttV1Pool1d(_GlobalPool1d): + """Scaled dot product attention pooling in 1d. + The attention weights are obtained by scaled inner product + between the feature frames and learned parameters contained + inside the layer. + This class only works on 3d tensors. + + Attributes: + in_feats: Input feature dimension. + num_heads: Number of attention heads. + d_k: Dimension of the keys. + d_v: Dimension of the values + bin_attn: It True, use binary attention. Attention values are obtained by applying sigmoid to + the dot products instead of softmax. + dim: Pooling dimension. + keepdim: if True, it keeps the same number of dimensions after pooling. + """ + def __init__( self, in_feats, num_heads, d_k, d_v, bin_attn=False, dim=-1, keepdim=False ): @@ -505,9 +592,34 @@ def __str__(self): ) return s - def forward(self, x, weights=None): + def _standardize_weights(self, x, x_lengths=None, weights=None): + """standardizes the weights to have shape (batch, max_length).""" + if weights is None: + return seq_lengths_to_mask( + x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=2 + ) + + if weights.dim() == x.dim(): + return weights.traspose(1, self.dim) + + assert weights.dim() == 2 + return weights + + def forward(self, x, x_lengths=None, weights=None): + """Applies pooling to the input. + + Args: + x: Input tensor of shape=(batch, time, feat_dim) or (batch, feat_dim, time). + x_lengths: Lengths of the input sequences in the pooling dimension. + x_lengths is only used if weights is not given. + weights: Weights for weighted pooling with shape=(batch, max_length) + or (batch,..., max_length,...) with shape matching the one + of the input tensor. In this implementation only binary weights + are allowed. 
+ """ + weights = self._standardize_weights(x, x_lengths, weights) batch_size = x.size(0) - if self.dim != 1: + if self.dim == 2 or self.dim == -1: x = x.transpose(1, self.dim) k = self.linear_k(x).view(batch_size, -1, self.num_heads, self.d_k) @@ -519,16 +631,20 @@ def forward(self, x, weights=None): self.d_k ) # (batch, head, 1, time) if self.bin_attn: + # use binary attention. scores = torch.sigmoid(scores + self.bias) # scores = scores.squeeze(dim=-1) # (batch, head, time) if weights is not None: - mask = weights.view(batch_size, 1, 1, -1).eq(0) # (batch, 1, 1,time) + mask = weights.view(batch_size, 1, 1, -1).eq(0) # (batch, 1, 1, time) if self.bin_attn: scores = scores.masked_fill(mask, 0.0) self.attn = scores / (torch.sum(scores, dim=-1, keepdim=True) + 1e-9) else: - min_value = -1e200 + if scores.dtype == torch.half: + min_value = -65504 + else: + min_value = -1e200 scores = scores.masked_fill(mask, min_value) self.attn = torch.softmax(scores, dim=-1).masked_fill( mask, 0.0 @@ -541,7 +657,14 @@ def forward(self, x, weights=None): x = torch.matmul(self.attn, v) # (batch, head, 1, d_v) if self.keepdim: - x = x.view(batch_size, 1, self.num_heads * self.d_v) # (batch, 1, d_model) + if self.dim == 1 or self.dim == -2: + x = x.view( + batch_size, 1, self.num_heads * self.d_v + ) # (batch, 1, d_model) + else: + x = x.view( + batch_size, 1, self.num_heads * self.d_v + ) # (batch, d_model, 1) else: x = x.view(batch_size, self.num_heads * self.d_v) # (batch, d_model) return x @@ -560,7 +683,20 @@ def get_config(self): class GlobalChWiseAttMeanStdPool1d(_GlobalPool1d): - """Attentive mean + stddev pooling for each channel""" + """Attentive mean + stddev pooling for each channel. + This class only works on 3d tensors. + + Attributes: + in_feats: Input feature dimension. + inner_feats: Feature dimension in the hidden layer of the content based attention. + bin_attn: If True, use binary attention. Attention values are obtained by applying sigmoid to + the dot products instead of softmax. + use_global_context: If True, concat global stats pooling to the input features to + compute the attention. + norm_layer: Normalization layer object, if None, it used BatchNorm1d. + dim: Pooling dimension. + keepdim: it True, it keeps the same number of dimensions after pooling. + """ def __init__( self, @@ -588,9 +724,9 @@ def __init__( self.norm_layer = norm_layer(inner_feats) self.activation = nn.Tanh() self.conv2 = _conv1(inner_feats, in_feats, bias=True) - self.stats_pool = GlobalMeanStdPool1d(dim=dim) + self.stats_pool = GlobalMeanStdPool1d(dim=-1) if self.bin_attn: - self.bias = nn.Parameter(torch.ones((1, in_feats, 1))) + self.bias = nn.Parameter(torch.zeros((1, in_feats, 1))) def __repr__(self): return self.__str__() @@ -607,23 +743,95 @@ def __str__(self): ) return s - def forward(self, x, weights=None): + def _standardize_weights(self, x, x_lengths=None, weights=None): + """standardizes the weights to have the proper shape to be + multiplied by the input data. + """ + if weights is None: + return seq_lengths_to_mask( + x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=-1 + ) + + if weights.dim() == x.dim(): + return weights.transpose(self.dim, -1) + + assert weights.dim() == 2 + shape = x.dim() * [1] + shape[0] = weights.shape[0] + shape[-1] = weights.shape[1] + return weights.view(tuple(shape)) + + def forward(self, x, x_lengths=None, weights=None): + """Applies pooling to the input. + + Args: + x: Input tensor of shape=(batch, time, feat_dim) or (batch, feat_dim, time). 
+ x_lengths: Lengths of the input sequences in the pooling dimension. + x_lengths is only used if weights is not given. + weights: Weights for weighted pooling with shape=(batch, max_length) + or (batch,..., max_length,...) with shape matching the one + of the input tensor. + """ + assert x.dim() == 3, "Input should be a 3d tensor" + if self.dim == 1 or self.dim == -2: + x = x.transpose(1, self.dim) + + # x = (batch, feat_dim, time) + weights = self._standardize_weights(x, x_lengths, weights) # (batch, 1, time) + x_inner = self.conv1(x) # (batch, inner_dim, time) + # assert not torch.any( + # torch.isnan(x_inner) + # ), f"xinner is nan {torch.sum(torch.isnan(x_inner))} {torch.sum(torch.isnan(x))} {torch.mean(x)} {torch.sum(torch.isinf(x))} {x.size()}" + # assert not torch.any( + # torch.isinf(x_inner) + # ), f"xinner is inf {torch.sum(torch.isinf(x_inner))} {torch.sum(torch.isinf(x))}" - x_inner = self.conv1(x) - # logging.info('x_inner1={} {}'.format(torch.sum(torch.isnan(x_inner)), torch.sum(torch.isinf(x_inner)))) if self.use_global_context: - global_mus = self.stats_pool(x) + global_mus = self.stats_pool(x, weights=weights) x_inner = x_inner + self.lin_global(global_mus).unsqueeze(-1) - # logging.info('x_inner2={} {}'.format(torch.sum(torch.isnan(x_inner)), torch.sum(torch.isinf(x_inner)))) - attn = self.conv2(self.activation(self.norm_layer(x_inner))) + # assert not torch.any( + # torch.isnan(x_inner) + # ), f"xinner is nan {torch.sum(torch.isnan(x_inner))} {torch.sum(torch.isnan(global_mus))}" + # assert not torch.any( + # torch.isinf(x_inner) + # ), f"xinner is inf {torch.sum(torch.isinf(x_inner))} {torch.sum(torch.isinf(global_mus))}" + + attn = self.conv2( + self.activation(self.norm_layer(x_inner)) + ) # (batch, feat_dim, time) if self.bin_attn: - # attn = torch.sigmoid(attn+self.bias) - attn = torch.sigmoid(attn) + attn = torch.sigmoid(attn + self.bias).clamp(min=N_EPS) else: + if weights is not None: + if attn.dtype == torch.half: + min_value = -65504 + else: + min_value = -1e20 + mask = weights.eq(0) + attn = attn.masked_fill(mask, min_value) + attn = nnf.softmax(attn, dim=-1) + if weights is not None: + attn = attn * weights + + # assert not torch.any( + # torch.isnan(attn) + # ), f"attn is nan {torch.sum(torch.isnan(attn))}" + # assert not torch.any( + # torch.isinf(attn) + # ), f"attn is inf {torch.sum(torch.isinf(attn))}" mus = self.stats_pool(x, weights=attn) - # logging.info('mus={} {}'.format(torch.sum(torch.isnan(mus)), torch.sum(torch.isinf(mus)))) + + if self.keepdim: + mus = mus.unsqueeze(self.dim) + + # assert not torch.any( + # torch.isnan(mus) + # ), f"mus is nan {torch.sum(torch.isnan(mus))}" + # assert not torch.any( + # torch.isinf(mus) + # ), f"mus is inf {torch.sum(torch.isinf(mus))}" return mus def get_config(self): diff --git a/hyperion/torch/layers/interpolate.py b/hyperion/torch/layers/interpolate.py index fa76fd2a..335433fe 100644 --- a/hyperion/torch/layers/interpolate.py +++ b/hyperion/torch/layers/interpolate.py @@ -9,6 +9,14 @@ class Interpolate(nn.Module): + """Interpolation class. + + Attributes: + scale_factor: Upsampling scale factor. + mode: Algorithm used for upsampling: + 'nearest' | 'linear' | 'bilinear' | 'bicubic' | 'trilinear' | 'area'. + """ + def __init__(self, scale_factor, mode="nearest"): super().__init__() self.interp = nnf.interpolate @@ -24,5 +32,13 @@ def __repr__(self): return s def forward(self, x): + """Interpolates the input. + + Args: + x: Input tensor. + + Returns: + Interpolated tensor. 
+ """ x = self.interp(x, scale_factor=self.scale_factor, mode=self.mode) return x diff --git a/hyperion/torch/layers/lora.py b/hyperion/torch/layers/lora.py new file mode 100644 index 00000000..18401669 --- /dev/null +++ b/hyperion/torch/layers/lora.py @@ -0,0 +1,120 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from typing import Union + +import loralib as lora +import torch.nn as nn +from loralib import mark_only_lora_as_trainable + + +def repr_lora(self, str_base): + if isinstance(self.lora_dropout, nn.Dropout): + lora_dropout = self.lora_dropout.p + else: + lora_dropout = 0 + + str_lora = f", r={self.r}, alpha={self.lora_alpha}, dropout={lora_dropout}, merge_weights={self.merge_weights})" + return str_base[:-1] + str_lora + + +class LinearLoRA(lora.Linear): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class EmbeddingLoRA(lora.Embedding): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class Conv1dLoRA(lora.Conv1d): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class Conv2dLoRA(lora.Conv2d): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class Conv3dLoRA(lora.Conv3d): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class LoRAFactory: + def create_from_pretrained( + layer: Union[nn.Embedding, nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d], + r: int = 8, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + merge_weights: bool = True, + ): + if isinstance(layer, nn.Embedding): + lora_layer = EmbeddingLoRA( + layer.num_embeddings, + layer.embedding_dim, + padding_idx=layer.padding_idx, + max_norm=layer.max_norm, + norm_type=layer.norm_type, + scale_grad_by_freq=layer.scale_grad_by_freq, + sparse=layer.sparse, + r=r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=merge_weights, + ) + lora_layer.weight.data = layer.weight.data + + elif isinstance(layer, nn.Linear): + bias = layer.bias is not None + lora_layer = LinearLoRA( + layer.in_features, + layer.out_features, + bias=bias, + r=r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=merge_weights, + ) + lora_layer.weight.data = layer.weight.data + if bias: + lora_layer.bias.data = layer.bias.data + + elif isinstance(layer, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): + if isinstance(layer, nn.Conv1d): + lora_class = Conv1dLoRA + elif isinstance(layer, nn.Conv2d): + lora_class = Conv2dLoRA + elif isinstance(layer, nn.Conv3d): + lora_class = Conv3dLoRA + + bias = layer.bias is not None + lora_layer = lora_class( + layer.in_channels, + layer.out_channels, + layer.kernel_size, + stride=layer.stride, + padding=layer.padding, + dilation=layer.dilation, + groups=layer.groups, + bias=bias, + padding_mode=layer.padding_mode, + r=r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=merge_weights, + ) + lora_layer.weight.data = layer.weight.data + if bias: + lora_layer.bias.data = layer.bias.data + + return lora_layer diff --git a/hyperion/torch/layers/margin_losses.py b/hyperion/torch/layers/margin_losses.py index 36fd2a5f..3f991567 100644 --- a/hyperion/torch/layers/margin_losses.py +++ b/hyperion/torch/layers/margin_losses.py @@ -3,13 +3,13 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys import logging import math +import sys import torch -import 
torch.nn as nn import torch.cuda.amp as amp +import torch.nn as nn def _l2_norm(x, axis=-1): @@ -19,9 +19,37 @@ def _l2_norm(x, axis=-1): return y +def _cosine_affinity(kernel): + kernel_norm = _l2_norm(kernel, axis=0) + return torch.mm(kernel_norm.transpose(0, 1), kernel_norm) + + class ArcLossOutput(nn.Module): + """Additive angular margin softmax (ArcFace) output layer. + + It includes the option to also use InterTopK penalty: + https://arxiv.org/abs/2109.01989 + + Attributes: + in_feats: input feature dimension. + num_classes: number of output classes. + cos_scale: cosine scale. + margin: angular margin. + margin_warmup_epochs: number of epochs to warm up the margin from 0 to + its final value. + intertop_k: adds negative angular penalty to k largest negative scores. + intertop_margin: inter-top-k penalty. + """ + def __init__( - self, in_feats, num_classes, cos_scale=64, margin=0.3, margin_warmup_epochs=0 + self, + in_feats, + num_classes, + cos_scale=64, + margin=0.3, + margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0, ): super().__init__() self.in_feats = in_feats @@ -29,50 +57,83 @@ def __init__( self.cos_scale = cos_scale self.margin = margin self.margin_warmup_epochs = margin_warmup_epochs + self.intertop_k = intertop_k + self.intertop_margin = intertop_margin if margin_warmup_epochs == 0: self.cur_margin = margin + self.cur_intertop_margin = intertop_margin else: self.cur_margin = 0 + self.cur_intertop_margin = 0 self._compute_aux() + # each column is the prototype vector of a class self.kernel = nn.Parameter(torch.Tensor(in_feats, num_classes)) + # we normalize prototypes to have l2 norm = 1 self.kernel.data.uniform_(-1, 1).renorm_(2, 1, 1e-5).mul_(1e5) def __repr__(self): return self.__str__() def __str__(self): - s = "%s(in_feats=%d, num_classes=%d, cos_scale=%.2f, margin=%.2f, margin_warmup_epochs=%d)" % ( - self.__class__.__name__, - self.in_feats, - self.num_classes, - self.cos_scale, - self.margin, - self.margin_warmup_epochs, + s = ( + "%s(in_feats=%d, num_classes=%d, cos_scale=%.2f, margin=%.2f, margin_warmup_epochs=%d, intertop_k=%d, intertop_margin=%f)" + % ( + self.__class__.__name__, + self.in_feats, + self.num_classes, + self.cos_scale, + self.margin, + self.margin_warmup_epochs, + self.intertop_k, + self.intertop_margin, + ) ) return s def _compute_aux(self): - logging.info("updating arc-softmax margin=%.2f" % (self.cur_margin)) + logging.info( + "updating arc-softmax margin=%.2f intertop-margin=%.2f", + self.cur_margin, + self.cur_intertop_margin, + ) self.cos_m = math.cos(self.cur_margin) self.sin_m = math.sin(self.cur_margin) + self.intertop_cos_m = math.cos(self.cur_intertop_margin) + self.intertop_sin_m = math.sin(self.cur_intertop_margin) def update_margin(self, epoch): - if self.margin_warmup_epochs == 0: - return + """Updates the value of the margin. + Args: + epoch: value of current epoch. + """ if epoch < self.margin_warmup_epochs: self.cur_margin = self.margin * epoch / self.margin_warmup_epochs + self.cur_intertop_margin = ( + self.intertop_margin * epoch / self.margin_warmup_epochs + ) else: if self.cur_margin != self.margin: self.cur_margin = self.margin + self.cur_intertop_margin = self.intertop_margin else: return self._compute_aux() def forward(self, x, y=None): + """Computes penalized logits. + + Args: + x: input feature tensor with shape = (batch, in_feats). + y: ground truth classes. This is required to penalize the logit of + the true class at training time. 
+ + Returns: + Logit tensor with shape = (batch, num_classes) + """ with amp.autocast(enabled=False): s = self.cos_scale batch_size = len(x) @@ -92,14 +153,57 @@ def forward(self, x, y=None): idx_ = torch.arange(0, batch_size, dtype=torch.long) output[idx_, y] = cos_theta_m[idx_, y] + if self.cur_intertop_margin > 0: + # implementation of intertop-K + # set positive scores to -inf so they don't appear in the top k + cos_aux = cos_theta * 1 + cos_aux[idx_, y] = -1e10 + # find topk indices for negative samples + topk = torch.topk(cos_aux, k=self.intertop_k, dim=-1, sorted=False) + idx_ = ( + idx_.unsqueeze(-1).expand(batch_size, self.intertop_k).flatten() + ) + topk_idx = topk.indices.flatten() + # compute cos(theta-m') + cos_theta_m = ( + cos_theta[idx_, topk_idx] * self.intertop_cos_m + + sin_theta[idx_, topk_idx] * self.intertop_sin_m + ) + # take the maximum for the cases where m' is larger than theta to get cos(max(0, theta-m')) + output[idx_, topk_idx] = torch.maximum( + output[idx_, topk_idx], cos_theta_m + ) output *= s # scale up in order to make softmax work return output + def compute_prototype_affinity(self): + return _cosine_affinity(self.kernel) + class CosLossOutput(nn.Module): + """Additive margin softmax (CosFace) output layer. + + Attributes: + in_feats: input feature dimension. + num_classes: number of output classes. + cos_scale: cosine scale. + margin: angular margin. + margin_warmup_epochs: number of epochs to warm up the margin from 0 to + its final value. + intertop_k: adds negative angular penalty to k largest negative scores. + intertop_margin: inter-top-k penalty. + """ + def __init__( - self, in_feats, num_classes, cos_scale=64, margin=0.3, margin_warmup_epochs=0 + self, + in_feats, + num_classes, + cos_scale=64, + margin=0.3, + margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, ): super().__init__() self.in_feats = in_feats @@ -107,29 +211,79 @@ def __init__( self.cos_scale = cos_scale self.margin = margin self.margin_warmup_epochs = margin_warmup_epochs + self.intertop_k = intertop_k + self.intertop_margin = intertop_margin if margin_warmup_epochs == 0: self.cur_margin = margin + self.cur_intertop_margin = intertop_margin else: self.cur_margin = 0 + self.cur_intertop_margin = 0 self.kernel = nn.Parameter(torch.Tensor(in_feats, num_classes)) self.kernel.data.uniform_(-1, 1).renorm_(2, 1, 1e-5).mul_(1e5) + def __repr__(self): + return self.__str__() + + def __str__(self): + s = ( + "%s(in_feats=%d, num_classes=%d, cos_scale=%.2f, margin=%.2f, margin_warmup_epochs=%d, intertop_k=%d, intertop_margin=%f)" + % ( + self.__class__.__name__, + self.in_feats, + self.num_classes, + self.cos_scale, + self.margin, + self.margin_warmup_epochs, + self.intertop_k, + self.intertop_margin, + ) + ) + return s + def update_margin(self, epoch): - if self.margin_warmup_epochs == 0: - return + """Updates the value of the margin. + + Args: + epoch: value of current epoch. 
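A scalar sketch of the two penalties applied in `forward`, for a single sample whose true class is 0 (illustration only; the values are made up). The target logit becomes cos(theta + m) via the identity cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m), and the inter-top-k branch pushes the k hardest negatives up to cos(max(0, theta - m')):

```python
import math
import torch

cos_theta = torch.tensor([0.6, 0.5, 0.4, 0.3])   # class 0 is the target
sin_theta = torch.sqrt(1 - cos_theta**2)
m, m_neg, k, s = 0.3, 0.1, 2, 64.0

out = cos_theta.clone()
# additive angular margin on the target: cos(theta + m)
out[0] = cos_theta[0] * math.cos(m) - sin_theta[0] * math.sin(m)

# inter-top-k: exclude the target, find the k largest negative scores
neg = cos_theta.clone()
neg[0] = -1e10
topk_idx = torch.topk(neg, k=k).indices
# cos(theta - m') for those negatives, lower-bounded by the original score
pen = cos_theta[topk_idx] * math.cos(m_neg) + sin_theta[topk_idx] * math.sin(m_neg)
out[topk_idx] = torch.maximum(out[topk_idx], pen)

logits = s * out   # scaled logits fed to cross-entropy
```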
+ """ + # if self.margin_warmup_epochs == 0: + # return if epoch < self.margin_warmup_epochs: self.cur_margin = self.margin * epoch / self.margin_warmup_epochs - logging.info("updating cos-softmax margin=%.2f" % (self.cur_margin)) + logging.info( + "updating cos-softmax margin=%.2f intertop-margin=%.2f", + self.cur_margin, + self.cur_intertop_margin, + ) + self.cur_intertop_margin = ( + self.intertop_margin * epoch / self.margin_warmup_epochs + ) else: if self.cur_margin != self.margin: self.cur_margin = self.margin - logging.info("updating cos-softmax margin=%.2f" % (self.cur_margin)) + self.cur_intertop_margin = self.intertop_margin + logging.info( + "updating cos-softmax margin=%.2f intertop-margin=%.2f", + self.cur_margin, + self.cur_intertop_margin, + ) else: return def forward(self, x, y=None): + """Computes penalized logits. + + Args: + x: input feature tensor with shape = (batch, in_feats). + y: ground truth classes. This is required to penalize the logit of + the true class at training time. + + Returns: + Logit tensor with shape = (batch, num_classes) + """ with amp.autocast(enabled=False): s = self.cos_scale x = _l2_norm(x.float()) @@ -146,12 +300,44 @@ def forward(self, x, y=None): cos_theta_m = cos_theta - self.cur_margin idx_ = torch.arange(0, batch_size, dtype=torch.long) output[idx_, y] = cos_theta_m[idx_, y] + if self.cur_intertop_margin > 0: + # implementation of intertop-K + # set positive scores to -inf so they don't appear in the top k + cos_aux = cos_theta * 1 + cos_aux[idx_, y] = -1e10 + # find topk indices for negative samples + topk = torch.topk(cos_aux, k=self.intertop_k, dim=-1, sorted=False) + idx_ = ( + idx_.unsqueeze(-1).expand(batch_size, self.intertop_k).flatten() + ) + topk_idx = topk.indices.flatten() + # compute cos(theta) + m' + cos_theta_m = cos_theta[idx_, topk_idx] + self.cur_intertop_margin + # clamp so cos cannt be larger than 1. + output[idx_, topk_idx] = cos_theta_m.clamp(max=1.0) output *= s # scale up in order to make softmax work return output + def compute_prototype_affinity(self): + return _cosine_affinity(self.kernel) + class SubCenterArcLossOutput(ArcLossOutput): + """Sub-Center Additive angular margin softmax (ArcFace) output layer. + + Attributes: + in_feats: input feature dimension. + num_classes: number of output classes. + num_subcenters: number of subcenters. + cos_scale: cosine scale. + margin: angular margin. + margin_warmup_epochs: number of epochs to warm up the margin from 0 to + its final value. + intertop_k: adds negative angular penalty to k largest negative scores. + intertop_margin: inter-top-k penalty. + """ + def __init__( self, in_feats, @@ -160,6 +346,8 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, ): super().__init__( in_feats, @@ -167,23 +355,53 @@ def __init__( cos_scale, margin, margin_warmup_epochs, + intertop_k, + intertop_margin, ) self.num_classes = num_classes self.num_subcenters = num_subcenters + # this variable counts which subcenter is used more time during training + # Therefore, which subscenter correspond to the clean label. 
+ self.register_buffer( + "subcenter_counts", torch.zeros(num_classes, num_subcenters) + ) def __str__(self): - s = "%s(in_feats=%d, num_classes=%d, num_subcenters=%d, cos_scale=%.2f, margin=%.2f, margin_warmup_epochs=%d)" % ( - self.__class__.__name__, - self.in_feats, - self.num_classes, - self.num_subcenters, - self.cos_scale, - self.margin, - self.margin_warmup_epochs, + s = ( + "%s(in_feats=%d, num_classes=%d, num_subcenters=%d, cos_scale=%.2f, margin=%.2f, margin_warmup_epochs=%d, intertop_k=%d, intertop_margin=%f)" + % ( + self.__class__.__name__, + self.in_feats, + self.num_classes, + self.num_subcenters, + self.cos_scale, + self.margin, + self.margin_warmup_epochs, + self.intertop_k, + self.intertop_margin, + ) ) return s + def _update_counts(self, y, proto_idx): + idx1 = torch.arange(y.size(0)) + proto_idx = proto_idx[idx1, y] + self.subcenter_counts[y, proto_idx] += 1 + # we make counts relative to avoid risk of overflowing the integers + min_counts, _ = torch.min(self.subcenter_counts, dim=1, keepdim=True) + self.subcenter_counts -= min_counts + def forward(self, x, y=None): + """Computes penalized logits. + + Args: + x: Input feature tensor with shape = (batch, in_feats). + y: Ground truth classes. This is required to penalize the logit of + the true class at training time. + + Returns: + Logit tensor with shape = (batch, num_classes) + """ with amp.autocast(enabled=False): s = self.cos_scale batch_size = len(x) @@ -191,17 +409,17 @@ def forward(self, x, y=None): kernel_norm = _l2_norm(self.kernel, axis=0) # cos(theta+m) cos_theta = torch.mm(x, kernel_norm).float() - cos_theta = torch.max( + cos_theta, proto_idx = torch.max( cos_theta.view(-1, self.num_classes, self.num_subcenters), dim=-1 - )[0] - + ) cos_theta = cos_theta.clamp(-1, 1) # for numerical stability - # print(cos_theta) + output = ( cos_theta * 1.0 ) # a little bit hacky way to prevent in_place operation on cos_theta if y is not None and self.training: + self._update_counts(y, proto_idx) cos_theta_2 = torch.pow(cos_theta, 2) sin_theta_2 = (1 + 1e-10) - cos_theta_2 sin_theta = torch.sqrt(sin_theta_2) @@ -209,6 +427,68 @@ def forward(self, x, y=None): idx_ = torch.arange(0, batch_size, dtype=torch.long) output[idx_, y] = cos_theta_m[idx_, y] + if self.cur_intertop_margin > 0: + # implementation of intertop-K + # set positive scores to -inf so they don't appear in the top k + cos_aux = cos_theta * 1 + cos_aux[idx_, y] = -1e10 + # find topk indices for negative samples + topk = torch.topk(cos_aux, k=self.intertop_k, dim=-1, sorted=False) + idx_ = ( + idx_.unsqueeze(-1).expand(batch_size, self.intertop_k).flatten() + ) + topk_idx = topk.indices.flatten() + # compute cos(theta-m') + cos_theta_m = ( + cos_theta[idx_, topk_idx] * self.intertop_cos_m + + sin_theta[idx_, topk_idx] * self.intertop_sin_m + ) + # take the maximum for the cases where m' is larger than theta to get cos(max(0, theta-m')) + output[idx_, topk_idx] = torch.maximum( + output[idx_, topk_idx], cos_theta_m + ) output *= s # scale up in order to make softmax work return output + + def get_main_prototype_kernel(self): + _, idx2 = torch.max( + self.subcenter_counts, dim=-1 + ) # get indices for the main prototype + idx1 = torch.arange(self.num_classes) + kernel = self.kernel.view(-1, self.num_classes, self.num_subcenters)[ + :, idx1, idx2 + ] + return kernel + + def compute_prototype_affinity(self): + kernel = self.get_main_prototype_kernel() + return _cosine_affinity(kernel) + + def to_arc_loss(self): + loss = ArcLossOutput( + in_feats=self.in_feats, + 
num_classes=self.num_classes, + cos_scale=self.cos_scale, + margin=self.margin, + margin_warmup_epochs=self.margin_warmup_epochs, + intertop_k=self.intertop_k, + intertop_margin=self.intertop_margin, + ) + kernel = self.get_main_prototype_kernel() + loss.kernel.data = kernel + return loss + + def to_cos_loss(self): + loss = CosLossOutput( + in_feats=self.in_feats, + num_classes=self.num_classes, + cos_scale=self.cos_scale, + margin=self.margin, + margin_warmup_epochs=self.margin_warmup_epochs, + intertop_k=self.intertop_k, + intertop_margin=self.intertop_margin, + ) + kernel = self.get_main_prototype_kernel() + loss.kernel.data = kernel + return loss diff --git a/hyperion/torch/layers/mvn.py b/hyperion/torch/layers/mvn.py index 3ee1e121..5a92e89a 100644 --- a/hyperion/torch/layers/mvn.py +++ b/hyperion/torch/layers/mvn.py @@ -2,18 +2,33 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ..utils import seq_lengths_to_mask + +SQRT_EPS = 1e-5 class MeanVarianceNorm(nn.Module): + """Class to apply short-time mean-variance normalization to features. + + Attributes: + norm_mean: if True, it normalizes the mean. + norm_var: if True, is also normalized the variance. + left_context: left context for the window that computes the normalization stats. + right_context: right context for the window that computes the normalization stats. + dim: normalization dimension (time dimension). + + If left_context = right_context = 0, it computes the stats on the whole utterance. + """ + def __init__( self, norm_mean=True, norm_var=False, left_context=0, right_context=0, dim=1 ): - - super(MeanVarianceNorm, self).__init__() + super().__init__() self.norm_mean = norm_mean self.norm_var = norm_var self.left_context = left_context @@ -34,53 +49,127 @@ def __str__(self): ) return s - def forward(self, x): + def forward(self, x, x_lengths=None, x_mask=None): + """Short-time mean-var normalizes feature tensor. + + Args: + x: feature tensor. + x_lengths: lengths of x sequences + x_mask: mask of valid frames, if present, x_lengths is ignored. + + Returns: + Normalized feature tensor. + """ + if not self.norm_mean and not self.norm_var: + return x + + if self.dim != 1: + x = x.transpose(x, 1, self.dim) + + max_length = x.size(1) + if x_lengths is not None and x_mask is None: + x_mask = seq_lengths_to_mask( + x_lengths, + max_length, + dtype=x.dtype, + ndim=x.dim(), + none_if_all_max=True, + ) - T = x.shape[self.dim] if (self.left_context == 0 and self.right_context == 0) or ( - T <= self.left_context + self.right_context + 1 + max_length <= self.left_context + self.right_context + 1 ): - return self.normalize_global(x) + x = self.normalize_global(x, x_mask) + else: + x = self.normalize_cumsum(x, x_mask) - return self.normalize_cumsum(x) + if self.dim != 1: + x = x.transpose(x, 1, self.dim).contiguous() + + return x - def normalize_global(self, x): + def _normalize_global_nomask(self, x): + """Applies global mean-var normalization.""" # Global mean/var norm. 
+ if self.norm_mean: - m_x = torch.mean(x, dim=self.dim, keepdim=True) + m_x = torch.mean(x, dim=1, keepdim=True) x = x - m_x if self.norm_var: - s_x = torch.std(x, dim=self.dim, keepdim=True).clamp(min=1e-5) + s_x = torch.std(x, dim=1, keepdim=True).clamp(min=1e-5) x = x / s_x return x - def normalize_cumsum(self, x): - + def _normalize_global_mask(self, x, x_mask): + """Applies global mean-var normalization with masking.""" + # Global mean/var norm. + den = torch.mean(x_mask, dim=1, keepdim=True) + x = x * x_mask + m_x = torch.mean(x, dim=1, keepdim=True) / den if self.norm_mean: + x = x - m_x + if self.norm_var: + s2_x = torch.mean(x**2, dim=1, keepdim=True) / den + s_x = torch.sqrt(s2_x.clamp(min=SQRT_EPS)) + x = x / s_x + elif self.norm_var: + s2_x = torch.mean((x - m_x) ** 2, dim=1, keepdim=True) / den + s_x = torch.sqrt(s2_x.clamp(min=SQRT_EPS)) + x = x / s_x + + return x + + def normalize_global(self, x, x_mask=None): + """Applies global mean-var normalization.""" + # Global mean/var norm. + if x_mask is None: + return self._normalize_global_nomask(x) + else: + return self._normalize_global_mask(x, x_mask) + + def _prenormalize_cumsum(self, x, x_mask): + """substract first global mean + it will help cumsum numerical stability + and set masked values to the global mean""" + if self.norm_mean or x_mask is not None: # substract first global mean # it will help cumsum numerical stability - m_x = torch.mean(x, dim=self.dim, keepdim=True) + if x_mask is not None: + x = x * x_mask + den = torch.mean(x_mask, dim=1, keepdim=True) + else: + den = 1 + m_x = torch.mean(x, dim=1, keepdim=True) / den + + if self.norm_mean: x = x - m_x + if x_mask is not None: + x = x * x_mask + elif x_mask is not None: + x = x * x_mask + m_x * (1 - x_mask) - if self.dim != 1: - x = x.transpose(self.dim, 1) + return x + + def normalize_cumsum(self, x, x_mask=None): + """Applies short-time mean-var normalization using cumulative sums.""" + x = self._prenormalize_cumsum(x, x_mask) total_context = self.left_context + self.right_context + 1 xx = nn.functional.pad( x.transpose(1, -1), (self.left_context, self.right_context), mode="reflect" ).transpose(1, -1) - if self.norm_mean: + if self.norm_mean or self.norm_var: c_x = torch.cumsum(xx, dim=1) m_x = ( c_x[:, total_context - 1 :] - c_x[:, : -total_context + 1] ) / total_context if self.norm_var: - c_x = torch.cumsum(xx ** 2, dim=1) + c_x = torch.cumsum(xx**2, dim=1) m_x2 = ( c_x[:, total_context - 1 :] - c_x[:, : -total_context + 1] ) / total_context @@ -89,23 +178,20 @@ def normalize_cumsum(self, x): x = x - m_x if self.norm_var: - s_x = torch.sqrt((m_x2 - m_x ** 2).clamp(min=1e-5)) + s_x = torch.sqrt((m_x2 - m_x**2).clamp(min=SQRT_EPS)) x = x / s_x - if self.dim != 1: - x = x.transpose(self.dim, 1) - return x.contiguous() @staticmethod def filter_args(**kwargs): - """Filters ST-CMVN args from arguments dictionary. + """Filters ST-MVN args from arguments dictionary. Args: kwargs: Arguments dictionary. Returns: - Dictionary with ST-CMVN options. + Dictionary with ST-MVN options. 
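One caveat in the rewritten `forward` above: the transposes are written as `x = x.transpose(x, 1, self.dim)`, which passes `x` twice and will raise a `TypeError` whenever `dim != 1`; they presumably should read `x.transpose(1, self.dim)`. The sketch below sticks to the default `dim=1`, where the transpose branch is skipped (a usage illustration, not part of the patch):

```python
import torch
from hyperion.torch.layers.mvn import MeanVarianceNorm

# short-time CMN over a +/-150 frame window; time is dim 1 (the default)
mvn = MeanVarianceNorm(norm_mean=True, norm_var=False,
                       left_context=150, right_context=150, dim=1)

x = torch.randn(4, 400, 80)                  # (batch, time, feat_dim)
x_lengths = torch.tensor([400, 350, 300, 250])
y = mvn(x, x_lengths=x_lengths)              # padded frames are masked out
print(y.shape)                               # torch.Size([4, 400, 80])
```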
""" valid_args = ( @@ -143,16 +229,16 @@ def add_class_args(parser, prefix=None): parser = ArgumentParser(prog="") parser.add_argument( - "--no-norm-mean", - default=False, - action="store_true", - help="don't center the features", + "--norm-mean", + default=True, + action=ActionYesNo, + help="center the features", ) parser.add_argument( "--norm-var", default=False, - action="store_true", + action=ActionYesNo, help="normalize the variance of the features", ) diff --git a/hyperion/torch/layers/norm_layer_factory.py b/hyperion/torch/layers/norm_layer_factory.py index cd7e542f..8c0ebdeb 100644 --- a/hyperion/torch/layers/norm_layer_factory.py +++ b/hyperion/torch/layers/norm_layer_factory.py @@ -7,6 +7,10 @@ class NormLayer2dFactory(object): + """Factory class to create normalization layers for + tensors with 2D spatial dimension. + """ + @staticmethod def create(norm_name, num_groups=None, momentum=0.1, eps=1e-5): """Creates a layer-norm callabe constructor @@ -54,6 +58,10 @@ def create(norm_name, num_groups=None, momentum=0.1, eps=1e-5): class NormLayer1dFactory(object): + """Factory class to create normalization layers for + tensors with 1D spatial (time) dimension. + """ + @staticmethod def create(norm_name, num_groups=None, momentum=0.1, eps=1e-5): """Creates a layer-norm callabe constructor @@ -97,4 +105,4 @@ def create(norm_name, num_groups=None, momentum=0.1, eps=1e-5): if norm_name == "layer-norm": # it is equivalent to groupnorm with 1 group - return lambda x, momentum=momentum, eps=eps: nn.GroupNorm(1, x, eps=eps) + return lambda x, momentum=momentum, eps=eps: nn.LayerNorm(x, eps=eps) diff --git a/hyperion/torch/layers/pdf_storage.py b/hyperion/torch/layers/pdf_storage.py index bac48d27..6a87cd0d 100644 --- a/hyperion/torch/layers/pdf_storage.py +++ b/hyperion/torch/layers/pdf_storage.py @@ -5,23 +5,27 @@ # import torch -import torch.nn as nn import torch.distributions as pdf +import torch.nn as nn class StdNormal(nn.Module): - """Storage for Standard Normal distribution""" + """Storage for Standard Normal distribution parameters + + Attributes: + shape: shape of the location/scale tensors. + """ def __init__(self, shape): super().__init__() self.register_buffer("loc", torch.zeros(shape)) self.register_buffer("scale", torch.ones(shape)) - # self.loc = nn.Parameter(torch.zeros(shape), requires_grad=False) - # self.scale = nn.Parameter(torch.ones(shape), requires_grad=False) @property def pdf(self): + """Probability density function for N(0,I).""" return pdf.normal.Normal(self.loc, self.scale) def forward(self): + """Probability density function for N(0,I).""" return self.pdf diff --git a/hyperion/torch/layers/pool_factory.py b/hyperion/torch/layers/pool_factory.py index 41cf2ac2..84d0cbf1 100644 --- a/hyperion/torch/layers/pool_factory.py +++ b/hyperion/torch/layers/pool_factory.py @@ -2,13 +2,16 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionParser, ArgumentParser + import torch.nn as nn from .global_pool import * class GlobalPool1dFactory(object): + """Factory class to create global pooling layers 1d.""" + @staticmethod def create( pool_type, @@ -27,6 +30,28 @@ def create( keepdim=False, **kwargs ): + """Creates a global pooling layer from arguments. 
+ + Args: + pool_type: pooling type in ["avg", "mean+stddev", "mean+logvar", "lde", + "scaled-dot-prod-att-v1", "ch-wise-att-mean+stddev"] + in_feats: input feature dimension. + inner_feats: feature dimension in the hidden layer of the content based attention, + in channel-wise attention. + num_comp: number of LDE components. + dist_power: distance type in LDE in L1 or L2. + use_bias: use bias in LDE. + num_heads: number of attention heads. + d_k: dimension of the keys in scaled dot product attn. + d_v: dimension of the values in scaled dot product attn. + bin_attn: it True, use binary attention. Attention values are obtained by applying sigmoid to + the dot products instead of softmax. + use_global_context: if True, concat global stats pooling to the input features to + compute the attention in channel-wise attention. + norm_layer: normalization layer object, if None, it used BatchNorm1d. + dim: pooling dimension. + keepdim: it True keeps the same number of dimensions after pooling. + """ if pool_type == "avg": return GlobalAvgPool1d(dim=dim, keepdim=keepdim) @@ -69,8 +94,18 @@ def create( keepdim=keepdim, ) + raise ValueError(f"Invalid pooling type {pool_type}") + @staticmethod def filter_args(**kwargs): + """Filters the arguments corresponding to the creation of a pooling layer. + + Args: + kwargs: Arguments dictionary. + + Returns: + Dictionary with the pooling layer options. + """ if "wo_bias" in kwargs: kwargs["use_bias"] = not kwargs["wo_bias"] diff --git a/hyperion/torch/layers/pos_encoder.py b/hyperion/torch/layers/pos_encoder.py index f3aa17e9..f18eb51f 100644 --- a/hyperion/torch/layers/pos_encoder.py +++ b/hyperion/torch/layers/pos_encoder.py @@ -3,12 +3,19 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import math +from typing import Union import torch from torch import nn +from .activation_factory import ActivationFactory as AF -class PosEncoder(nn.Module): + +class PosEncoderBase(nn.Module): + pass + + +class PosEncoder(PosEncoderBase): """Positional encoding. Attributes: @@ -16,7 +23,7 @@ class PosEncoder(nn.Module): dropout_rate: dropout rate """ - def __init__(self, num_feats, dropout_rate=0): + def __init__(self, num_feats: int, dropout_rate: float = 0): super().__init__() self.num_feats = num_feats self.dropout_rate = dropout_rate @@ -60,7 +67,7 @@ def _pe(self, x, relative=False): self.pe = pe.to(device=x.device, dtype=x.dtype) return self.pe - def forward(self, x): + def forward(self, x: torch.Tensor): """Add positional encoding. Args: @@ -88,10 +95,10 @@ class RelPosEncoder(PosEncoder): dropout_rate: dropout rate """ - def __init__(self, num_feats, dropout_rate=0): + def __init__(self, num_feats: int, dropout_rate: float = 0): super().__init__(num_feats, dropout_rate) - def forward(self, x): + def forward(self, x: torch.Tensor): """Add positional encoding. 
Args: @@ -117,7 +124,7 @@ def forward(self, x): return x, pos_emb -class NoPosEncoder(nn.Module): +class NoPosEncoder(PosEncoderBase): """This is a dummy class for the case where we deactivate the positional encoder @@ -126,7 +133,7 @@ class NoPosEncoder(nn.Module): def __init__(self): super().__init__() - def forward(self, x): + def forward(self, x: torch.Tensor): """Identity map Args: @@ -136,3 +143,42 @@ def forward(self, x): x """ return x + + +class ConvPosEncoder(PosEncoderBase): + """Convolutional positional encoder like the one used in wav2vec2 + + Attributes: + num_feats: number of input/output features + kernel_size: kernel size of convolution + num_groups: number of groups of the convolution + activation: hidden activation + """ + + def __init__( + self, + num_feats: int, + kernel_size: int, + num_groups: int, + activation: Union[str, nn.Module], + ): + super().__init__() + self.conv = nn.Conv1d( + num_feats, + num_feats, + kernel_size=kernel_size, + padding=kernel_size // 2, + groups=num_groups, + ) + self.activation = AF.create(activation) + self.num_pad_remove = 1 if kernel_size % 2 == 0 else 0 + + def forward(self, x: torch.Tensor): + x = x.transpose(1, 2) + x = self.conv(x) + if self.num_pad_remove > 0: + x = x[:, :, : -self.num_pad_remove] + + x = self.activation(x).transpose(1, 2) + + return x diff --git a/hyperion/torch/layers/spec_augment.py b/hyperion/torch/layers/spec_augment.py index ecb3609f..9ef71f5f 100644 --- a/hyperion/torch/layers/spec_augment.py +++ b/hyperion/torch/layers/spec_augment.py @@ -2,12 +2,15 @@ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba, Nanxin Chen) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging -from jsonargparse import ArgumentParser, ActionParser import torch import torch.nn as nn import torch.nn.functional as nnf +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...utils.misc import filter_func_args count = 0 @@ -17,10 +20,13 @@ class AxisMasker(nn.Module): Implementation based on espnet. Attributes: - mask_width_range: range for the width of the masks - mask_num_range: range for the number of masks - dim: axis where we apply the mask - fill_value: masking value + min_width: minimum width of the mask. + max_width: maximum width of the mask. + min_num_mask: minimum number of masks. + max_num_mask: maximum number of masks. + dim: axis where we apply the mask. + mask_value: masking value. + use_num_masks_percentage: if True, num_masks are per 100 frames, if False they are absolute. 
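A shape sketch for the new `ConvPosEncoder` (illustration only; it assumes `ActivationFactory.create` accepts the activation name used here). With an even kernel, the grouped convolution produces one extra frame, which `num_pad_remove` trims so the output length matches the input:

```python
import torch
from hyperion.torch.layers.pos_encoder import ConvPosEncoder

enc = ConvPosEncoder(num_feats=512, kernel_size=128,
                     num_groups=16, activation="relu")

x = torch.randn(2, 200, 512)     # (batch, time, num_feats)
pos = enc(x)                     # conv over time, channels-first internally
print(pos.shape)                 # torch.Size([2, 200, 512]); add to x upstream
```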
""" def __init__( @@ -30,7 +36,9 @@ def __init__( min_num_masks=1, max_num_masks=2, dim=-1, - fill_value=0, + mask_method="constant", + mask_value=0, + use_num_masks_percentage=False, ): super().__init__() assert min_width >= 0 @@ -40,16 +48,22 @@ def __init__( self.min_width = min_width self.max_width = max_width + if not use_num_masks_percentage: + min_num_masks = int(min_num_masks) + max_num_masks = int(max_num_masks) + self.min_num_masks = min_num_masks self.max_num_masks = max_num_masks self.dim = dim - self.fill_value = fill_value + self.mask_method = mask_method + self.mask_value = mask_value + self.use_num_masks_percentage = use_num_masks_percentage def __repr__(self): s = ( "{}(min_width={}, max_width={}, " "min_num_masks={}, max_num_masks={}, " - "dim={}, fill_value={})" + "dim={}, mask_method={}, mask_value={} use_num_masks_percentage={})" ).format( self.__class__.__name__, self.min_width, @@ -57,7 +71,9 @@ def __repr__(self): self.min_num_masks, self.max_num_masks, self.dim, - self.fill_value, + self.mask_method, + self.mask_value, + self.use_num_masks_percentage, ) return s @@ -80,9 +96,16 @@ def forward(self, x): batch_size = x.shape[0] masked_dim_length = x.shape[self.dim] + if self.use_num_masks_percentage: + min_num_masks = int(round(self.min_num_masks * masked_dim_length / 100)) + max_num_masks = int(round(self.max_num_masks * masked_dim_length / 100)) + else: + min_num_masks = self.min_num_masks + max_num_masks = self.max_num_masks + # select how many masks num_masks = torch.randint( - self.min_num_masks, self.max_num_masks + 1, size=(1,), device=x.device + min_num_masks, max_num_masks + 1, size=(1,), device=x.device )[0] # (batch, num_mask, 1) widths = torch.randint( @@ -109,7 +132,14 @@ def forward(self, x): else: mask = mask.unsqueeze(-1) - x = x.masked_fill(mask, self.fill_value) + if self.mask_method == "mean": + mask_value = x.mean().item() + elif self.mask_method == "min": + mask_value = x.min().item() + else: + mask_value = self.mask_value + + x = x.masked_fill(mask, mask_value) if ndim > 3: x = x.view(in_shape) @@ -121,7 +151,9 @@ class SpecWarper(nn.Module): Implementation based on espnet. Attributes: - window: time warp parameter + window: time warp parameter. + mode: interpolation mode in ["nearest", "linear", "bilinear"] + dim: warping dimension. """ def __init__(self, window=80, mode="bicubic", dim=-2): @@ -136,14 +168,14 @@ def __repr__(self): ) return s - def forward(self, x, lengths=None): + def forward(self, x, x_lengths=None): """warps x along time or freq dimension Args: - x: spectrogram (batch, *, time, freq) - lengths: length ratios + x: spectrogram shape= (batch, *, time, freq) + x_lengths: time lengths of the sequences. 
Returns: - warped spectrogram (batch, *, time, freq) + warped spectrogram shape = (batch, *, time, freq) """ if not self.training: return x @@ -166,14 +198,13 @@ def forward(self, x, lengths=None): # the first n frames where n is the length of the # shortest utterance # the end of the utterance will not be warped - if dim == -1 or lengths is None: + if dim == -1 or x_lengths is None: warp_length = x.shape[-2] else: - warp_length = int(x.shape[-2] * torch.min(lengths)) + warp_length = int(torch.min(x_lengths)) center = torch.randint(self.window, warp_length - self.window, (1,))[0] warped = torch.randint(center - self.window, center + self.window, (1,))[0] + 1 - # (batch, C, warped, freq) left = nnf.interpolate( x[:, :, :center], (warped, x.shape[3]), mode=self.mode, align_corners=False @@ -195,6 +226,9 @@ def forward(self, x, lengths=None): if dim == -1: x = x.transpose(-1, -2) + if ndim == 3: + x = x.squeeze(1) + x = x.view(in_shape) return x @@ -208,6 +242,21 @@ class SpecAugment(nn.Module): Augmentation Method for Automatic Speech Recognition" Attributes: + time_warp_prob: probability of applying time warping. + time_warp_window: time warp parameter. + time_warp_mode: interpolation mode in ["nearest", "linear", "bilinear"] + time_mask_prob: probability of applying masking in time. + time_min_width: minimum width of the time mask. + time_max_width: maximum width of the time mask. + time_min_num_mask: minimum number of time masks. + time_max_num_mask: maximum number of time masks. + time_use_num_masks_percentage: if True, num_masks are per 100 frames, if False they are absolute. + freq_mask_prob: probability of applying frequency masking. + freq_min_width: minimum width of the frequency mask. + freq_max_width: maximum width of the frequency mask. + freq_min_num_mask: minimum number of frequency masks. + freq_max_num_mask: maximum number of frequency masks. + mask_value: masking value. """ def __init__( @@ -220,12 +269,14 @@ def __init__( time_mask_max_width=100, time_mask_min_num_masks=1, time_mask_max_num_masks=2, + time_use_num_masks_percentage=False, freq_mask_prob=0, freq_mask_min_width=0, freq_mask_max_width=20, freq_mask_min_num_masks=1, freq_mask_max_num_masks=2, - fill_value=0, + mask_method="constant", + mask_value=0, ): super().__init__() @@ -242,7 +293,7 @@ def __init__( self.freq_mask_max_width = freq_mask_max_width self.freq_mask_min_num_masks = freq_mask_min_num_masks self.freq_mask_max_num_masks = freq_mask_max_num_masks - self.fill_value = fill_value + self.mask_value = mask_value self.time_masker = None self.freq_masker = None @@ -255,7 +306,9 @@ def __init__( min_num_masks=time_mask_min_num_masks, max_num_masks=time_mask_max_num_masks, dim=-2, - fill_value=fill_value, + mask_method=mask_method, + mask_value=mask_value, + use_num_masks_percentage=time_use_num_masks_percentage, ) if self.freq_mask_prob > 0: @@ -265,7 +318,8 @@ def __init__( min_num_masks=freq_mask_min_num_masks, max_num_masks=freq_mask_max_num_masks, dim=-1, - fill_value=fill_value, + mask_method=mask_method, + mask_value=mask_value, ) if self.time_warp_prob > 0: @@ -287,7 +341,14 @@ def __repr__(self): ) return s - def forward(self, x, lengths=None): + def forward(self, x, x_lengths=None): + """Applies spec augment to input + Args: + x: spectrogram with shape = (batch, time, freq) + lengths: time lengths of the sequences. 
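A usage sketch of the updated `SpecAugment` layer (parameter names as in `__init__` above; masking only fires in training mode). `mask_method="mean"` fills the masked bins with the mean of the batch instead of a constant:

```python
import torch
from hyperion.torch.layers.spec_augment import SpecAugment

augment = SpecAugment(
    time_mask_prob=1.0, time_mask_min_width=0, time_mask_max_width=30,
    freq_mask_prob=1.0, freq_mask_min_width=0, freq_mask_max_width=8,
    mask_method="mean",               # fill with the mean, not a constant
)
augment.train()                       # forward is a no-op in eval mode

x = torch.randn(4, 300, 80)           # (batch, time, freq)
x_lengths = torch.tensor([300, 280, 250, 200])
y = augment(x, x_lengths=x_lengths)   # time warping stays off by default
```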
+ Returns: + Augmented spectrogram with shape = (batch, time, freq) + """ if not self.training: return x # global count @@ -300,7 +361,7 @@ def forward(self, x, lengths=None): # ax.imshow(x.cpu().numpy()[0].T) r = torch.rand((3,), device=x.device) if self.time_warp_prob > r[0]: - x = self.time_warper(x, lengths) + x = self.time_warper(x, x_lengths) # ax = plt.subplot(222) # ax.imshow(x.cpu().numpy()[0].T) @@ -319,6 +380,7 @@ def forward(self, x, lengths=None): # count += 1 return x + @staticmethod def filter_args(**kwargs): """Filters SpecAugment args from arguments dictionary. @@ -328,25 +390,7 @@ def filter_args(**kwargs): Returns: Dictionary with SpecAugment options. """ - valid_args = ( - "time_warp_prob", - "time_warp_window", - "time_warp_mode", - "time_mask_prob", - "time_mask_max_width", - "time_mask_min_width", - "time_mask_max_num_masks", - "time_mask_min_num_masks", - "freq_mask_prob", - "freq_mask_max_width", - "freq_mask_min_width", - "freq_mask_max_num_masks", - "freq_mask_min_num_masks", - "fill_value", - ) - - d = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - return d + return filter_func_args(SpecAugment.__init__, kwargs) @staticmethod def add_class_args(parser, prefix=None): @@ -396,16 +440,22 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( "--time-mask-min-num-masks", - type=int, + type=float, default=1, help="min. number of time mask", ) parser.add_argument( "--time-mask-max-num-masks", - type=int, + type=float, default=2, help="max. number of time mask", ) + parser.add_argument( + "--time-use-num-masks-percentage", + default=False, + action=ActionYesNo, + help="if True, num_masks are per 100 frames, if False they are absolute.", + ) parser.add_argument( "--freq-mask-prob", @@ -437,9 +487,15 @@ def add_class_args(parser, prefix=None): default=2, help="max. number of freq mask", ) + parser.add_argument( + "--mask-method", + default="constant", + choices=["constant", "min", "mean"], + help="mothod to get the masked value", + ) parser.add_argument( - "--fill-value", + "---mask-value", type=float, default=0.0, help="filling value for the masked spec. bins", diff --git a/hyperion/torch/layers/subpixel_convs.py b/hyperion/torch/layers/subpixel_convs.py index 6b529aff..19c0283f 100644 --- a/hyperion/torch/layers/subpixel_convs.py +++ b/hyperion/torch/layers/subpixel_convs.py @@ -9,6 +9,22 @@ class SubPixelConv1d(nn.Module): + """Implements a SubPixel Convolution in 1d proposed in: + https://arxiv.org/abs/1609.05158 + + Attributes: + in_channels: Number of input channels. + out_channels: Number of output channels. + kernel_size: Kernel size. + stride: Downsampling stride. + padding: Int or Int Tuple with the number of left/right padding samples + dilation: Kernel dilation. + groups: Number of groups in the convolution. + bias: If true, the convolution has bias. + padding_mode: Padding mode in ['zeros', 'reflect', 'replicate' or 'circular']. + + """ + def __init__( self, in_channels, @@ -38,6 +54,14 @@ def __init__( self.stride = stride def forward(self, x): + """Applies subpixel convolution 1d. + + Args: + x: Input tensor with shape = (batch, in_channels, in_time) + + Returns: + Output tensor with shape = (batch, out_channels, out_time) + """ x = self.conv(x) if self.stride == 1: return x @@ -51,6 +75,22 @@ def forward(self, x): class SubPixelConv2d(nn.Module): + """Implements a SubPixel Convolution in 2d proposed in: + https://arxiv.org/abs/1609.05158 + + Attributes: + in_channels: Number of input channels. 
+ out_channels: Number of output channels. + kernel_size: Kernel size. + stride: Upsampling stride. + padding: Int or Int Tuple with the number of left/right padding samples + dilation: Kernel dilation. + groups: Number of groups in the convolution. + bias: If true, the convolution has bias. + padding_mode: Padding mode in ['zeros', 'reflect', 'replicate' or 'circular']. + + """ + def __init__( self, in_channels, @@ -81,6 +121,14 @@ def __init__( self.pixel_shuffle = nn.PixelShuffle(self.stride) def forward(self, x): + """Applies subpixel convolution 2d. + + Args: + x: Input tensor with shape = (batch, in_channels, in_W, in_H) + + Returns: + Output tensor with shape = (batch, out_channels, out_W, out_H) + """ x = self.conv(x) if self.stride == 1: return x diff --git a/hyperion/torch/layers/swish.py b/hyperion/torch/layers/swish.py index 520a71fb..9ba0a896 100644 --- a/hyperion/torch/layers/swish.py +++ b/hyperion/torch/layers/swish.py @@ -7,6 +7,8 @@ class SwishImplementation(torch.autograd.Function): + """Implementation for Swish activation function.""" + @staticmethod def forward(ctx, i): result = i * torch.sigmoid(i) @@ -21,6 +23,10 @@ def backward(ctx, grad_output): class Swish(nn.Module): + """Swish activation class: + y = x * sigmoid(x) + """ + def forward(self, x): return SwishImplementation.apply(x) @@ -30,3 +36,114 @@ def __repr__(self): def __str__(self): s = "{}()".format(self.__class__.__name__) return s + + +class Swish6(nn.Module): + """Swish activation class, clamped to 6 + y = min(x, 6) * sigmoid(min(x,6)) + """ + + def forward(self, x): + return SwishImplementation.apply(x.clamp(max=6)) + + def __repr__(self): + return self.__str__() + + def __str__(self): + s = "{}()".format(self.__class__.__name__) + return s + + +class DoubleSwishImplementation(torch.autograd.Function): + """Implementation for DoubleSwish Activation from + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py + + f(x) = x * torch.sigmoid(x-1), which closely approximates swish(swish(x)), + where swish(x) = x * sigmoid(x). + + Memory-efficient derivative computation: + let s(x) = sigmoid(x-1), so that f(x) = x * s(x) and + f'(x) = x * s'(x) + s(x), where s'(x) = s(x) * (1-s(x)). + + Then f'(x) = x * s(x) * (1-s(x)) + s(x) = f(x) * (1-s(x)) + s(x) + """ + + @staticmethod + def forward(ctx, x: torch.Tensor) -> torch.Tensor: + requires_grad = x.requires_grad + x_dtype = x.dtype + if x.dtype == torch.float16: + x = x.to(torch.float32) + + s = torch.sigmoid(x - 1.0) + y = x * s + + if requires_grad: + deriv = y * (1 - s) + s + # notes on derivative of x * sigmoid(x - 1): + # https://www.wolframalpha.com/input?i=d%2Fdx+%28x+*+sigmoid%28x-1%29%29 + # min \simeq -0.043638. Take floor as -0.043637 so it's a lower bound + # max \simeq 1.1990. Take ceil to be 1.2 so it's an upper bound. + # the combination of "+ torch.rand_like(deriv)" and casting to torch.uint8 (which + # floors), should be expectation-preserving. + floor = -0.043637 + ceil = 1.2 + d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like( + deriv + ) + d_int = d_scaled.to(torch.uint8) + ctx.save_for_backward(d_int) + if x_dtype == torch.float16 or torch.is_autocast_enabled(): + y = y.to(torch.float16) + return y + + @staticmethod + def backward(ctx, y_grad: torch.Tensor) -> torch.Tensor: + (d,) = ctx.saved_tensors + # the same constants as used in forward pass.
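The comments above quote numerical bounds on the derivative; they can be sanity-checked standalone, outside the toolkit:

```python
import torch

# f(x) = x * sigmoid(x - 1); with s = sigmoid(x - 1), f'(x) = f(x) * (1 - s) + s,
# which is the `deriv` tensor computed in the forward pass above.
x = torch.linspace(-20.0, 20.0, steps=200001, dtype=torch.float64)
s = torch.sigmoid(x - 1.0)
y = x * s
deriv = y * (1 - s) + s
print(deriv.min().item(), deriv.max().item())
# ~ -0.043638 and ~ 1.1990, i.e. the [floor, ceil] = [-0.043637, 1.2] range
# that gets rescaled to [0, 255] and stored as uint8
```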
+ floor = -0.043637 + ceil = 1.2 + d = d * ((ceil - floor) / 255.0) + floor + return y_grad * d + + +class DoubleSwish(torch.nn.Module): + """DoubleSwish activation + f(x) = x * torch.sigmoid(x-1) = swish(swish(x)), + where swish(x) = x * sigmoid(x). + """ + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if torch.jit.is_scripting() or torch.jit.is_tracing(): + return x * torch.sigmoid(x - 1.0) + + return DoubleSwishImplementation.apply(x) + + def __repr__(self): + return self.__str__() + + def __str__(self): + s = "{}()".format(self.__class__.__name__) + return s + + +class DoubleSwish6(torch.nn.Module): + """DoubleSwish activation clamped to 6 + x = min(x, 6) + f(x) = x * torch.sigmoid(x-1) = swish(swish(x)), + where swish(x) = x * sigmoid(x). + """ + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.clamp(max=6) + if torch.jit.is_scripting() or torch.jit.is_tracing(): + return (x * torch.sigmoid(x - 1.0)).clamp(max=6) + + return DoubleSwishImplementation.apply(x) + + def __repr__(self): + return self.__str__() + + def __str__(self): + s = "{}()".format(self.__class__.__name__) + return s diff --git a/hyperion/torch/layers/tensor2pdf.py b/hyperion/torch/layers/tensor2pdf.py index e38b1bc7..41d1bc37 100644 --- a/hyperion/torch/layers/tensor2pdf.py +++ b/hyperion/torch/layers/tensor2pdf.py @@ -5,14 +5,20 @@ # import torch +import torch.distributions as pdf import torch.nn as nn import torch.nn.functional as nnf -import torch.distributions as pdf class Tensor2PDF(nn.Module): """Base class for layers that create a prob distribution from an input tensor + + Attributes: + pdf_feats: Feature dimension of the probability distribution. + project: If True, it applies a projection to the input tensor. + in_feats: Feature dimension of the input tensor. + in_dim: Number of dimensions of the input tensor. """ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): @@ -44,7 +50,14 @@ def _make_proj(self, in_feats, out_feats, ndims): class Tensor2NormalICov(Tensor2PDF): - """Transforms a Tensor into Normal distribution with identitiy variance""" + """Transforms a Tensor into Normal distribution with identitiy variance + + Attributes: + pdf_feats: Feature dimension of the probability distribution. + project: If True, it applies a projection to the input tensor. + in_feats: Feature dimension of the input tensor. + in_dim: Number of dimensions of the input tensor. + """ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): super().__init__(pdf_feats, project=project, in_feats=in_feats, in_dim=in_dim) @@ -53,6 +66,16 @@ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): self._proj = self._make_proj(self.in_feats, self.pdf_feats, self.in_dim) def forward(self, inputs, prior=None, squeeze_dim=None): + """Creates a Normal distribution from input tensor. + + Args: + inputs: Input tensor. + prior: Not used. + squeeze_dim: Squeezes pdf parameters dimensions. + + Returns: + torch.distributions.normal.Normal object. + """ if self.project: inputs = self._proj(inputs) @@ -70,6 +93,12 @@ class Tensor2NormalGlobDiagCov(Tensor2PDF): Input tensor will be the mean of the distribution and the standard deviation is a global trainable parameter. + + Attributes: + pdf_feats: Feature dimension of the probability distribution. + project: If True, it applies a projection to the input tensor. + in_feats: Feature dimension of the input tensor. + in_dim: Number of dimensions of the input tensor. 
""" def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): @@ -85,6 +114,18 @@ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): self.logvar = nn.Parameter(torch.zeros(pdf_shape)) def forward(self, inputs, prior=None, squeeze_dim=None): + """Creates a Normal distribution from input tensor. + + Args: + inputs: Input tensor. + Args: + inputs: Input tensor. + prior: prior pdf object. + squeeze_dim: Squeezes pdf parameters dimensions. + + Returns: + torch.distributions.normal.Normal object. + """ if self.project: inputs = self._proj(inputs) @@ -108,6 +149,12 @@ class Tensor2NormalDiagCov(Tensor2PDF): Applies two linear transformation to the tensors to obtain the mean and the log-variance. + + Attributes: + pdf_feats: Feature dimension of the probability distribution. + project: If True, it applies a projection to the input tensor. + in_feats: Feature dimension of the input tensor. + in_dim: Number of dimensions of the input tensor. """ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): @@ -117,6 +164,18 @@ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): self._proj = self._make_proj(self.in_feats, self.pdf_feats * 2, self.in_dim) def forward(self, inputs, prior=None, squeeze_dim=None): + """Creates a Normal distribution from input tensor. + + Args: + inputs: Input tensor. + Args: + inputs: Input tensor. + prior: prior pdf object. + squeeze_dim: Squeezes pdf parameters dimensions. + + Returns: + torch.distributions.normal.Normal object. + """ if self.project: inputs = self._proj(inputs) @@ -138,7 +197,13 @@ def forward(self, inputs, prior=None, squeeze_dim=None): class Tensor2BayNormalICovGivenNormalPrior(Tensor2PDF): """Transforms a Tensor into Normal distribution with identitiy variance - Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation + Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation. + + Attributes: + pdf_feats: Feature dimension of the probability distribution. + project: If True, it applies a projection to the input tensor. + in_feats: Feature dimension of the input tensor. + in_dim: Number of dimensions of the input tensor. """ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): @@ -151,6 +216,18 @@ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): self._alpha = nn.Parameter(torch.zeros(1)) def forward(self, inputs, prior=None, squeeze_dim=None): + """Creates a Normal distribution from input tensor. + + Args: + inputs: Input tensor. + Args: + inputs: Input tensor. + prior: prior pdf object. + squeeze_dim: Squeezes pdf parameters dimensions. + + Returns: + torch.distributions.normal.Normal object. + """ if self.project: inputs = self._proj(inputs) @@ -173,7 +250,13 @@ class Tensor2BayNormalGlobDiagCovGivenNormalPrior(Tensor2PDF): Input tensor will be the ML mean of the distribution and the ML standard deviation is a global trainable parameter. - Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation + Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation. + + Attributes: + pdf_feats: Feature dimension of the probability distribution. + project: If True, it applies a projection to the input tensor. + in_feats: Feature dimension of the input tensor. + in_dim: Number of dimensions of the input tensor. 
""" def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): @@ -193,6 +276,18 @@ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): self._beta = nn.Parameter(torch.zeros(1)) def forward(self, inputs, prior=None, squeeze_dim=None): + """Creates a Normal distribution from input tensor. + + Args: + inputs: Input tensor. + Args: + inputs: Input tensor. + prior: prior pdf object. + squeeze_dim: Squeezes pdf parameters dimensions. + + Returns: + torch.distributions.normal.Normal object. + """ if self.project: inputs = self._proj(inputs) @@ -231,7 +326,13 @@ class Tensor2BayNormalDiagCovGivenNormalPrior(Tensor2PDF): Applies two linear transformation to the tensors to obtain the maximum likelihood mean and the log-variance. - Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation + Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation. + + Attributes: + pdf_feats: Feature dimension of the probability distribution. + project: If True, it applies a projection to the input tensor. + in_feats: Feature dimension of the input tensor. + in_dim: Number of dimensions of the input tensor. """ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): @@ -245,6 +346,18 @@ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): self._beta = nn.Parameter(torch.zeros(1)) def forward(self, inputs, prior=None, squeeze_dim=None): + """Creates a Normal distribution from input tensor. + + Args: + inputs: Input tensor. + Args: + inputs: Input tensor. + prior: prior pdf object. + squeeze_dim: Squeezes pdf parameters dimensions. + + Returns: + torch.distributions.normal.Normal object. + """ if self.project: inputs = self._proj(inputs) diff --git a/hyperion/torch/layers/tensor2pdf1.py b/hyperion/torch/layers/tensor2pdf1.py index 87ba3475..45c51f17 100644 --- a/hyperion/torch/layers/tensor2pdf1.py +++ b/hyperion/torch/layers/tensor2pdf1.py @@ -4,8 +4,8 @@ """ import torch -import torch.nn as nn import torch.distributions as pdf +import torch.nn as nn class Tensor2PDF(nn.Module): diff --git a/hyperion/torch/layers/vq.py b/hyperion/torch/layers/vq.py index 98307438..4a59b305 100644 --- a/hyperion/torch/layers/vq.py +++ b/hyperion/torch/layers/vq.py @@ -5,12 +5,24 @@ import math import torch +import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F -import torch.distributed as dist + +from ..utils import seq_lengths_to_mask class VectorQuantizer(nn.Module): + """Abstract base class for vector quantization layers. + + Attributes: + num_embed: codebook size. + embed_feats: feature dimension of the codebook vectors. + project: if True, it projects the input features to the embed_feats dim. + in_feats: input feature dimension, needed when project=True. + in_dim: number of dimensions of the input tensor in [2,5], needed when project=True + """ + def __init__( self, num_embed, embed_feats, project=True, in_feats=None, in_dim=None ): @@ -43,6 +55,7 @@ def __repr__(self): return self.__str__() def _make_proj(self, in_feats, out_feats, ndims): + """Creates the feature projection layer.""" if ndims == 2: return nn.Linear(in_feats, out_feats) elif ndims == 3: @@ -56,6 +69,18 @@ def _make_proj(self, in_feats, out_feats, ndims): class KMeansVectorQuantizer(VectorQuantizer): + """Class for K-Means vector quantization layers, + where codebook vectors are trained by gradient descend losses. + + Attributes: + num_embed: codebook size. 
+ embed_feats: feature dimension of the codebook vectors. + commitment_cost: weight for loss that makes input features close to the codebook vectors. + project: if True, it projects the input features to the embed_feats dim. + in_feats: input feature dimension, needed when project=True. + in_dim: number of dimensions of the input tensor in [2,5], needed when project=True + """ + def __init__( self, num_embed, @@ -95,11 +120,33 @@ def __str__(self): ) return s - def forward(self, inputs, return_r=False): + def forward(self, inputs, lengths=None, mask=None, return_r=False): + """Quantizes the input tensor. + + Args: + input: input tensor 2d - 5d dimension with shape (batch, channels, ...) + lengths: when inputs is 3d, it the length of each sequence in the batch. + Not used if mask is given. + mask: indicates which elements are valid, to quantize. The elements with zero + mask are set to 0. The mask tensor should have the same shape as the + input tensor with the channel dimension removed, shape=(batch, ...). + return_r: it True, it returns the responsibilities. + + Returns: + Dictionary containing quantized vectors, vq_loss, KL(q(z)||p(z)), where q(z) is + the distribution of posterior responsabilities and p(z) is a uniform categorical + distribution, and the log_perplexity of the responsibilities. If return_r is True, + it also returns the responsibilities. + """ # inputs -> z_e in paper if self.project: inputs = self._proj(inputs) + if mask is None and lengths is not None: + mask = seq_lengths_to_mask( + lengths, inputs.size(-1), time_dim=1, dtype=inputs.dtype + ) + # convert inputs from BCHW -> BHWC inputs = inputs.transpose(1, -1).contiguous() input_shape = inputs.shape @@ -112,26 +159,37 @@ def forward(self, inputs, return_r=False): torch.sum(flat_inputs ** 2, dim=1, keepdim=True) + torch.sum(self.embed ** 2, dim=1) - 2 * torch.matmul(flat_inputs, self.embed.t()) - ) + ) # (batch x time, num_embeds) # Encoding # quantization integer indexes - q_idx = torch.argmin(d2, dim=1).unsqueeze(1) + q_idx = torch.argmin(d2, dim=1).unsqueeze(1) # (batch x time, 1) # 1 hot responsibilities r = torch.zeros(q_idx.shape[0], self.num_embed, device=inputs.device) - r.scatter_(1, q_idx, 1) - z_q = torch.matmul(r, self.embed).view(input_shape) + r.scatter_(1, q_idx, 1) # (batch x time, num_embeds) + z_q = torch.matmul(r, self.embed).view(input_shape) # (batch, time, embed_dim) + + if mask is not None: + z_q = z_q * mask + inputs = inputs * mask # Loss - vq_loss = F.mse_loss(z_q, inputs.detach()) - commitment_loss = F.mse_loss(z_q.detach(), inputs) + vq_loss = F.mse_loss(z_q, inputs.detach()) # || z_q - sg(z) ||_2 + commitment_loss = F.mse_loss(z_q.detach(), inputs) # || z - sg (z_q) ||_2 + loss = vq_loss + self.commitment_cost * commitment_loss + if mask is not None: + loss /= torch.mean(mask) # this allows to backprogate the gradients as if the output were equal to z_e z_q = inputs + (z_q - inputs).detach() # compute the perplexity - probs = torch.mean(r, dim=0) + if mask is None: + probs = torch.mean(r, dim=0) + else: + probs = torch.mean(r[mask.flatten()], dim=0) + log_perplexity = -torch.sum(probs * torch.log(probs + 1e-10)) # compute KL divergence between r and uniform categorical prior @@ -147,7 +205,7 @@ def forward(self, inputs, return_r=False): ) # convert quantized from BHWC -> BCHW - z_q = z_q.transpose(1, -1).contiguous() + z_q = z_q.transpose(1, -1).contiguous() # (batch, embed_dim, time) output = { "z_q": z_q, "loss": loss, @@ -162,6 +220,20 @@ def forward(self, inputs, return_r=False): class 
MultiKMeansVectorQuantizer(VectorQuantizer): + """Class for Mulit-group K-Means vector quantization layers, + where codebook vectors are trained by gradient descend losses. + The input tensors are divided into groups and quantized separately. + + Attributes: + num_groups: number of codebooks. + num_embed: codebook size. + embed_feats: feature dimension of the codebook vectors. + commitment_cost: weight for loss that makes input features close to the codebook vectors. + project: if True, it projects the input features to the embed_feats dim. + in_feats: input feature dimension, needed when project=True. + in_dim: number of dimensions of the input tensor in [2,5], needed when project=True + """ + def __init__( self, num_groups, @@ -212,15 +284,37 @@ def __str__(self): ) return s - def forward(self, inputs, return_r=False): + def forward(self, inputs, lengths=None, mask=None, return_r=False): + """Quantizes the input tensor. + + Args: + input: input tensor 2d - 5d dimension with shape (batch, channels, ...) + lengths: when inputs is 3d, it the length of each sequence in the batch. + Not used if mask is given. + mask: indicates which elements are valid, to quantize. The elements with zero + mask are set to 0. The mask tensor should have the same shape as the + input tensor with the channel dimension removed, shape=(batch, ...). + return_r: it True, it returns the responsibilities. + + Returns: + Dictionary containing quantized vectors, vq_loss, KL(q(z)||p(z)), where q(z) is + the distribution of posterior responsabilities and p(z) is a uniform categorical + distribution, and the log_perplexity of the responsibilities. If return_r is True, + it also returns the responsibilities. + """ if self.project: inputs = self._proj(inputs) + if mask is None and lengths is not None: + mask = seq_lengths_to_mask( + lengths, inputs.size(-1), time_dim=1, dtype=inputs.dtype + ) + inputs = inputs.chunk(self.num_groups, dim=1) z_q = [] r = [] for i in range(self.num_groups): - output_i = self.vq_layers[i](inputs[i], return_r=return_r) + output_i = self.vq_layers[i](inputs[i], mask=mask, return_r=return_r) z_qi = output_i["z_q"] loss_i = output_i["loss"] kldiv_ri = output_i["kldiv_qrpr"] @@ -255,6 +349,19 @@ def forward(self, inputs, return_r=False): class EMAKMeansVectorQuantizer(VectorQuantizer): + """Class exponential moving average vector quantization layers, + + Attributes: + num_embed: codebook size. + embed_feats: feature dimension of the codebook vectors. + commitment_cost: weight for loss that makes input features close to the codebook vectors. + gamma: exponential average coefficient. + eps: epsilon for Laplace smoothing of the counts. + project: if True, it projects the input features to the embed_feats dim. + in_feats: input feature dimension, needed when project=True. + in_dim: number of dimensions of the input tensor in [2,5], needed when project=True + """ + def __init__( self, num_embed, @@ -302,11 +409,34 @@ def __str__(self): ) return s - def forward(self, inputs, return_r=False): + def forward(self, inputs, lengths=None, mask=None, return_r=False): + """Quantizes the input tensor. In training phase, it also + updates the codebooks by EMA. + + Args: + input: input tensor 2d - 5d dimension with shape (batch, channels, ...) + lengths: when inputs is 3d, it the length of each sequence in the batch. + Not used if mask is given. + mask: indicates which elements are valid, to quantize. The elements with zero + mask are set to 0. 
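A sketch of the new masked interface on a padded batch (module path and sizes are assumptions):

```python
import torch
from hyperion.torch.layers.vq import KMeansVectorQuantizer  # path assumed

vq = KMeansVectorQuantizer(
    num_embed=512, embed_feats=64, commitment_cost=0.25,
    project=True, in_feats=256, in_dim=3,
)
z_e = torch.randn(4, 256, 100)             # (batch, channels, time)
lengths = torch.tensor([100, 80, 60, 40])  # valid frames per sequence
output = vq(z_e, lengths=lengths)          # mask is derived internally from lengths
z_q = output["z_q"]                        # quantized tensor, padded frames zeroed
loss = output["loss"]                      # vq loss + commitment_cost * commitment loss
```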
The mask tensor should have the same shape as the + input tensor with the channel dimension removed, shape=(batch, ...). + return_r: it True, it returns the responsibilities. + + Returns: + Dictionary containing quantized vectors, vq_loss, KL(q(z)||p(z)), where q(z) is + the distribution of posterior responsabilities and p(z) is a uniform categorical + distribution, and the log_perplexity of the responsibilities. If return_r is True, + it also returns the responsibilities. + """ # inputs -> z_e in paper if self.project: inputs = self._proj(inputs) + if mask is None and lengths is not None: + mask = seq_lengths_to_mask( + lengths, inputs.size(-1), time_dim=1, dtype=inputs.dtype + ) + # convert inputs from BCHW -> BHWC inputs = inputs.transpose(1, -1).contiguous() input_shape = inputs.shape @@ -331,9 +461,15 @@ def forward(self, inputs, return_r=False): # Use Exponetial Moving Average (EMA) to update the embedding vectors if self.training: + if mask is not None: + flat_mask = mask.flatten() + r = r[flat_mask] + flat_inputs = flat_inputs[flat_mask] + N = torch.sum(r, dim=0) # required to sync gpus in DDP - dist.all_reduce(N, op=dist.ReduceOp.SUM) + if dist.is_initialized(): + dist.all_reduce(N, op=dist.ReduceOp.SUM) ema_N = self._ema_N * self.gamma + (1 - self.gamma) * N @@ -345,21 +481,31 @@ def forward(self, inputs, return_r=False): z_acc = torch.matmul(r.t(), flat_inputs) # required to sync gpus in DDP - dist.all_reduce(z_acc, op=dist.ReduceOp.SUM) + if dist.is_initialized(): + dist.all_reduce(z_acc, op=dist.ReduceOp.SUM) self._ema_z_acc = ( self.gamma * self._ema_z_acc + (1 - self.gamma) * z_acc ).detach() self.embed = (self._ema_z_acc / self._ema_N.unsqueeze(1)).detach() + if mask is not None: + z_q = z_q * mask + inputs = inputs * mask # Loss commitment_loss = F.mse_loss(z_q.detach(), inputs) loss = self.commitment_cost * commitment_loss + if mask is not None: + loss /= torch.mean(mask) # this allows to backprogate the gradients as if the output were equal to z_e z_q = inputs + (z_q - inputs).detach() # compute the perplexity - probs = torch.mean(r, dim=0) + if mask is None: + probs = torch.mean(r, dim=0) + else: + probs = torch.mean(r[mask.flatten()], dim=0) + log_perplexity = -torch.sum(probs * torch.log(probs + 1e-10)) # compute KL divergence between r and uniform categorical prior @@ -390,6 +536,22 @@ def forward(self, inputs, return_r=False): class MultiEMAKMeansVectorQuantizer(VectorQuantizer): + """Class for Mulit-group exponential moving average vector quantization layers, + where codebook vectors are trained by gradient descend losses. + The input tensors are divided into groups and quantized separately. + + Attributes: + num_groups: number of codebooks. + num_embed: codebook size. + embed_feats: feature dimension of the codebook vectors. + commitment_cost: weight for loss that makes input features close to the codebook vectors. + gamma: exponential average coefficient. + eps: epsilon for Laplace smoothing of the counts. + project: if True, it projects the input features to the embed_feats dim. + in_feats: input feature dimension, needed when project=True. + in_dim: number of dimensions of the input tensor in [2,5], needed when project=True + """ + def __init__( self, num_groups, @@ -452,15 +614,37 @@ def __str__(self): ) return s - def forward(self, inputs, return_r=False): + def forward(self, inputs, lengths=None, mask=None, return_r=False): + """Quantizes the input tensor. + + Args: + input: input tensor 2d - 5d dimension with shape=(batch, channels, ...) 
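The EMA codebook update in the training branch can be summarized standalone (single process; the Laplace smoothing of the counts with `eps`, elided in the hunk above, is omitted here):

```python
import torch

gamma = 0.99
num_embed, embed_feats, num_frames = 8, 4, 32
embed = torch.randn(num_embed, embed_feats)  # current codebook
ema_N = torch.ones(num_embed)                # EMA of assignment counts
ema_z_acc = embed.clone()                    # EMA of per-codeword feature sums

flat_inputs = torch.randn(num_frames, embed_feats)
q_idx = torch.cdist(flat_inputs, embed).argmin(dim=1)
r = torch.nn.functional.one_hot(q_idx, num_embed).float()  # one-hot responsibilities

N = r.sum(dim=0)                             # frames assigned to each codeword
ema_N = gamma * ema_N + (1 - gamma) * N
z_acc = r.t() @ flat_inputs                  # feature sum per codeword
ema_z_acc = gamma * ema_z_acc + (1 - gamma) * z_acc
# new codebook = EMA mean (the real layer Laplace-smooths ema_N before dividing)
embed = ema_z_acc / ema_N.clamp(min=1e-5).unsqueeze(1)
```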
+ lengths: when inputs is 3d, it the length of each sequence in the batch. + Not used if mask is given. + mask: indicates which elements are valid, to quantize. The elements with zero + mask are set to 0. The mask tensor should have the same shape as the + input tensor with the channel dimension removed, shape=(batch, ...). + return_r: it True, it returns the responsibilities. + + Returns: + Dictionary containing quantized vectors, vq_loss, KL(q(z)||p(z)), where q(z) is + the distribution of posterior responsabilities and p(z) is a uniform categorical + distribution, and the log_perplexity of the responsibilities. If return_r is True, + it also returns the responsibilities. + """ if self.project: inputs = self._proj(inputs) + if mask is None and lengths is not None: + mask = seq_lengths_to_mask( + lengths, inputs.size(-1), time_dim=1, dtype=inputs.dtype + ) + inputs = inputs.chunk(self.num_groups, dim=1) z_q = [] r = [] for i in range(self.num_groups): - output_i = self.vq_layers[i](inputs[i]) + output_i = self.vq_layers[i](inputs[i], mask=mask) z_qi = output_i["z_q"] loss_i = output_i["loss"] kldiv_ri = output_i["kldiv_qrpr"] diff --git a/hyperion/torch/loggers/__init__.py b/hyperion/torch/loggers/__init__.py index c48b9965..8842393c 100644 --- a/hyperion/torch/loggers/__init__.py +++ b/hyperion/torch/loggers/__init__.py @@ -3,9 +3,9 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from .csv_logger import CSVLogger from .logger import Logger from .logger_list import LoggerList -from .csv_logger import CSVLogger from .prog_logger import ProgLogger from .tensorboard_logger import TensorBoardLogger from .wandb_logger import WAndBLogger diff --git a/hyperion/torch/loggers/csv_logger.py b/hyperion/torch/loggers/csv_logger.py index 402ddcd5..67fdc464 100644 --- a/hyperion/torch/loggers/csv_logger.py +++ b/hyperion/torch/loggers/csv_logger.py @@ -3,9 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os import csv +import os from collections import OrderedDict as ODict + import numpy as np from .logger import Logger diff --git a/hyperion/torch/loggers/logger.py b/hyperion/torch/loggers/logger.py index 46c1130d..7e9c91f2 100644 --- a/hyperion/torch/loggers/logger.py +++ b/hyperion/torch/loggers/logger.py @@ -4,6 +4,7 @@ """ import numpy as np + import torch.distributed as dist diff --git a/hyperion/torch/loggers/logger_list.py b/hyperion/torch/loggers/logger_list.py index 20ae58ec..0291a01f 100644 --- a/hyperion/torch/loggers/logger_list.py +++ b/hyperion/torch/loggers/logger_list.py @@ -4,6 +4,7 @@ """ import numpy as np + import torch.distributed as dist from .tensorboard_logger import TensorBoardLogger as TBL diff --git a/hyperion/torch/loggers/prog_logger.py b/hyperion/torch/loggers/prog_logger.py index 26479197..8df63b15 100644 --- a/hyperion/torch/loggers/prog_logger.py +++ b/hyperion/torch/loggers/prog_logger.py @@ -3,8 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import time import logging +import time from collections import OrderedDict import numpy as np diff --git a/hyperion/torch/loggers/tensorboard_logger.py b/hyperion/torch/loggers/tensorboard_logger.py index 314757d1..a80fa175 100644 --- a/hyperion/torch/loggers/tensorboard_logger.py +++ b/hyperion/torch/loggers/tensorboard_logger.py @@ -3,6 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import re + from torch.utils.tensorboard import SummaryWriter from .logger import Logger diff --git a/hyperion/torch/loggers/wandb_logger.py 
b/hyperion/torch/loggers/wandb_logger.py index c864e9b1..094f619a 100644 --- a/hyperion/torch/loggers/wandb_logger.py +++ b/hyperion/torch/loggers/wandb_logger.py @@ -2,8 +2,8 @@ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import re import os +import re try: import wandb diff --git a/hyperion/torch/losses/__init__.py b/hyperion/torch/losses/__init__.py index bf3ce279..56ad2a5d 100644 --- a/hyperion/torch/losses/__init__.py +++ b/hyperion/torch/losses/__init__.py @@ -4,3 +4,4 @@ """ from .bce_with_llr import BCEWithLLR +from .dino_loss import CosineDINOLoss, DINOLoss diff --git a/hyperion/torch/losses/dino_loss.py b/hyperion/torch/losses/dino_loss.py new file mode 100644 index 00000000..c5f499c8 --- /dev/null +++ b/hyperion/torch/losses/dino_loss.py @@ -0,0 +1,245 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging + +import torch +import torch.distributed as dist +import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...utils.misc import filter_func_args + + +class DINOLoss(nn.Module): + """Loss for Training DIstillation with NO labels. + + Args: + num_classes: number of DINO classes + student_temp: temperature of student distribution + teacher_temp: final temperature of teacher distribution + teacher_warmup_temp: initial temperature of teacher distribution + temp_warmup_epochs: warmup epochs for the teacher temperature + center_momentum: momumntum for centering of the teacher distribution + """ + + def __init__( + self, + num_classes: int, + student_temp: float = 0.1, + teacher_temp: float = 0.04, + teacher_warmup_temp: float = 0.04, + temp_warmup_epochs: int = 30, + center_momentum: float = 0.9, + ): + super().__init__() + self.num_classes = num_classes + self.student_temp = student_temp + self.teacher_temp = teacher_temp + self.teacher_warmup_temp = teacher_warmup_temp + self.temp_warmup_epochs = temp_warmup_epochs + self.center_momentum = center_momentum + self.cur_teacher_temp = teacher_warmup_temp + self.register_buffer("center", torch.zeros(1, num_classes)) + + def update_temp(self, epoch: int): + if epoch < self.temp_warmup_epochs: + self.cur_teacher_temp = ( + self.teacher_warmup_temp + + (self.teacher_temp - self.teacher_warmup_temp) + * epoch + / self.temp_warmup_epochs + ) + logging.info("updating dino-loss teacher temp=%.3f", self.cur_teacher_temp) + else: + self.cur_teacher_temp = self.teacher_temp + + def forward( + self, + student_pred: torch.Tensor, + teacher_pred: torch.Tensor, + num_student_crops: int, + num_teacher_crops: int, + ): + """ + Cross-entropy between softmax outputs of the teacher and student networks. 
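A sketch of driving the loss in a multi-crop setup; the crop counts and head dimension are illustrative, while the import is grounded in the `losses/__init__.py` change above:

```python
import torch
from hyperion.torch.losses import DINOLoss

dino_loss = DINOLoss(
    num_classes=65536,
    teacher_temp=0.07,
    teacher_warmup_temp=0.04,
    temp_warmup_epochs=30,
)
dino_loss.train()
dino_loss.update_temp(epoch=5)  # anneal the teacher temperature during warmup

batch, num_teacher_crops, num_student_crops = 16, 2, 4
# head outputs are stacked along the batch dimension, one chunk per crop
teacher_pred = torch.randn(batch * num_teacher_crops, 65536)
student_pred = torch.randn(batch * num_student_crops, 65536)
loss = dino_loss(student_pred, teacher_pred, num_student_crops, num_teacher_crops)
```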
+ """ + assert not torch.any(torch.isnan(student_pred)), f"loss/student is nan" + student_pred = student_pred / self.student_temp + assert not torch.any(torch.isnan(student_pred)), f"loss/p is nan" + student_pred = student_pred.chunk(num_student_crops) + teacher_pred = teacher_pred.detach() + center = self.center # we take the center before updating it + if self.training: + self.update_center(teacher_pred) + assert not torch.any(torch.isnan(teacher_pred)), f"loss/teacher is nan" + teacher_pred = nn.functional.softmax( + (teacher_pred - center) / self.cur_teacher_temp, dim=-1 + ) + assert not torch.any(torch.isnan(teacher_pred)), f"loss/q is nan {center}" + teacher_pred = teacher_pred.chunk(num_teacher_crops) + + total_loss = 0 + n_loss_terms = 0 + for iq, q in enumerate(teacher_pred): + for ip, p in enumerate(student_pred): + if ip == iq and num_teacher_crops > 1: + # we skip cases where student and teacher operate on the same view + continue + loss = torch.sum(-q * nn.functional.log_softmax(p, dim=-1), dim=-1) + assert not torch.any( + torch.isnan(loss) + ), f"loss is nan {iq} {ip} {torch.mean(q)} {torch.mean(p)} {torch.mean(center)}" + total_loss += loss.mean() + n_loss_terms += 1 + total_loss /= n_loss_terms + return total_loss + + @torch.no_grad() + def update_center(self, teacher_pred: torch.Tensor): + """ + Update center used for teacher output. + """ + batch_acc = torch.sum(teacher_pred, dim=0, keepdim=True) + batch_size = torch.as_tensor(teacher_pred.size(0), device=batch_acc.device) + if dist.is_initialized(): + dist.all_reduce(batch_size, op=dist.ReduceOp.SUM) + dist.all_reduce(batch_acc, op=dist.ReduceOp.SUM) + + batch_center = batch_acc / batch_size + assert not torch.any( + torch.isnan(batch_center) + ), f"bc is nan {torch.mean(batch_acc)} {batch_size}" + # ema update + self.center = self.center * self.center_momentum + batch_center * ( + 1 - self.center_momentum + ) + + @staticmethod + def filter_args(**kwargs): + return filter_func_args(DINOLoss.__init__, kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--num-classes", default=65536, type=int, help="number of DINO classes" + ) + parser.add_argument( + "--student-temp", + default=0.1, + type=float, + help="temperature of student distribution", + ) + parser.add_argument( + "--teacher-temp", + default=0.07, + type=float, + help="final temperature of teacher distribution", + ) + parser.add_argument( + "--teacher-warmup-temp", + default=0.04, + type=float, + help="initial temperature of teacher distribution", + ) + parser.add_argument( + "--temp-warmup-epochs", + default=30, + type=int, + help="warmup epochs for the teacher temperature", + ) + parser.add_argument( + "--center-momentum", + default=0.9, + type=float, + help="momumntum for centering of the teacher distribution", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + +class CosineDINOLoss(nn.Module): + """Cosine Loss to regularize DINO + and enforze DINO embeddings to be suitable for cosine scoring + + """ + + def __init__( + self, + scale: float = 1.0, + warmup_epochs: int = 30, + ): + super().__init__() + self.scale = scale + self.warmup_epochs = warmup_epochs + self.cur_scale = scale + + def update_scale(self, epoch: int): + if epoch < self.warmup_epochs: + self.cur_scale = self.scale * epoch / self.warmup_epochs + logging.info("updating cosine-loss scale=%.3f", 
self.cur_scale) + else: + self.cur_scale = self.scale + + def forward( + self, + student_embed: torch.Tensor, + teacher_embed: torch.Tensor, + num_student_crops: int, + num_teacher_crops: int, + ): + """ + Cosine scoring between embeddings of the teacher and student networks. + """ + if self.scale == 0: + return 0 + + student_embed = torch.nn.functional.normalize(student_embed, dim=-1) + teacher_embed = torch.nn.functional.normalize(teacher_embed, dim=-1) + student_embed = student_embed.chunk(num_student_crops) + teacher_embed = teacher_embed.detach() + teacher_embed = teacher_embed.chunk(num_teacher_crops) + + total_loss = 0 + n_loss_terms = 0 + for iq, q in enumerate(teacher_embed): + for ip, p in enumerate(student_embed): + if ip == iq and num_teacher_crops > 1: + # we skip cases where student and teacher operate on the same view + continue + loss = 1 - torch.sum(q * p, dim=-1) + total_loss += loss.mean() + n_loss_terms += 1 + total_loss /= n_loss_terms + + return self.cur_scale * total_loss, total_loss + + @staticmethod + def filter_args(**kwargs): + return filter_func_args(CosineDINOLoss.__init__, kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--scale", default=0, type=float, help="Scale of Cosine loss to reg. DINO" + ) + parser.add_argument( + "--warmup-epochs", + default=30, + type=int, + help="warmup epochs for the scale", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/lr_schedulers/__init__.py b/hyperion/torch/lr_schedulers/__init__.py index f0a3465e..7d1b07db 100644 --- a/hyperion/torch/lr_schedulers/__init__.py +++ b/hyperion/torch/lr_schedulers/__init__.py @@ -4,8 +4,11 @@ """ -from .lr_scheduler import LRScheduler -from .red_lr_on_plateau import ReduceLROnPlateau +from .cos_lr import AdamCosineLR, CosineLR from .exp_lr import ExponentialLR -from .cos_lr import CosineLR, AdamCosineLR from .factory import LRSchedulerFactory +from .invpow_lr import InvPowLR +from .lr_scheduler import LRScheduler +from .noam_lr import NoamLR +from .red_lr_on_plateau import ReduceLROnPlateau +from .triangular_lr import TriangularLR diff --git a/hyperion/torch/lr_schedulers/cos_lr.py b/hyperion/torch/lr_schedulers/cos_lr.py index 6e36cf2a..c2ea8ec3 100644 --- a/hyperion/torch/lr_schedulers/cos_lr.py +++ b/hyperion/torch/lr_schedulers/cos_lr.py @@ -3,9 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - -import math import logging +import math import torch @@ -25,16 +24,26 @@ class CosineLR(LRScheduler): When epoch=-1, sets initial lr as lr. It has been proposed in - `SGDR: Stochastic Gradient Descent with Warm Restarts`_. - - Args: - optimizer (Optimizer): Wrapped optimizer. - T_max (int): Maximum number of iterations. - eta_min (float): Minimum learning rate. Default: 0. - epoch (int): The index of last epoch. Default: -1. - .. _SGDR\: Stochastic Gradient Descent with Warm Restarts: https://arxiv.org/abs/1608.03983 + + Attributes: + optimizer: Pytorch optimizer object. + T: period of the cycle. + T_mul: period multiplier, after each cycle the period is multiplied by T_mul. + min_lr: minimum learning rate. + warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr. + warm_restarts: whether or not to do warm restarts. + gamma: after each period, the maximum lr is multiplied by gamma. 
+ last_restart: what is the step when the last restart happened, , this is used + to restart the training from a checkpoint. + num_restarts: how many restarts, we have done, this is used to restart the + training from a checkpoint. + epoch: initial training training epoch, this is needed to restart the model + training. + step: initial training step, this is needed to restart the model training. + update_lr_on_opt_step: if True, updates the lr each time we update the model, + otherwise after each epoch. """ def __init__( @@ -53,7 +62,7 @@ def __init__( update_lr_on_opt_step=False, ): - super(CosineLR, self).__init__( + super().__init__( optimizer, min_lr, warmup_steps, epoch, step, update_lr_on_opt_step ) self.T = T @@ -64,7 +73,7 @@ def __init__( self.gamma = gamma def on_epoch_begin(self, epoch=None, epoch_updates=1, **kwargs): - super(CosineLR, self).on_epoch_begin(epoch) + super().on_epoch_begin(epoch) if self.update_lr_on_opt_step: # T has to correspond to an integer number of epochs T = int(math.ceil(self.T / epoch_updates) * epoch_updates) @@ -92,7 +101,7 @@ def get_lr(self, step): else: return self.min_lrs - alpha = self.gamma ** self.num_restarts + alpha = self.gamma**self.num_restarts r = math.pi / self.T return [ @@ -108,6 +117,29 @@ def get_lr(self, step): class AdamCosineLR(CosineLR): + r"""Set the learning rate of each parameter group using a cosine annealing + schedule when using adam optimizer + + Attributes: + optimizer: Pytorch optimizer object. + T: period of the cycle. + T_mul: period multiplier, after each cycle the period is multiplied by T_mul. + hold_steps: number of steps until the lr starts decaying. + min_lr: minimum learning rate. + warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr. + warm_restarts: whether or not to do warm restarts. + gamma: after each period, the maximum lr is multiplied by gamma. + last_restart: what is the step when the last restart happened, , this is used + to restart the training from a checkpoint. + num_restarts: how many restarts, we have done, this is used to restart the + training from a checkpoint. + epoch: initial training training epoch, this is needed to restart the model + training. + step: initial training step, this is needed to restart the model training. + update_lr_on_opt_step: if True, updates the lr each time we update the model, + otherwise after each epoch. + """ + def __init__( self, optimizer, @@ -122,7 +154,7 @@ def __init__( step=-1, update_lr_on_opt_step=False, ): - super(AdamCosineLR, super).__init__( + super().__init__( optimizer, T, T_mul, @@ -143,12 +175,12 @@ def get_lr(self, step): if self.warm_restarts: self.last_restart = step x = 0 - self.T *= T_mul + self.T *= self.T_mul self.num_restarts += 1 else: return self.min_lrs - alpha = gamma ** self.num_restarts + alpha = self.gamma**self.num_restarts r = math.pi / self.T return [ diff --git a/hyperion/torch/lr_schedulers/exp_lr.py b/hyperion/torch/lr_schedulers/exp_lr.py index cbe00a01..66edf436 100644 --- a/hyperion/torch/lr_schedulers/exp_lr.py +++ b/hyperion/torch/lr_schedulers/exp_lr.py @@ -10,7 +10,21 @@ class ExponentialLR(LRScheduler): - """Exponential learning rate scheduler.""" + """Exponential learning rate scheduler. + + Attributes: + optimizer: Pytorch optimizer object. + decay_rate: the lr is multiplied by `decay_rate` after `decay_ste.ps` + decay_steps: number of decay steps. + hold_steps: number of steps until the lr starts decaying. + min_lr: minimum learning rate. 
+ warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr. + epoch: initial training epoch, this is needed to restart the model + training. + step: initial training step, this is needed to restart the model training. + update_lr_on_opt_step: if True, updates the lr each time we update the model, + otherwise after each epoch. + """ def __init__( self, @@ -24,7 +38,7 @@ def __init__( step=0, update_lr_on_opt_step=False, ): - super(ExponentialLR, self).__init__( + super().__init__( optimizer, min_lr, warmup_steps, epoch, step, update_lr_on_opt_step ) self.decay_rate = decay_rate diff --git a/hyperion/torch/lr_schedulers/factory.py b/hyperion/torch/lr_schedulers/factory.py index 9e185a7c..f2886203 100644 --- a/hyperion/torch/lr_schedulers/factory.py +++ b/hyperion/torch/lr_schedulers/factory.py @@ -2,17 +2,18 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser - import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from .red_lr_on_plateau import ReduceLROnPlateau +from .cos_lr import AdamCosineLR, CosineLR from .exp_lr import ExponentialLR from .invpow_lr import InvPowLR -from .cos_lr import CosineLR, AdamCosineLR +from .noam_lr import NoamLR +from .red_lr_on_plateau import ReduceLROnPlateau +from .triangular_lr import TriangularLR -class LRSchedulerFactory(object): +class LRSchedulerFactory: def create( optimizer, lrsch_type, @@ -34,8 +35,56 @@ def create( eps=1e-8, min_lr=0, warmup_steps=0, + d_model=None, + lr_factor=1, update_lr_on_opt_step=False, ): + """Creates a learning rate scheduler object. + + Args: + optimizer: Pytorch optimizer object. + lrsch_type: type of scheduler in ["none", "exp_lr", "invpow_lr", + "cos_lr", "adamcos_lr", "red_lr_on_plateau", "noam_lr", + "triangular_lr"]. + decay_rate: the lr is multiplied by `decay_rate` after `decay_steps` + decay_steps: number of decay steps. + power: the step/epoch number is raised to this power to compute the decay. + hold_steps: number of steps until the lr starts decaying. + t: period of the cycle. + t_mul: period multiplier, after each cycle the period is multiplied by t_mul. + warm_restarts: whether or not to do warm restarts. + gamma: after each period, the maximum lr is multiplied by gamma, in cyclic schedulers. + monitor: which metric to monitor in RedLROnPlateau scheduler. + mode (str): One of `min`, `max`. In `min` mode, lr will + be reduced when the quantity monitored has stopped + decreasing; in `max` mode it will be reduced when the + quantity monitored has stopped increasing. Default: 'min'. + factor (float): Factor by which the learning rate will be + reduced. new_lr = lr * factor. Default: 0.1. + patience (int): Number of epochs with no improvement after + which learning rate will be reduced. For example, if + `patience = 2`, then we will ignore the first 2 epochs + with no improvement, and will only decrease the LR after the + 3rd epoch if the loss still hasn't improved then. + threshold (float): Threshold for measuring the new optimum, + to only focus on significant changes. Default: 1e-4. + threshold_mode (str): One of `rel`, `abs`. In `rel` mode, + dynamic_threshold = best * ( 1 + threshold ) in 'max' + mode or best * ( 1 - threshold ) in `min` mode. + In `abs` mode, dynamic_threshold = best + threshold in + `max` mode or best - threshold in `min` mode. Default: 'rel'.
+ cooldown (int): Number of epochs to wait before resuming + normal operation after lr has been reduced. Default: 0. + eps (float): Minimal decay applied to lr. If the difference + between new and old lr is smaller than eps, the update is + ignored. Default: 1e-8. + min_lr: minimum learning rate. + warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr. + d_model: hidden dimension of transformer model. + lr_factor: multiplies the Noam lr by this number. + update_lr_on_opt_step: if True, updates the lr each time we update the model, + otherwise after each epoch. + """ if lrsch_type == "none": return None @@ -61,6 +111,15 @@ def create( update_lr_on_opt_step=update_lr_on_opt_step, ) + if lrsch_type == "noam_lr": + return NoamLR( + optimizer, + d_model, + lr_factor, + min_lr=min_lr, + warmup_steps=warmup_steps, + ) + if lrsch_type == "cos_lr": return CosineLR( optimizer, @@ -73,6 +132,16 @@ def create( update_lr_on_opt_step=update_lr_on_opt_step, ) + if lrsch_type == "triangular_lr": + return TriangularLR( + optimizer, + t, + t_mul, + min_lr=min_lr, + gamma=gamma, + update_lr_on_opt_step=update_lr_on_opt_step, + ) + if lrsch_type == "adamcos_lr": return AdamCosineLR( optimizer, @@ -99,9 +168,10 @@ def create( eps=eps, ) + raise ValueError(f"invalid lrsch_type={lrsch_type}") + @staticmethod def filter_args(**kwargs): - valid_args = ( "lrsch_type", "decay_rate", @@ -122,6 +192,8 @@ def filter_args(**kwargs): "eps", "min_lr", "warmup_steps", + "lr_factor", + "d_model", "update_lr_on_opt_step", ) @@ -144,6 +216,8 @@ def add_class_args(parser, prefix=None): "cos_lr", "adamcos_lr", "red_lr_on_plateau", + "noam_lr", + "triangular_lr", ], help=( "Learning rate schedulers: None, Exponential," @@ -173,19 +247,19 @@ def add_class_args(parser, prefix=None): "--t-mul", default=1, type=int, - help=("Period multiplicator for each restart in cos lr"), + help=("Period multiplier for each restart in cos/triangular lr"), ) parser.add_argument( "--gamma", - default=1 / 100, + default=1.0, type=float, - help=("LR decay rate for each restart in cos lr"), + help=("LR decay rate for each restart in cos/triangular lr"), ) parser.add_argument( "--warm-restarts", default=False, - action="store_true", + action=ActionYesNo, help=("Do warm restarts in cos lr"), ) @@ -248,10 +322,22 @@ def add_class_args(parser, prefix=None): help=("Number of batches to warmup lr"), ) + parser.add_argument( + "--d-model", + default=None, + type=int, + help=("Transformer model hidden dimension"), + ) + parser.add_argument( + "--lr-factor", + default=1, + type=float, + help=("learning rate scaling factor for Noam schedule"), + ) parser.add_argument( "--update-lr-on-opt-step", default=False, - action="store_true", + action=ActionYesNo, help=("Update lr based on batch number instead of epoch number"), ) diff --git a/hyperion/torch/lr_schedulers/invpow_lr.py b/hyperion/torch/lr_schedulers/invpow_lr.py index 53aa28dc..db420a0f 100644 --- a/hyperion/torch/lr_schedulers/invpow_lr.py +++ b/hyperion/torch/lr_schedulers/invpow_lr.py @@ -10,7 +10,20 @@ class InvPowLR(LRScheduler): - """inverse power learning rate scheduler.""" + """inverse power decay learning rate scheduler. + + Attributes: + optimizer: Pytorch optimizer object. + power: the step/epoch number is raised to this power to compute the decay. + hold_steps: number of steps until the lr starts decaying. + min_lr: minimum learning rate. + warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr.
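A usage sketch of the factory with the new Noam options (the model and hyperparameter values are assumed):

```python
import torch
from hyperion.torch.lr_schedulers import LRSchedulerFactory

model = torch.nn.Linear(256, 256)
optimizer = torch.optim.Adam(model.parameters(), lr=1.0)  # NoamLR rescales group lrs
lr_sched = LRSchedulerFactory.create(
    optimizer,
    "noam_lr",
    d_model=256,      # transformer hidden dimension
    lr_factor=5.0,    # scales the Noam peak lr
    warmup_steps=25000,
)
```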
+ epoch: initial training training epoch, this is needed to restart the model + training. + step: initial training step, this is needed to restart the model training. + update_lr_on_opt_step: if True, updates the lr each time we update the model, + otherwise after each epoch. + """ def __init__( self, @@ -23,7 +36,7 @@ def __init__( step=0, update_lr_on_opt_step=False, ): - super(InvPowLR, self).__init__( + super().__init__( optimizer, min_lr, warmup_steps, epoch, step, update_lr_on_opt_step ) self.power = power diff --git a/hyperion/torch/lr_schedulers/lr_scheduler.py b/hyperion/torch/lr_schedulers/lr_scheduler.py index 319ea7a2..d609bf26 100644 --- a/hyperion/torch/lr_schedulers/lr_scheduler.py +++ b/hyperion/torch/lr_schedulers/lr_scheduler.py @@ -3,13 +3,23 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - import torch import torch.optim as optim -class LRScheduler(object): - """Base class for learning rate schedulers""" +class LRScheduler: + """Base class for learning rate schedulers. + + Attributes: + optimizer: Pytorch optimizer object. + min_lr: minimum learning rate. + warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr. + epoch: initial training training epoch, this is needed to restart the model + training. + step: initial training step, this is needed to restart the model training. + update_lr_on_opt_step: if True, updates the lr each time we update the model, + otherwise after each epoch. + """ def __init__( self, @@ -56,7 +66,7 @@ def __init__( @property def in_warmup(self): - return self.step <= self.warmup_steps + return self.step < self.warmup_steps def state_dict(self): """Returns the state of the scheduler as a :class:`dict`. @@ -80,7 +90,7 @@ def load_state_dict(self, state_dict): def get_warmup_lr(self): x = self.step return [ - (base_lr - min_lr) / self.warmup_steps * x + min_lr + (base_lr - min(min_lr, 1e-8)) / self.warmup_steps * x + min(min_lr, 1e-8) for base_lr, min_lr in zip(self.base_lrs, self.min_lrs) ] @@ -103,10 +113,6 @@ def on_epoch_end(self, metrics=None): self.epoch += 1 def on_opt_step(self): - - # self.update_lr_on_opt_step=True - # print('exp-lr', self.last_step, self.hold_steps, self.decay_rate, self.decay_steps) - if self.in_warmup: for param_group, lr in zip( self.optimizer.param_groups, self.get_warmup_lr() diff --git a/hyperion/torch/lr_schedulers/noam_lr.py b/hyperion/torch/lr_schedulers/noam_lr.py new file mode 100644 index 00000000..c075b919 --- /dev/null +++ b/hyperion/torch/lr_schedulers/noam_lr.py @@ -0,0 +1,60 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import math + +from .invpow_lr import InvPowLR + +# import torch + + + +class NoamLR(InvPowLR): + """Optimizer used for Transformers in + Attention is all You Need: https://arxiv.org/pdf/1706.03762.pdf + + This is Inverse Power Law decay scheduler with parameters that depend on + the transformer hidden dimension. + + Attributes: + optimizer: Pytorch optimizer object. + d_model: hidden dimension of transformer model. + lr_factor: multiplies the Noam lr by this number. + min_lr: minimum learning rate. + warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr. + epoch: initial training training epoch, this is needed to restart the model + training. + step: initial training step, this is needed to restart the model training. 
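For reference, the updated `get_warmup_lr` ramp re-implemented standalone; the `min(min_lr, 1e-8)` term keeps the ramp starting near zero even when `min_lr` is large:

```python
base_lr, min_lr, warmup_steps = 1e-3, 0.0, 1000
floor = min(min_lr, 1e-8)
for step in (0, 250, 500, 999):
    lr = (base_lr - floor) / warmup_steps * step + floor
    print(step, lr)  # linear ramp; from step >= warmup_steps, get_lr takes over
```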
+ + """ + def __init__( + self, + optimizer, + d_model, + lr_factor=1, + min_lr=0, + warmup_steps=0, + epoch=0, + step=0, + ): + lr = lr_factor / math.sqrt(d_model * warmup_steps) + logging.info("Noam lr=%f", lr) + # we scale the lr taking account the relative + # learning rates in the param_groups + # in order to be able to have different lr for + # different modules of the model + max_lr = 0 + for group in optimizer.param_groups: + max_lr = max(group["lr"], max_lr) + for group in optimizer.param_groups: + group["lr"] = lr * group["lr"] / max_lr + super().__init__( + optimizer, + min_lr=min_lr, + warmup_steps=warmup_steps, + epoch=epoch, + step=step, + update_lr_on_opt_step=True, + ) diff --git a/hyperion/torch/lr_schedulers/red_lr_on_plateau.py b/hyperion/torch/lr_schedulers/red_lr_on_plateau.py index 8d9eb4bf..3f7b2ec7 100644 --- a/hyperion/torch/lr_schedulers/red_lr_on_plateau.py +++ b/hyperion/torch/lr_schedulers/red_lr_on_plateau.py @@ -7,7 +7,11 @@ from functools import partial import torch -from torch._six import inf + +try: + from torch import inf +except: + from torch._six import inf from .lr_scheduler import LRScheduler @@ -21,6 +25,7 @@ class ReduceLROnPlateau(LRScheduler): Attributes: optimizer (Optimizer): optimizer. + monitor: which metric to monitor. mode (str): One of `min`, `max`. In `min` mode, lr will be reduced when the quantity monitored has stopped decreasing; in `max` mode it will be reduced when the @@ -45,6 +50,7 @@ class ReduceLROnPlateau(LRScheduler): min_lr (float or list): A scalar or a list of scalars. A lower bound on the learning rate of all param groups or each group respectively. Default: 0. + warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr. eps (float): Minimal decay applied to lr. If the difference between new and old lr is smaller than eps, the update is ignored. Default: 1e-8. @@ -64,7 +70,7 @@ def __init__( warmup_steps=0, eps=1e-8, ): - super(ReduceLROnPlateau, self).__init__( + super().__init__( optimizer, min_lr, warmup_steps, diff --git a/hyperion/torch/lr_schedulers/triangular_lr.py b/hyperion/torch/lr_schedulers/triangular_lr.py new file mode 100644 index 00000000..0a5efd38 --- /dev/null +++ b/hyperion/torch/lr_schedulers/triangular_lr.py @@ -0,0 +1,97 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + + +import logging +import math + +import torch + +from .lr_scheduler import LRScheduler + + +class TriangularLR(LRScheduler): + r"""Sets cyclid triangular learning rate schedule as proposed in + .. Cyclical Learning Rates for Training Neural Networks: + https://arxiv.org/abs/1506.01186 + + .. math:: + \mathrm{cycle} = \mathrm{floor}(1 + \frac{T_{cur}}{T_{max}}) + x = \mathrm{abs}(2\frac{T_{cur}}{T_{max}}-2\mathrm{cycle}+1) + \eta_t = \eta_{min} + (\eta_{max} - \eta_{min})\max(0, 1-x) + + Attributes: + optimizer: Pytorch optimizer object. + T: period of the cycle. + T_mul: period multiplier, after each cycle the period is multiplied by T_mul. + hold_steps: number of steps until the lr starts decaying. + min_lr: minimum learning rate. + warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr. + gamma: after each period, the maximum lr is multiplied by gamma. + last_restart: what is the step when the last restart happened, , this is used + to restart the training from a checkpoint. + num_restarts: how many restarts, we have done, this is used to restart the + training from a checkpoint. 
+ epoch: initial training training epoch, this is needed to restart the model + training. + step: initial training step, this is needed to restart the model training. + update_lr_on_opt_step: if True, updates the lr each time we update the model, + otherwise after each epoch. + """ + + def __init__( + self, + optimizer, + T, + T_mul=1, + min_lr=0, + gamma=1, + last_restart=0, + num_restarts=0, + epoch=0, + step=0, + update_lr_on_opt_step=False, + ): + super().__init__(optimizer, min_lr, 0, epoch, step, update_lr_on_opt_step) + self.T = T + self.T_mul = T_mul + self.last_restart = last_restart + self.num_restarts = num_restarts + self.gamma = gamma + + def load_state_dict(self, state_dict): + # we want to be able to change gamma and T_mul in the middle of training + del state_dict["gamma"] + del state_dict["T_mul"] + super().load_state_dict(state_dict) + + def on_epoch_begin(self, epoch=None, epoch_updates=1, **kwargs): + super().on_epoch_begin(epoch) + if self.update_lr_on_opt_step: + # T has to correspond to an integer number of epochs + T = int(math.ceil(self.T / epoch_updates) * epoch_updates) + if self.T != T: + logging.info("readjusting triangular_lr T %d -> %d", self.T, T) + self.T = T + + def get_lr(self, step): + x = step - self.last_restart + + if x >= self.T: + self.last_restart = step + x = 0 + self.T *= self.T_mul + self.num_restarts += 1 + logging.info( + "triangular_lr warm-restart=%d T=%d", self.num_restarts, self.T + ) + + alpha = self.gamma**self.num_restarts + x = abs(2 * x / self.T - 1) + + return [ + eta_min + (alpha * eta_max - eta_min) * max(0, 1 - x) + for eta_max, eta_min in zip(self.base_lrs, self.min_lrs) + ] diff --git a/hyperion/torch/metrics/__init__.py b/hyperion/torch/metrics/__init__.py index b4a2eaac..33d67c21 100644 --- a/hyperion/torch/metrics/__init__.py +++ b/hyperion/torch/metrics/__init__.py @@ -4,6 +4,6 @@ """ -from .metrics import TorchMetric -from .accuracy_functional import * from .accuracy import * +from .accuracy_functional import * +from .metrics import TorchMetric diff --git a/hyperion/torch/metrics/accuracy.py b/hyperion/torch/metrics/accuracy.py index ebd02e32..93d71683 100644 --- a/hyperion/torch/metrics/accuracy.py +++ b/hyperion/torch/metrics/accuracy.py @@ -5,8 +5,8 @@ import torch -from .metrics import TorchMetric from .accuracy_functional import * +from .metrics import TorchMetric class CategoricalAccuracy(TorchMetric): diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index be4e0441..77a2543f 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -4,13 +4,32 @@ """ -from .xvectors.xvector import XVector -from .xvectors.tdnn_xvector import TDNNXVector -from .xvectors.resnet_xvector import ResNetXVector -from .xvectors.efficient_net_xvector import EfficientNetXVector -from .xvectors.transformer_xvector_v1 import TransformerXVectorV1 -from .xvectors.spinenet_xvector import SpineNetXVector -from .xvectors.resnet1d_xvector import ResNet1dXVector - +from .transducer import RNNRNNTransducer, RNNTransducer from .vae.vae import VAE from .vae.vq_vae import VQVAE +from .wav2transducer import ( # HFWav2Vec2Transducer, + HFWav2Vec2ConformerV1RNNTransducer, + HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + Wav2ConformerV1RNNTransducer, + Wav2RNNRNNTransducer, +) +from .wav2xvectors import ( + HFHubert2ConformerV1XVector, + HFHubert2ResNet1dXVector, + HFWav2Vec2ConformerV1XVector, + HFWav2Vec2ResNet1dXVector, + HFWavLM2ConformerV1XVector, + HFWavLM2ResNet1dXVector, + 
Wav2ConformerV1XVector, + Wav2ResNet1dXVector, + Wav2ResNetXVector, +) +from .xvectors.conformer_v1_xvector import ConformerV1XVector +from .xvectors.efficient_net_xvector import EfficientNetXVector +from .xvectors.resnet1d_xvector import ResNet1dXVector +from .xvectors.resnet_xvector import ResNetXVector +from .xvectors.spinenet_xvector import SpineNetXVector +from .xvectors.tdnn_xvector import TDNNXVector +from .xvectors.transformer_xvector_v1 import TransformerXVectorV1 +from .xvectors.xvector import XVector diff --git a/hyperion/torch/models/ae/ae.py b/hyperion/torch/models/ae/ae.py index 57d30edc..32cd68ea 100644 --- a/hyperion/torch/models/ae/ae.py +++ b/hyperion/torch/models/ae/ae.py @@ -8,8 +8,8 @@ import torch import torch.nn as nn -from ...torch_model import TorchModel from ...narchs import TorchNALoader +from ...torch_model import TorchModel class AE(TorchModel): diff --git a/hyperion/torch/models/plda/plda_base.py b/hyperion/torch/models/plda/plda_base.py index d6100a36..2556627d 100644 --- a/hyperion/torch/models/plda/plda_base.py +++ b/hyperion/torch/models/plda/plda_base.py @@ -2,15 +2,15 @@ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import time import logging import math +import time import torch import torch.nn as nn from ...torch_model import TorchModel -from ...utils.misc import l2_norm, get_selfsim_tarnon +from ...utils.misc import get_selfsim_tarnon, l2_norm class PLDABase(TorchModel): diff --git a/hyperion/torch/models/plda/splda.py b/hyperion/torch/models/plda/splda.py index 0025e4e7..3a0f1dee 100644 --- a/hyperion/torch/models/plda/splda.py +++ b/hyperion/torch/models/plda/splda.py @@ -2,13 +2,13 @@ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import time import logging +import time import torch import torch.nn as nn -from ...utils.math import invert_trimat +from ...utils.math_funcs import invert_trimat from .plda_base import PLDABase diff --git a/hyperion/torch/models/transducer/__init__.py b/hyperion/torch/models/transducer/__init__.py new file mode 100644 index 00000000..331e3ef0 --- /dev/null +++ b/hyperion/torch/models/transducer/__init__.py @@ -0,0 +1,15 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" + +from .conformer_v1_rnn_transducer import ConformerV1RNNTransducer +from .rnn_rnn_transducer import RNNRNNTransducer +from .rnn_transducer import RNNTransducer, RNNTransducerOutput + +# from .transducer import Transducer + +# from .conformer import Conformer +# from .decoder import Decoder +# from .joiner import Joiner diff --git a/hyperion/torch/models/transducer/conformer.py b/hyperion/torch/models/transducer/conformer.py new file mode 100644 index 00000000..511cc178 --- /dev/null +++ b/hyperion/torch/models/transducer/conformer.py @@ -0,0 +1,1510 @@ +#!/usr/bin/env python3 +# Copyright (c) 2021 University of Chinese Academy of Sciences (author: Han Zhu) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import math
+import warnings
+from typing import List, Optional, Tuple
+
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+
+import torch
+from hyperion.utils.text import make_pad_mask, subsequent_chunk_mask
+from torch import Tensor, nn
+
+from .transformer import Transformer
+
+
+class Conformer(Transformer):
+    """
+    Args:
+        num_features (int): Number of input features
+        output_dim (int): Output dimension
+        subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
+        d_model (int): attention dimension
+        nhead (int): number of heads
+        dim_feedforward (int): feedforward dimension
+        num_encoder_layers (int): number of encoder layers
+        dropout (float): dropout rate
+        cnn_module_kernel (int): Kernel size of convolution module
+        normalize_before (bool): whether to use layer_norm before the first block.
+        vgg_frontend (bool): whether to use vgg frontend.
+        dynamic_chunk_training (bool): whether to use dynamic chunk training, if
+            you want to train a streaming model, this is expected to be True.
+            When set to True, it will use a masking strategy to make the attention
+            see only limited left and right context.
+        short_chunk_threshold (float): a threshold to determine the chunk size
+            to be used in masking training, if the randomly generated chunk size
+            is greater than ``max_len * short_chunk_threshold`` (max_len is the
+            max sequence length of current batch) then it will use
+            full context in training (i.e. with chunk size equal to max_len).
+            This will be used only when dynamic_chunk_training is True.
+        short_chunk_size (int): see docs above, if the randomly generated chunk
+            size is equal to or less than ``max_len * short_chunk_threshold``, the
+            chunk size will be sampled uniformly from 1 to short_chunk_size.
+            This also will be used only when dynamic_chunk_training is True.
+        num_left_chunks (int): the left context (in chunks) attention can see, the
+            chunk size is decided by short_chunk_threshold and short_chunk_size.
+            A negative value means seeing full left context.
+            This also will be used only when dynamic_chunk_training is True.
+        causal (bool): Whether to use causal convolution in conformer encoder
+            layer. This MUST be True when using dynamic_chunk_training.
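+
+    Example (illustrative sketch; 80-dim features and a 512-dim output are
+    assumed, and the time axis is subsampled by a factor of 4)::
+
+        >>> conformer = Conformer(num_features=80, output_dim=512)
+        >>> x = torch.rand(2, 100, 80)                 # (batch, time, feats)
+        >>> x_lens = torch.tensor([100, 98])
+        >>> logits, logit_lens = conformer(x, x_lens)  # logits: (2, 24, 512)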
+ """ + + def __init__( + self, + num_features: int, + output_dim: int, + subsampling_factor: int = 4, + d_model: int = 256, + nhead: int = 4, + dim_feedforward: int = 2048, + num_encoder_layers: int = 12, + dropout: float = 0.1, + cnn_module_kernel: int = 31, + normalize_before: bool = True, + vgg_frontend: bool = False, + dynamic_chunk_training: bool = False, + short_chunk_threshold: float = 0.75, + short_chunk_size: int = 25, + num_left_chunks: int = -1, + causal: bool = False, + ) -> None: + super(Conformer, self).__init__( + num_features=num_features, + output_dim=output_dim, + subsampling_factor=subsampling_factor, + d_model=d_model, + nhead=nhead, + dim_feedforward=dim_feedforward, + num_encoder_layers=num_encoder_layers, + dropout=dropout, + normalize_before=normalize_before, + vgg_frontend=vgg_frontend, + ) + + self.encoder_layers = num_encoder_layers + self.d_model = d_model + self.cnn_module_kernel = cnn_module_kernel + self.causal = causal + + self.dynamic_chunk_training = dynamic_chunk_training + self.short_chunk_threshold = short_chunk_threshold + self.short_chunk_size = short_chunk_size + self.num_left_chunks = num_left_chunks + + self.encoder_pos = RelPositionalEncoding(d_model, dropout) + + encoder_layer = ConformerEncoderLayer( + d_model, + nhead, + dim_feedforward, + dropout, + cnn_module_kernel, + normalize_before, + causal, + ) + self.encoder = ConformerEncoder(encoder_layer, num_encoder_layers) + self.normalize_before = normalize_before + if self.normalize_before: + self.after_norm = nn.LayerNorm(d_model) + else: + # Note: TorchScript detects that self.after_norm could be used inside forward() + # and throws an error without this change. + self.after_norm = identity + + self._init_state: List[torch.Tensor] = [torch.empty(0)] + + def forward( + self, x: torch.Tensor, x_lens: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x: + The input tensor. Its shape is (batch_size, seq_len, feature_dim). + x_lens: + A tensor of shape (batch_size,) containing the number of frames in + `x` before padding. + Returns: + Return a tuple containing 2 tensors: + - logits, its shape is (batch_size, output_seq_len, output_dim) + - logit_lens, a tensor of shape (batch_size,) containing the number + of frames in `logits` before padding. + """ + x = self.encoder_embed(x) + x, pos_emb = self.encoder_pos(x) + x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C) + + # Caution: We assume the subsampling factor is 4! + + # lengths = ((x_lens - 1) // 2 - 1) // 2 # issue an warning + # + # Note: rounding_mode in torch.div() is available only in torch >= 1.8.0 + lengths = (((x_lens - 1) >> 1) - 1) >> 1 + + assert x.size(0) == lengths.max().item() + + src_key_padding_mask = make_pad_mask(lengths) + + if self.dynamic_chunk_training: + assert ( + self.causal + ), "Causal convolution is required for streaming conformer." 
+ max_len = x.size(0) + chunk_size = torch.randint(1, max_len, (1,)).item() + if chunk_size > (max_len * self.short_chunk_threshold): + chunk_size = max_len + else: + chunk_size = chunk_size % self.short_chunk_size + 1 + + mask = ~subsequent_chunk_mask( + size=x.size(0), + chunk_size=chunk_size, + num_left_chunks=self.num_left_chunks, + device=x.device, + ) + x = self.encoder( + x, pos_emb, mask=mask, src_key_padding_mask=src_key_padding_mask + ) # (T, N, C) + else: + x = self.encoder( + x, pos_emb, mask=None, src_key_padding_mask=src_key_padding_mask + ) # (T, N, C) + + if self.normalize_before: + x = self.after_norm(x) + + logits = self.encoder_output_layer(x) + logits = logits.permute(1, 0, 2) # (T, N, C) ->(N, T, C) + + return logits, lengths + + @torch.jit.export + def get_init_state( + self, left_context: int, device: torch.device + ) -> List[torch.Tensor]: + """Return the initial cache state of the model. + + Args: + left_context: The left context size (in frames after subsampling). + + Returns: + Return the initial state of the model, it is a list containing two + tensors, the first one is the cache for attentions which has a shape + of (num_encoder_layers, left_context, encoder_dim), the second one + is the cache of conv_modules which has a shape of + (num_encoder_layers, cnn_module_kernel - 1, encoder_dim). + + NOTE: the returned tensors are on the given device. + """ + if ( + len(self._init_state) == 2 + and self._init_state[0].size(1) == left_context + ): + # Note: It is OK to share the init state as it is + # not going to be modified by the model + return self._init_state + + init_states: List[torch.Tensor] = [ + torch.zeros( + ( + self.encoder_layers, + left_context, + self.d_model, + ), + device=device, + ), + torch.zeros( + ( + self.encoder_layers, + self.cnn_module_kernel - 1, + self.d_model, + ), + device=device, + ), + ] + + self._init_state = init_states + + return init_states + + @torch.jit.export + def streaming_forward( + self, + x: torch.Tensor, + x_lens: torch.Tensor, + states: Optional[List[torch.Tensor]] = None, + processed_lens: Optional[Tensor] = None, + left_context: int = 64, + right_context: int = 0, + chunk_size: int = 16, + simulate_streaming: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]: + """ + Args: + x: + The input tensor. Its shape is (batch_size, seq_len, feature_dim). + x_lens: + A tensor of shape (batch_size,) containing the number of frames in + `x` before padding. + states: + The decode states for previous frames which contains the cached data. + It has two elements, the first element is the attn_cache which has + a shape of (encoder_layers, left_context, batch, attention_dim), + the second element is the conv_cache which has a shape of + (encoder_layers, cnn_module_kernel-1, batch, conv_dim). + Note: states will be modified in this function. + processed_lens: + How many frames (after subsampling) have been processed for each sequence. + left_context: + How many previous frames the attention can see in current chunk. + Note: It's not that each individual frame has `left_context` frames + of left context, some have more. + right_context: + How many future frames the attention can see in current chunk. + Note: It's not that each individual frame has `right_context` frames + of right context, some have more. + chunk_size: + The chunk size for decoding, this will be used to simulate streaming + decoding using masking. 
+            simulate_streaming:
+              If set to True, it will use a masking strategy to simulate streaming
+              fashion (i.e. every chunk only sees limited left context and
+              right context). The whole sequence is supposed to be sent at a time
+              when using simulate_streaming.
+        Returns:
+            Return a tuple containing 3 tensors:
+            - logits, its shape is (batch_size, output_seq_len, output_dim)
+            - logit_lens, a tensor of shape (batch_size,) containing the number
+              of frames in `logits` before padding.
+            - states, the updated states (i.e. caches) including the information
+              of the current chunk.
+        """
+
+        # x: [N, T, C]
+        # Caution: We assume the subsampling factor is 4!
+
+        # lengths = ((x_lens - 1) // 2 - 1) // 2  # issues a warning
+        #
+        # Note: rounding_mode in torch.div() is available only in torch >= 1.8.0
+        lengths = (((x_lens - 1) >> 1) - 1) >> 1
+
+        if not simulate_streaming:
+            assert states is not None
+            assert processed_lens is not None
+            assert (
+                len(states) == 2
+                and states[0].shape
+                == (self.encoder_layers, left_context, x.size(0), self.d_model)
+                and states[1].shape
+                == (
+                    self.encoder_layers,
+                    self.cnn_module_kernel - 1,
+                    x.size(0),
+                    self.d_model,
+                )
+            ), f"""The length of states MUST be equal to 2, and the shape of
+            first element should be {(self.encoder_layers, left_context, x.size(0), self.d_model)},
+            given {states[0].shape}. the shape of second element should be
+            {(self.encoder_layers, self.cnn_module_kernel - 1, x.size(0), self.d_model)},
+            given {states[1].shape}."""
+
+            lengths -= 2  # we will cut off 1 frame on each side of encoder_embed output
+            src_key_padding_mask = make_pad_mask(lengths)
+
+            processed_mask = torch.arange(left_context, device=x.device).expand(
+                x.size(0), left_context
+            )
+            processed_lens = processed_lens.view(x.size(0), 1)
+            processed_mask = (processed_lens <= processed_mask).flip(1)
+
+            src_key_padding_mask = torch.cat(
+                [processed_mask, src_key_padding_mask], dim=1
+            )
+
+            embed = self.encoder_embed(x)
+
+            # cut off 1 frame on each side of embed as they see the padding
+            # value which causes a training and decoding mismatch.
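+            # (this is also why `lengths` was reduced by 2 above)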
+ embed = embed[:, 1:-1, :] + + embed, pos_enc = self.encoder_pos(embed, left_context) + embed = embed.permute(1, 0, 2) # (B, T, F) -> (T, B, F) + + x, states = self.encoder.chunk_forward( + embed, + pos_enc, + src_key_padding_mask=src_key_padding_mask, + states=states, + left_context=left_context, + right_context=right_context, + ) # (T, B, F) + else: + assert states is None + states = [] # just to make torch.script.jit happy + src_key_padding_mask = make_pad_mask(lengths) + x = self.encoder_embed(x) + x, pos_emb = self.encoder_pos(x) + x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C) + + assert x.size(0) == lengths.max().item() + + num_left_chunks = -1 + if left_context >= 0: + assert left_context % chunk_size == 0 + num_left_chunks = left_context // chunk_size + + mask = ~subsequent_chunk_mask( + size=x.size(0), + chunk_size=chunk_size, + num_left_chunks=num_left_chunks, + device=x.device, + ) + x = self.encoder( + x, + pos_emb, + mask=mask, + src_key_padding_mask=src_key_padding_mask, + ) # (T, N, C) + + if self.normalize_before: + x = self.after_norm(x) + + logits = self.encoder_output_layer(x) + logits = logits.permute(1, 0, 2) # (T, N, C) ->(N, T, C) + + return logits, lengths, states + + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "num_features", + "encoder_out_dim", + "subsampling_factor", + "d_model", + "nhead", + "dim_feedforward", + "num_encoder_layers", + "vgg_frontend", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + + parser.add_argument( + "--num-features", default=80, type=int, help=("") + ) + + parser.add_argument( + "--subsampling-factor", default=4, type=int, help=("") + ) + + parser.add_argument( + "--d-model", default=512, type=int, help=("") + ) + + parser.add_argument( + "--nhead", default=8, type=int, help=("") + ) + + parser.add_argument( + "--dim-feedforward", default=2048, type=int, help=("") + ) + + parser.add_argument( + "--num-encoder-layers", default=12, type=int, help=("") + ) + + parser.add_argument( + "--vgg-frontend", default=False, type=bool, help=("") + ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + + + + + +class ConformerEncoderLayer(nn.Module): + """ + ConformerEncoderLayer is made up of self-attn, feedforward and convolution networks. + See: "Conformer: Convolution-augmented Transformer for Speech Recognition" + + Args: + d_model: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + cnn_module_kernel (int): Kernel size of convolution module. + normalize_before (bool): whether to use layer_norm before the first block. + causal (bool): Whether to use causal convolution in conformer encoder + layer. This MUST be True when using dynamic_chunk_training and streaming decoding. 
+ + Examples:: + >>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8) + >>> src = torch.rand(10, 32, 512) + >>> pos_emb = torch.rand(32, 19, 512) + >>> out = encoder_layer(src, pos_emb) + """ + + def __init__( + self, + d_model: int, + nhead: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + cnn_module_kernel: int = 31, + normalize_before: bool = True, + causal: bool = False, + ) -> None: + super(ConformerEncoderLayer, self).__init__() + self.self_attn = RelPositionMultiheadAttention( + d_model, nhead, dropout=0.0 + ) + + self.feed_forward = nn.Sequential( + nn.Linear(d_model, dim_feedforward), + Swish(), + nn.Dropout(dropout), + nn.Linear(dim_feedforward, d_model), + ) + + self.feed_forward_macaron = nn.Sequential( + nn.Linear(d_model, dim_feedforward), + Swish(), + nn.Dropout(dropout), + nn.Linear(dim_feedforward, d_model), + ) + + self.conv_module = ConvolutionModule( + d_model, cnn_module_kernel, causal=causal + ) + + self.norm_ff_macaron = nn.LayerNorm( + d_model + ) # for the macaron style FNN module + self.norm_ff = nn.LayerNorm(d_model) # for the FNN module + self.norm_mha = nn.LayerNorm(d_model) # for the MHA module + + self.ff_scale = 0.5 + + self.norm_conv = nn.LayerNorm(d_model) # for the CNN module + self.norm_final = nn.LayerNorm( + d_model + ) # for the final output of the block + + self.dropout = nn.Dropout(dropout) + + self.normalize_before = normalize_before + + def forward( + self, + src: Tensor, + pos_emb: Tensor, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: + """ + Pass the input through the encoder layer. + + Args: + src: the sequence to the encoder layer (required). + pos_emb: Positional embedding tensor (required). + src_mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + Shape: + src: (S, N, E). + pos_emb: (N, 2*S-1, E). + src_mask: (S, S). + src_key_padding_mask: (N, S). + S is the source sequence length, N is the batch size, E is the feature number + """ + # macaron style feed forward module + residual = src + if self.normalize_before: + src = self.norm_ff_macaron(src) + src = residual + self.ff_scale * self.dropout( + self.feed_forward_macaron(src) + ) + if not self.normalize_before: + src = self.norm_ff_macaron(src) + + # multi-headed self-attention module + residual = src + if self.normalize_before: + src = self.norm_mha(src) + + src_att = self.self_attn( + src, + src, + src, + pos_emb=pos_emb, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask, + )[0] + src = residual + self.dropout(src_att) + if not self.normalize_before: + src = self.norm_mha(src) + + # convolution module + residual = src + if self.normalize_before: + src = self.norm_conv(src) + + src, _ = self.conv_module(src) + src = residual + self.dropout(src) + + if not self.normalize_before: + src = self.norm_conv(src) + + # feed forward module + residual = src + if self.normalize_before: + src = self.norm_ff(src) + src = residual + self.ff_scale * self.dropout(self.feed_forward(src)) + if not self.normalize_before: + src = self.norm_ff(src) + + if self.normalize_before: + src = self.norm_final(src) + + return src + + @torch.jit.export + def chunk_forward( + self, + src: Tensor, + pos_emb: Tensor, + states: List[Tensor], + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + left_context: int = 0, + right_context: int = 0, + ) -> Tuple[Tensor, List[Tensor]]: + """ + Pass the input through the encoder layer. 
+ + Args: + src: the sequence to the encoder layer (required). + pos_emb: Positional embedding tensor (required). + states: + The decode states for previous frames which contains the cached data. + It has two elements, the first element is the attn_cache which has + a shape of (left_context, batch, attention_dim), + the second element is the conv_cache which has a shape of + (cnn_module_kernel-1, batch, conv_dim). + Note: states will be modified in this function. + src_mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + left_context: + How many previous frames the attention can see in current chunk. + Note: It's not that each individual frame has `left_context` frames + of left context, some have more. + right_context: + How many future frames the attention can see in current chunk. + Note: It's not that each individual frame has `right_context` frames + of right context, some have more. + Shape: + src: (S, N, E). + pos_emb: (N, 2*(S+left_context)-1, E). + src_mask: (S, S). + src_key_padding_mask: (N, S). + S is the source sequence length, N is the batch size, E is the feature number + """ + + # macaron style feed forward module + residual = src + if self.normalize_before: + src = self.norm_ff_macaron(src) + src = residual + self.ff_scale * self.dropout( + self.feed_forward_macaron(src) + ) + if not self.normalize_before: + src = self.norm_ff_macaron(src) + + # multi-headed self-attention module + residual = src + if self.normalize_before: + src = self.norm_mha(src) + + # We put the attention cache this level (i.e. before linear transformation) + # to save memory consumption, when decoding in streaming fashion, the + # batch size would be thousands (for 32GB machine), if we cache key & val + # separately, it needs extra several GB memory. + # TODO(WeiKang): Move cache to self_attn level (i.e. cache key & val + # separately) if needed. + key = torch.cat([states[0], src], dim=0) + val = key + if right_context > 0: + states[0] = key[ + -(left_context + right_context) : -right_context, ... # noqa + ] + else: + states[0] = key[-left_context:, ...] + + src_att = self.self_attn( + src, + key, + val, + pos_emb=pos_emb, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask, + left_context=left_context, + )[0] + src = residual + self.dropout(src_att) + if not self.normalize_before: + src = self.norm_mha(src) + + # convolution module + residual = src + if self.normalize_before: + src = self.norm_conv(src) + + src, conv_cache = self.conv_module( + src, states[1], right_context=right_context + ) + states[1] = conv_cache + src = residual + self.dropout(src) + + if not self.normalize_before: + src = self.norm_conv(src) + + # feed forward module + residual = src + if self.normalize_before: + src = self.norm_ff(src) + src = residual + self.ff_scale * self.dropout(self.feed_forward(src)) + if not self.normalize_before: + src = self.norm_ff(src) + + if self.normalize_before: + src = self.norm_final(src) + + return src, states + + +class ConformerEncoder(nn.Module): + r"""ConformerEncoder is a stack of N encoder layers + + Args: + encoder_layer: an instance of the ConformerEncoderLayer() class (required). + num_layers: the number of sub-encoder-layers in the encoder (required). 
+ + Examples:: + >>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8) + >>> conformer_encoder = ConformerEncoder(encoder_layer, num_layers=6) + >>> src = torch.rand(10, 32, 512) + >>> pos_emb = torch.rand(32, 19, 512) + >>> out = conformer_encoder(src, pos_emb) + """ + + def __init__(self, encoder_layer: nn.Module, num_layers: int) -> None: + super().__init__() + self.layers = nn.ModuleList( + [copy.deepcopy(encoder_layer) for i in range(num_layers)] + ) + self.num_layers = num_layers + + def forward( + self, + src: Tensor, + pos_emb: Tensor, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: + r"""Pass the input through the encoder layers in turn. + + Args: + src: the sequence to the encoder (required). + pos_emb: Positional embedding tensor (required). + mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + Shape: + + Shape: + src: (S, N, E). + pos_emb: (N, 2*S-1, E). + mask: (S, S). + src_key_padding_mask: (N, S). + S is the source sequence length, T is the target sequence length, N is the batch size, E is the feature number + + """ + output = src + + for layer_index, mod in enumerate(self.layers): + output = mod( + output, + pos_emb, + src_mask=mask, + src_key_padding_mask=src_key_padding_mask, + ) + return output + + @torch.jit.export + def chunk_forward( + self, + src: Tensor, + pos_emb: Tensor, + states: List[Tensor], + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + left_context: int = 0, + right_context: int = 0, + ) -> Tuple[Tensor, List[Tensor]]: + r"""Pass the input through the encoder layers in turn. + + Args: + src: the sequence to the encoder (required). + pos_emb: Positional embedding tensor (required). + states: + The decode states for previous frames which contains the cached data. + It has two elements, the first element is the attn_cache which has + a shape of (encoder_layers, left_context, batch, attention_dim), + the second element is the conv_cache which has a shape of + (encoder_layers, cnn_module_kernel-1, batch, conv_dim). + Note: states will be modified in this function. + mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + left_context: + How many previous frames the attention can see in current chunk. + Note: It's not that each individual frame has `left_context` frames + of left context, some have more. + right_context: + How many future frames the attention can see in current chunk. + Note: It's not that each individual frame has `right_context` frames + of right context, some have more. + Shape: + src: (S, N, E). + pos_emb: (N, 2*(S+left_context)-1, E). + mask: (S, S). + src_key_padding_mask: (N, S). + S is the source sequence length, T is the target sequence length, N is the batch size, E is the feature number + + """ + assert not self.training + output = src + + for layer_index, mod in enumerate(self.layers): + cache = [states[0][layer_index], states[1][layer_index]] + output, cache = mod.chunk_forward( + output, + pos_emb, + states=cache, + src_mask=mask, + src_key_padding_mask=src_key_padding_mask, + left_context=left_context, + right_context=right_context, + ) + states[0][layer_index] = cache[0] + states[1][layer_index] = cache[1] + + return output, states + + +class RelPositionalEncoding(torch.nn.Module): + """Relative positional encoding module. 
+
+    See: Appendix B in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
+    Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/embedding.py
+
+    Args:
+        d_model: Embedding dimension.
+        dropout_rate: Dropout rate.
+        max_len: Maximum input length.
+
+    """
+
+    def __init__(
+        self, d_model: int, dropout_rate: float, max_len: int = 5000
+    ) -> None:
+        """Construct a RelPositionalEncoding object."""
+        super(RelPositionalEncoding, self).__init__()
+        self.d_model = d_model
+        self.xscale = math.sqrt(self.d_model)
+        self.dropout = torch.nn.Dropout(p=dropout_rate)
+        self.pe = None
+        self.extend_pe(torch.tensor(0.0).expand(1, max_len))
+
+    def extend_pe(self, x: Tensor, left_context: int = 0) -> None:
+        """Reset the positional encodings."""
+        x_size_1 = x.size(1) + left_context
+        if self.pe is not None:
+            # self.pe contains both positive and negative parts
+            # the length of self.pe is 2 * input_len - 1
+            if self.pe.size(1) >= x_size_1 * 2 - 1:
+                # Note: TorchScript doesn't implement operator== for torch.Device
+                if self.pe.dtype != x.dtype or str(self.pe.device) != str(
+                    x.device
+                ):
+                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
+                return
+        # Suppose `i` is the position of the query vector and `j` the position
+        # of the key vector. We use positive relative positions when keys
+        # are to the left (i>j) and negative relative positions otherwise (i<j).
+        pe_positive = torch.zeros(x_size_1, self.d_model)
+        pe_negative = torch.zeros(x_size_1, self.d_model)
+        position = torch.arange(0, x_size_1, dtype=torch.float32).unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, self.d_model, 2, dtype=torch.float32)
+            * -(math.log(10000.0) / self.d_model)
+        )
+        pe_positive[:, 0::2] = torch.sin(position * div_term)
+        pe_positive[:, 1::2] = torch.cos(position * div_term)
+        pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
+        pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
+
+        # Reverse the order of positive indices and concat both positive and
+        # negative indices. This is used to support the shifting trick
+        # as in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
+        pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
+        pe_negative = pe_negative[1:].unsqueeze(0)
+        pe = torch.cat([pe_positive, pe_negative], dim=1)
+        self.pe = pe.to(device=x.device, dtype=x.dtype)
+
+    def forward(
+        self, x: torch.Tensor, left_context: int = 0
+    ) -> Tuple[Tensor, Tensor]:
+        """Add positional encoding.
+
+        Args:
+            x (torch.Tensor): Input tensor (batch, time, `*`).
+            left_context (int): left context (in frames) used during streaming decoding.
+                this is used only in real streaming decoding, in other circumstances,
+                it MUST be 0.
+        Returns:
+            torch.Tensor: Encoded tensor (batch, time, `*`).
+            torch.Tensor: Encoded tensor (batch, 2*time-1, `*`).
+
+        """
+        self.extend_pe(x, left_context)
+        x = x * self.xscale
+        x_size_1 = x.size(1) + left_context
+        pos_emb = self.pe[
+            :,
+            self.pe.size(1) // 2
+            - x_size_1
+            + 1 : self.pe.size(1) // 2  # noqa E203
+            + x.size(1),
+        ]
+        return self.dropout(x), self.dropout(pos_emb)
+
+
+class RelPositionMultiheadAttention(nn.Module):
+    r"""Multi-Head Attention layer with relative position encoding.
+
+    See reference: "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
+
+    Args:
+        embed_dim: total dimension of the model.
+        num_heads: parallel attention heads.
+        dropout: a Dropout layer on attn_output_weights. Default: 0.0.
+
+    Examples::
+
+        >>> rel_pos_multihead_attn = RelPositionMultiheadAttention(embed_dim, num_heads)
+        >>> attn_output, attn_output_weights = rel_pos_multihead_attn(query, key, value, pos_emb)
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+    ) -> None:
+        super(RelPositionMultiheadAttention, self).__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        assert (
+            self.head_dim * num_heads == self.embed_dim
+        ), "embed_dim must be divisible by num_heads"
+
+        self.in_proj = nn.Linear(embed_dim, 3 * embed_dim, bias=True)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)
+
+        # linear transformation for positional encoding.
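+        # (a bias-free projection of the relative positional encodings,
+        # following the Transformer-XL parameterization)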
+ self.linear_pos = nn.Linear(embed_dim, embed_dim, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" Section 3.3 + self.pos_bias_u = nn.Parameter(torch.Tensor(num_heads, self.head_dim)) + self.pos_bias_v = nn.Parameter(torch.Tensor(num_heads, self.head_dim)) + + self._reset_parameters() + + def _reset_parameters(self) -> None: + nn.init.xavier_uniform_(self.in_proj.weight) + nn.init.constant_(self.in_proj.bias, 0.0) + nn.init.constant_(self.out_proj.bias, 0.0) + + nn.init.xavier_uniform_(self.pos_bias_u) + nn.init.xavier_uniform_(self.pos_bias_v) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + pos_emb: Tensor, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + left_context: int = 0, + ) -> Tuple[Tensor, Optional[Tensor]]: + r""" + Args: + query, key, value: map a query and a set of key-value pairs to an output. + pos_emb: Positional embedding tensor + key_padding_mask: if provided, specified padding elements in the key will + be ignored by the attention. When given a binary mask and a value is True, + the corresponding value on the attention layer will be ignored. When given + a byte mask and a value is non-zero, the corresponding value on the attention + layer will be ignored + need_weights: output attn_output_weights. + attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all + the batches while a 3D mask allows to specify a different mask for the entries of each batch. + left_context (int): left context (in frames) used during streaming decoding. + this is used only in real streaming decoding, in other circumstances, + it MUST be 0. + + Shape: + - Inputs: + - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is + the embedding dimension. + - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is + the embedding dimension. + - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is + the embedding dimension. + - pos_emb: :math:`(N, 2*L-1, E)` where L is the target sequence length, N is the batch size, E is + the embedding dimension. + - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length. + If a ByteTensor is provided, the non-zero positions will be ignored while the position + with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the + value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. + - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. + 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length, + S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked + positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend + while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True`` + is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor + is provided, it will be added to the attention weight. + + - Outputs: + - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, + E is the embedding dimension. 
+ - attn_output_weights: :math:`(N, L, S)` where N is the batch size, + L is the target sequence length, S is the source sequence length. + """ + return self.multi_head_attention_forward( + query, + key, + value, + pos_emb, + self.embed_dim, + self.num_heads, + self.in_proj.weight, + self.in_proj.bias, + self.dropout, + self.out_proj.weight, + self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + left_context=left_context, + ) + + def rel_shift(self, x: Tensor, left_context: int = 0) -> Tensor: + """Compute relative positional encoding. + + Args: + x: Input tensor (batch, head, time1, 2*time1-1). + time1 means the length of query vector. + left_context (int): left context (in frames) used during streaming decoding. + this is used only in real streaming decoding, in other circumstances, + it MUST be 0. + + Returns: + Tensor: tensor of shape (batch, head, time1, time2) + (note: time2 has the same value as time1, but it is for + the key, while time1 is for the query). + """ + (batch_size, num_heads, time1, n) = x.shape + time2 = time1 + left_context + + assert ( + n == left_context + 2 * time1 - 1 + ), f"{n} == {left_context} + 2 * {time1} - 1" + + # Note: TorchScript requires explicit arg for stride() + batch_stride = x.stride(0) + head_stride = x.stride(1) + time1_stride = x.stride(2) + n_stride = x.stride(3) + return x.as_strided( + (batch_size, num_heads, time1, time2), + (batch_stride, head_stride, time1_stride - n_stride, n_stride), + storage_offset=n_stride * (time1 - 1), + ) + + def multi_head_attention_forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + pos_emb: Tensor, + embed_dim_to_check: int, + num_heads: int, + in_proj_weight: Tensor, + in_proj_bias: Tensor, + dropout_p: float, + out_proj_weight: Tensor, + out_proj_bias: Tensor, + training: bool = True, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + left_context: int = 0, + ) -> Tuple[Tensor, Optional[Tensor]]: + r""" + Args: + query, key, value: map a query and a set of key-value pairs to an output. + pos_emb: Positional embedding tensor + embed_dim_to_check: total dimension of the model. + num_heads: parallel attention heads. + in_proj_weight, in_proj_bias: input projection weight and bias. + dropout_p: probability of an element to be zeroed. + out_proj_weight, out_proj_bias: the output projection weight and bias. + training: apply dropout if is ``True``. + key_padding_mask: if provided, specified padding elements in the key will + be ignored by the attention. This is an binary mask. When the value is True, + the corresponding value on the attention layer will be filled with -inf. + need_weights: output attn_output_weights. + attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all + the batches while a 3D mask allows to specify a different mask for the entries of each batch. + left_context (int): left context (in frames) used during streaming decoding. + this is used only in real streaming decoding, in other circumstances, + it MUST be 0. + + Shape: + Inputs: + - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is + the embedding dimension. + - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is + the embedding dimension. 
+ - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is + the embedding dimension. + - pos_emb: :math:`(N, 2*L-1, E)` or :math:`(1, 2*L-1, E)` where L is the target sequence + length, N is the batch size, E is the embedding dimension. + - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length. + If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions + will be unchanged. If a BoolTensor is provided, the positions with the + value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. + - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. + 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length, + S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked + positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend + while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True`` + are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor + is provided, it will be added to the attention weight. + + Outputs: + - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, + E is the embedding dimension. + - attn_output_weights: :math:`(N, L, S)` where N is the batch size, + L is the target sequence length, S is the source sequence length. + """ + + tgt_len, bsz, embed_dim = query.size() + assert embed_dim == embed_dim_to_check + assert key.size(0) == value.size(0) and key.size(1) == value.size(1) + + head_dim = embed_dim // num_heads + assert ( + head_dim * num_heads == embed_dim + ), "embed_dim must be divisible by num_heads" + scaling = float(head_dim) ** -0.5 + + if torch.equal(query, key) and torch.equal(key, value): + # self-attention + q, k, v = nn.functional.linear( + query, in_proj_weight, in_proj_bias + ).chunk(3, dim=-1) + + elif torch.equal(key, value): + # encoder-decoder attention + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = 0 + _end = embed_dim + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + q = nn.functional.linear(query, _w, _b) + + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim + _end = None + _w = in_proj_weight[_start:, :] + if _b is not None: + _b = _b[_start:] + k, v = nn.functional.linear(key, _w, _b).chunk(2, dim=-1) + + else: + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = 0 + _end = embed_dim + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + q = nn.functional.linear(query, _w, _b) + + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim + _end = embed_dim * 2 + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + k = nn.functional.linear(key, _w, _b) + + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim * 2 + _end = None + _w = in_proj_weight[_start:, :] + if _b is not None: + _b = _b[_start:] + v = nn.functional.linear(value, _w, _b) + + if attn_mask is not None: + assert ( + attn_mask.dtype == torch.float32 + or attn_mask.dtype == torch.float64 + or attn_mask.dtype == 
torch.float16 + or attn_mask.dtype == torch.uint8 + or attn_mask.dtype == torch.bool + ), "Only float, byte, and bool types are supported for attn_mask, not {}".format( + attn_mask.dtype + ) + if attn_mask.dtype == torch.uint8: + warnings.warn( + "Byte tensor for attn_mask is deprecated. Use bool tensor instead." + ) + attn_mask = attn_mask.to(torch.bool) + + if attn_mask.dim() == 2: + attn_mask = attn_mask.unsqueeze(0) + if list(attn_mask.size()) != [1, query.size(0), key.size(0)]: + raise RuntimeError( + "The size of the 2D attn_mask is not correct." + ) + elif attn_mask.dim() == 3: + if list(attn_mask.size()) != [ + bsz * num_heads, + query.size(0), + key.size(0), + ]: + raise RuntimeError( + "The size of the 3D attn_mask is not correct." + ) + else: + raise RuntimeError( + "attn_mask's dimension {} is not supported".format( + attn_mask.dim() + ) + ) + # attn_mask's dim is 3 now. + + # convert ByteTensor key_padding_mask to bool + if ( + key_padding_mask is not None + and key_padding_mask.dtype == torch.uint8 + ): + warnings.warn( + "Byte tensor for key_padding_mask is deprecated. Use bool tensor instead." + ) + key_padding_mask = key_padding_mask.to(torch.bool) + + q = q.contiguous().view(tgt_len, bsz, num_heads, head_dim) + k = k.contiguous().view(-1, bsz, num_heads, head_dim) + v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) + + src_len = k.size(0) + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz, "{} == {}".format( + key_padding_mask.size(0), bsz + ) + assert key_padding_mask.size(1) == src_len, "{} == {}".format( + key_padding_mask.size(1), src_len + ) + + q = q.transpose(0, 1) # (batch, time1, head, d_k) + + pos_emb_bsz = pos_emb.size(0) + assert pos_emb_bsz in (1, bsz) # actually it is 1 + p = self.linear_pos(pos_emb).view(pos_emb_bsz, -1, num_heads, head_dim) + + # (batch, 2*time1, head, d_k) --> (batch, head, d_k, 2*time -1) + p = p.permute(0, 2, 3, 1) + + q_with_bias_u = (q + self.pos_bias_u).transpose( + 1, 2 + ) # (batch, head, time1, d_k) + + q_with_bias_v = (q + self.pos_bias_v).transpose( + 1, 2 + ) # (batch, head, time1, d_k) + + # compute attention score + # first compute matrix a and matrix c + # as described in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" Section 3.3 + k = k.permute(1, 2, 3, 0) # (batch, head, d_k, time2) + matrix_ac = torch.matmul( + q_with_bias_u, k + ) # (batch, head, time1, time2) + + # compute matrix b and matrix d + matrix_bd = torch.matmul( + q_with_bias_v, p + ) # (batch, head, time1, 2*time1-1) + + matrix_bd = self.rel_shift(matrix_bd, left_context=left_context) + + attn_output_weights = ( + matrix_ac + matrix_bd + ) * scaling # (batch, head, time1, time2) + + attn_output_weights = attn_output_weights.view( + bsz * num_heads, tgt_len, -1 + ) + + assert list(attn_output_weights.size()) == [ + bsz * num_heads, + tgt_len, + src_len, + ] + + if attn_mask is not None: + if attn_mask.dtype == torch.bool: + attn_output_weights.masked_fill_(attn_mask, float("-inf")) + else: + attn_output_weights += attn_mask + + if key_padding_mask is not None: + attn_output_weights = attn_output_weights.view( + bsz, num_heads, tgt_len, src_len + ) + attn_output_weights = attn_output_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2), + float("-inf"), + ) + attn_output_weights = attn_output_weights.view( + bsz * num_heads, tgt_len, src_len + ) + + attn_output_weights = nn.functional.softmax(attn_output_weights, dim=-1) + + # If we are using dynamic_chunk_training and 
setting a limited
+        # num_left_chunks, the attention may only see the padding values which
+        # will also be masked out by `key_padding_mask`. In this circumstance,
+        # the whole column of `attn_output_weights` will be `-inf`
+        # (i.e. be `nan` after softmax), so we fill `0.0` at the masking
+        # positions to avoid invalid loss value below.
+        if (
+            attn_mask is not None
+            and attn_mask.dtype == torch.bool
+            and key_padding_mask is not None
+        ):
+            combined_mask = attn_mask.unsqueeze(0) | key_padding_mask.unsqueeze(
+                1
+            ).unsqueeze(2)
+            attn_output_weights = attn_output_weights.view(
+                bsz, num_heads, tgt_len, src_len
+            )
+            attn_output_weights = attn_output_weights.masked_fill(
+                combined_mask, 0.0
+            )
+            attn_output_weights = attn_output_weights.view(
+                bsz * num_heads, tgt_len, src_len
+            )
+
+        attn_output_weights = nn.functional.dropout(
+            attn_output_weights, p=dropout_p, training=training
+        )
+
+        attn_output = torch.bmm(attn_output_weights, v)
+        assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
+        attn_output = (
+            attn_output.transpose(0, 1)
+            .contiguous()
+            .view(tgt_len, bsz, embed_dim)
+        )
+        attn_output = nn.functional.linear(
+            attn_output, out_proj_weight, out_proj_bias
+        )
+
+        if need_weights:
+            # average attention weights over heads
+            attn_output_weights = attn_output_weights.view(
+                bsz, num_heads, tgt_len, src_len
+            )
+            return attn_output, attn_output_weights.sum(dim=1) / num_heads
+        else:
+            return attn_output, None
+
+
+class ConvolutionModule(nn.Module):
+    """ConvolutionModule in Conformer model.
+    Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/conformer/convolution.py
+
+    Args:
+        channels (int): The number of channels of conv layers.
+        kernel_size (int): Kernel size of conv layers.
+        bias (bool): Whether to use bias in conv layers (default=True).
+        causal (bool): Whether to use causal convolution.
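+
+    Example (illustrative sketch; shapes follow the forward() convention)::
+
+        >>> conv = ConvolutionModule(channels=256, kernel_size=31)
+        >>> x = torch.rand(100, 2, 256)   # (time, batch, channels)
+        >>> out, _ = conv(x)
+        >>> out.shape
+        torch.Size([100, 2, 256])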
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        kernel_size: int,
+        bias: bool = True,
+        causal: bool = False,
+    ) -> None:
+        """Construct a ConvolutionModule object."""
+        super(ConvolutionModule, self).__init__()
+        # kernel_size should be an odd number for 'SAME' padding
+        assert (kernel_size - 1) % 2 == 0
+        self.causal = causal
+
+        self.pointwise_conv1 = nn.Conv1d(
+            channels,
+            2 * channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=bias,
+        )
+
+        self.lorder = kernel_size - 1
+        padding = (kernel_size - 1) // 2
+        if self.causal:
+            padding = 0
+
+        self.depthwise_conv = nn.Conv1d(
+            channels,
+            channels,
+            kernel_size,
+            stride=1,
+            padding=padding,
+            groups=channels,
+            bias=bias,
+        )
+        self.norm = nn.LayerNorm(channels)
+        self.pointwise_conv2 = nn.Conv1d(
+            channels,
+            channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=bias,
+        )
+        self.activation = Swish()
+
+    def forward(
+        self,
+        x: Tensor,
+        cache: Optional[Tensor] = None,
+        right_context: int = 0,
+    ) -> Tuple[Tensor, Tensor]:
+        """Compute convolution module.
+
+        Args:
+            x: Input tensor (#time, batch, channels).
+            cache: cached left context of shape (cnn_module_kernel - 1, batch,
+                channels), used in streaming decoding.
+            right_context: number of future frames in the current chunk, used
+                to update the returned cache.
+
+        Returns:
+            Tensor: Output tensor (#time, batch, channels).
+            Tensor: Updated convolution cache.
+        """
+        # exchange the temporal dimension and the feature dimension
+        x = x.permute(1, 2, 0)  # (#batch, channels, time).
+
+        # GLU mechanism
+        x = self.pointwise_conv1(x)  # (batch, 2*channels, time)
+        x = nn.functional.glu(x, dim=1)  # (batch, channels, time)
+
+        # 1D Depthwise Conv
+        if self.causal and self.lorder > 0:
+            if cache is None:
+                # Make depthwise_conv causal by
+                # manually padding self.lorder zeros to the left
+                x = nn.functional.pad(x, (self.lorder, 0), "constant", 0.0)
+            else:
+                assert (
+                    not self.training
+                ), "Cache should be None in training time"
+                assert cache.size(0) == self.lorder
+                x = torch.cat([cache.permute(1, 2, 0), x], dim=2)
+                if right_context > 0:
+                    cache = x.permute(2, 0, 1)[
+                        -(self.lorder + right_context) : (  # noqa
+                            -right_context
+                        ),
+                        ...,
+                    ]
+                else:
+                    cache = x.permute(2, 0, 1)[-self.lorder :, ...]  # noqa
+
+        x = self.depthwise_conv(x)
+        # x is (batch, channels, time)
+        x = x.permute(0, 2, 1)
+        x = self.norm(x)
+        x = x.permute(0, 2, 1)
+
+        x = self.activation(x)
+
+        x = self.pointwise_conv2(x)  # (batch, channel, time)
+
+        if cache is None:
+            cache = torch.empty(0)
+
+        return x.permute(2, 0, 1), cache
+
+
+class Swish(torch.nn.Module):
+    """Construct a Swish object."""
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Return Swish activation function."""
+        return x * torch.sigmoid(x)
+
+
+def identity(x):
+    return x
diff --git a/hyperion/torch/models/transducer/conformer_v1_rnn_transducer.py b/hyperion/torch/models/transducer/conformer_v1_rnn_transducer.py
new file mode 100644
index 00000000..cf8bb91f
--- /dev/null
+++ b/hyperion/torch/models/transducer/conformer_v1_rnn_transducer.py
@@ -0,0 +1,83 @@
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""
+
+import logging
+from typing import Dict, Optional, Tuple, Union
+
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+
+try:
+    import k2
+except ModuleNotFoundError:
+    from ...utils import dummy_k2 as k2
+
+import torch
+
+from ...narchs import ConformerEncoderV1
+from .rnn_transducer import RNNTransducer
+
+
+class ConformerV1RNNTransducer(RNNTransducer):
+    """RNN-T with Conformer encoder.
+
+    Attributes:
+      encoder: dictionary of options to initialize a ConformerEncoderV1 object,
+        or a ConformerEncoderV1 object.
+      rnnt_decoder: RNN-T decoder config dictionary or module.
+
+    """
+
+    def __init__(self, encoder, rnnt_decoder):
+        if isinstance(encoder, dict):
+            encoder = ConformerEncoderV1(**encoder)
+        else:
+            assert isinstance(encoder, ConformerEncoderV1)
+
+        super().__init__(encoder, rnnt_decoder)
+
+    @staticmethod
+    def filter_args(**kwargs):
+        args = RNNTransducer.filter_args(**kwargs)
+        encoder_args = ConformerEncoderV1.filter_args(**kwargs["encoder"])
+        args["encoder"] = encoder_args
+        return args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None, skip=set()):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        ConformerEncoderV1.add_class_args(parser, prefix="encoder", skip=skip)
+        RNNTransducer.add_class_args(parser)
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
+
+    def change_config(
+        self,
+        encoder,
+        rnnt_decoder,
+    ):
+        logging.info("changing transducer encoder config")
+        self.encoder.change_config(**encoder)
+        super().change_config(**rnnt_decoder)
+
+    @staticmethod
+    def filter_finetune_args(**kwargs):
+        args = RNNTransducer.filter_finetune_args(**kwargs)
+        encoder_args = ConformerEncoderV1.filter_finetune_args(**kwargs["encoder"])
+        args["encoder"] = encoder_args
+        return args
+
+    @staticmethod
+    def add_finetune_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        ConformerEncoderV1.add_finetune_args(parser, prefix="encoder")
+        RNNTransducer.add_finetune_args(parser)
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
diff --git a/hyperion/torch/models/transducer/decoder.py b/hyperion/torch/models/transducer/decoder.py
new file mode 100644
index 00000000..484f6f38
--- /dev/null
+++ b/hyperion/torch/models/transducer/decoder.py
@@ -0,0 +1,249 @@
+# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Optional, Tuple
+
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+
+import torch
+import torch.nn as nn
+
+
+# TODO(fangjun): Support switching between LSTM and GRU
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        vocab_size: int,
+        embedding_dim: int,
+        blank_id: int,
+        num_layers: int,
+        hidden_dim: int,
+        in_feats: int,
+        embedding_dropout_rate: float = 0.0,
+        rnn_dropout_rate: float = 0.0,
+    ):
+        """
+        Args:
+          vocab_size:
+            Number of tokens of the modeling unit including blank.
+          embedding_dim:
+            Dimension of the input embedding.
+          blank_id:
+            The ID of the blank symbol.
+          num_layers:
+            Number of LSTM layers.
+          hidden_dim:
+            Hidden dimension of LSTM layers.
+          in_feats:
+            Output dimension of the decoder projection.
+          embedding_dropout_rate:
+            Dropout rate for the embedding layer.
+          rnn_dropout_rate:
+            Dropout rate for the LSTM layers.
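+
+          Example (illustrative sketch; a 500-token vocabulary with blank
+          id 0 is assumed)::
+
+            >>> decoder = Decoder(vocab_size=500, embedding_dim=256, blank_id=0,
+            ...                   num_layers=2, hidden_dim=512, in_feats=512)
+            >>> y = torch.randint(1, 500, (4, 10))  # (N, U)
+            >>> out, (h, c) = decoder(y)
+            >>> out.shape
+            torch.Size([4, 10, 512])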
+ """ + super().__init__() + self.embedding = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=embedding_dim, + padding_idx=blank_id, + ) + self.embedding_dropout = nn.Dropout(embedding_dropout_rate) + # TODO(fangjun): Use layer normalized LSTM + self.rnn = nn.LSTM( + input_size=embedding_dim, + hidden_size=hidden_dim, + num_layers=num_layers, + batch_first=True, + dropout=rnn_dropout_rate, + ) + + self.in_feats = in_feats + self.blank_id = blank_id + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + self.num_layers = num_layers + self.hidden_dim = hidden_dim + self.embedding_dropout_rate = embedding_dropout_rate + self.rnn_dropout_rate = rnn_dropout_rate + self.output_linear = nn.Linear(hidden_dim, in_feats) + + def forward( + self, + y: torch.Tensor, + states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + """ + Args: + y: + A 2-D tensor of shape (N, U) with BOS prepended. + states: + A tuple of two tensors containing the states information of + LSTM layers in this decoder. + Returns: + Return a tuple containing: + + - rnn_output, a tensor of shape (N, U, C) + - (h, c), containing the state information for LSTM layers. + Both are of shape (num_layers, N, C) + """ + embedding_out = self.embedding(y) + embedding_out = self.embedding_dropout(embedding_out) + #print("yy", y.shape, embedding_out.shape, y) + rnn_out, (h, c) = self.rnn(embedding_out, states) + out = self.output_linear(rnn_out) + + return out, (h, c) + + def get_config(self): + config = { + "in_feats": self.in_feats, + "blank_id": self.blank_id, + "vocab_size": self.vocab_size, + "embedding_dim": self.embedding_dim, + "num_layers": self.num_layers, + "hidden_dim": self.hidden_dim, + "embedding_dropout_rate": self.embedding_dropout_rate, + "rnn_dropout_rate": self.rnn_dropout_rate, + } + + # base_config = super().get_config() + return dict(list(config.items())) + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "in_feats", + "blank_id", + "vocab_size", + "embedding_dim", + "num_layers", + "hidden_dim", + "embedding_dropout_rate", + "rnn_dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + return args + + + @staticmethod + def filter_finetune_args(**kwargs): + valid_args = ( + "embedding_dropout_rate", + "rnn_dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + return args + + @staticmethod + def add_class_args(parser, + prefix=None, + skip=set(["in_feats", "blank_id", "vocab_size"])): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + if "in_feats" not in skip: + parser.add_argument("--in-feats", + type=int, + required=True, + help=("input feature dimension")) + if "blank_id" not in skip: + parser.add_argument("--blank-id", + type=int, + required=True, + help=("blank id from sp model")) + if "vocab_size" not in skip: + parser.add_argument("--vocab-size", + type=int, + required=True, + help=("output prediction dimension")) + parser.add_argument("--embedding-dim", + default=1024, + type=int, + help=("feature dimension")) + parser.add_argument("--embedding-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder input embeddings")) + parser.add_argument("--rnn-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder RNN ")) + + parser.add_argument("--num-layers", default=2, type=int, help=("")) + + parser.add_argument("--hidden-dim", default=512, type=int, help=("")) + + if 
+ + def change_config( + self, + override_dropouts=False, + embedding_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + ): + logging.info("changing decoder config") + + if override_dropouts: + logging.info("overriding decoder dropouts") + + self.rnn_dropout_rate = rnn_dropout_rate + # nn.LSTM keeps its inter-layer dropout prob in the `dropout` attribute + self.rnn.dropout = self.rnn_dropout_rate + + self.embedding_dropout_rate = embedding_dropout_rate + self.embedding_dropout = nn.Dropout(self.embedding_dropout_rate) + + @staticmethod + def add_finetune_args(parser, + prefix=None, + skip=set(["in_feats", "blank_id", "vocab_size"])): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument("--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model." + )) + parser.add_argument("--embedding-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder input embeddings")) + parser.add_argument("--rnn-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder RNN")) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser))
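An editorial aside, not part of the diff: a minimal sketch of driving the prediction network defined above. The keyword arguments mirror the options registered in `add_class_args`, and the import path follows the `from .decoder import Decoder` seen later in this diff; both are assumptions.

```python
import torch
from hyperion.torch.models.transducer.decoder import Decoder  # path assumed from this diff

dec = Decoder(in_feats=512, blank_id=0, vocab_size=500,
              embedding_dim=1024, num_layers=2, hidden_dim=512,
              embedding_dropout_rate=0.0, rnn_dropout_rate=0.0)
y = torch.randint(1, 500, (8, 20))  # (N, U) token ids
y[:, 0] = 0                         # the BOS position carries the blank id
out, (h, c) = dec(y)                # out: (N, U, in_feats); h, c: (num_layers, N, hidden_dim)
assert out.shape == (8, 20, 512) and h.shape == (2, 8, 512)
```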
diff --git a/hyperion/torch/models/transducer/encoder_interface.py b/hyperion/torch/models/transducer/encoder_interface.py new file mode 100644 index 00000000..257facce --- /dev/null +++ b/hyperion/torch/models/transducer/encoder_interface.py @@ -0,0 +1,43 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Tuple + +import torch +import torch.nn as nn + + +class EncoderInterface(nn.Module): + def forward( + self, x: torch.Tensor, x_lens: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x: + A tensor of shape (batch_size, input_seq_len, num_features) + containing the input features. + x_lens: + A tensor of shape (batch_size,) containing the number of frames + in `x` before padding. + Returns: + Return a tuple containing two tensors: + - encoder_out, a tensor of (batch_size, out_seq_len, output_dim) + containing unnormalized probabilities, i.e., the output of a + linear layer. + - encoder_out_lens, a tensor of shape (batch_size,) containing + the number of frames in `encoder_out` before padding. + """ + raise NotImplementedError("Please implement it in a subclass") diff --git a/hyperion/torch/models/transducer/joiner.py b/hyperion/torch/models/transducer/joiner.py new file mode 100644 index 00000000..a7c2e35b --- /dev/null +++ b/hyperion/torch/models/transducer/joiner.py @@ -0,0 +1,113 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +import torch +import torch.nn as nn + + +class Joiner(nn.Module): + def __init__(self, in_feats: int, out_dims: int, num_layers: int): + super().__init__() + self.in_feats = in_feats + self.out_dims = out_dims + self.num_layers = num_layers + + self.output_linear = nn.Linear(in_feats, out_dims) + + def forward( + self, encoder_out: torch.Tensor, decoder_out: torch.Tensor + ) -> torch.Tensor: + """ + Args: + encoder_out: + Output from the encoder. Its shape is (N, T, C). + decoder_out: + Output from the decoder. Its shape is (N, U, C). + Returns: + Return a tensor of shape (N, T, U, C). + """ + assert encoder_out.ndim == decoder_out.ndim == 3 + assert encoder_out.size(0) == decoder_out.size(0) + assert encoder_out.size(2) == decoder_out.size(2) + + encoder_out = encoder_out.unsqueeze(2) + # Now encoder_out is (N, T, 1, C) + + decoder_out = decoder_out.unsqueeze(1) + # Now decoder_out is (N, 1, U, C) + + logit = encoder_out + decoder_out + logit = torch.tanh(logit) + + output = self.output_linear(logit) + + return output + + def get_config(self): + config = { + "in_feats": self.in_feats, + "out_dims": self.out_dims, + "num_layers": self.num_layers, + } + + # base_config = super().get_config() + return dict(list(config.items())) + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "in_feats", + "out_dims", + "num_layers", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set(["in_feats", "out_dims"])): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + if "in_feats" not in skip: + parser.add_argument( + "--in-feats", type=int, required=True, help=("input feature dimension") + ) + + if "out_dims" not in skip: + parser.add_argument( + "--out-dims", type=int, required=True, help=("output feature dimension (vocab size)") + ) + parser.add_argument( + "--num-layers", default=1, type=int, help=("layers of the joiner") + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
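A quick editorial shape check of the broadcast join implemented above; the import path is taken from the diff header.

```python
import torch
from hyperion.torch.models.transducer.joiner import Joiner

joiner = Joiner(in_feats=512, out_dims=500, num_layers=1)
enc_out = torch.randn(8, 100, 512)  # (N, T, C) from the encoder
dec_out = torch.randn(8, 21, 512)   # (N, U, C) from the prediction network
logits = joiner(enc_out, dec_out)   # (N, T, 1, C) + (N, 1, U, C), tanh, then linear
assert logits.shape == (8, 100, 21, 500)  # (N, T, U, vocab_size)
```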
diff --git a/hyperion/torch/models/transducer/rnn_rnn_transducer.py b/hyperion/torch/models/transducer/rnn_rnn_transducer.py new file mode 100644 index 00000000..46438dbc --- /dev/null +++ b/hyperion/torch/models/transducer/rnn_rnn_transducer.py @@ -0,0 +1,84 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from typing import Dict, Optional, Tuple, Union + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +try: + import k2 +except ModuleNotFoundError: + from ...utils import dummy_k2 as k2 + +import torch + +from ...narchs import RNNEncoder +from .rnn_transducer import RNNTransducer + + +class RNNRNNTransducer(RNNTransducer): + """RNN-T with RNN Encoder + + Attributes: + encoder: dictionary of options to initialize RNNEncoder class or RNNEncoder object + rnnt_decoder: RNN-T decoder config dictionary or module + + """ + + def __init__(self, encoder, rnnt_decoder): + if isinstance(encoder, dict): + encoder = RNNEncoder(**encoder) + else: + assert isinstance(encoder, RNNEncoder) + + super().__init__(encoder, rnnt_decoder) + + @staticmethod + def filter_args(**kwargs): + args = RNNTransducer.filter_args(**kwargs) + encoder_args = RNNEncoder.filter_args(**kwargs["encoder"]) + args["encoder"] = encoder_args + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNEncoder.add_class_args(parser, prefix="encoder", skip=skip) + RNNTransducer.add_class_args(parser) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + def change_config( + self, + encoder, + rnnt_decoder, + ): + logging.info("changing transducer encoder config") + self.encoder.change_config(**encoder) + super().change_config(**rnnt_decoder) + + @staticmethod + def filter_finetune_args(**kwargs): + args = RNNTransducer.filter_finetune_args(**kwargs) + encoder_args = RNNEncoder.filter_finetune_args(**kwargs["encoder"]) + args["encoder"] = encoder_args + return args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNEncoder.add_finetune_args(parser, prefix="encoder") + RNNTransducer.add_finetune_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
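Both transducer model files guard the k2 dependency with the same try/except; spelled out with absolute imports it looks like the sketch below (the stub path is inferred from the relative `from ...utils import dummy_k2`, so treat it as an assumption).

```python
# k2 is only required for the label type (k2.RaggedTensor) and for pruned
# losses; when it is missing, a stub module is bound to the same name so the
# type annotations still resolve.
try:
    import k2
except ModuleNotFoundError:
    from hyperion.torch.utils import dummy_k2 as k2
```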
+ """ + + def __init__( + self, + encoder: Union[TorchModel, None], + rnnt_decoder: Union[Dict, RNNTransducerDecoder], + rnnt_weight: float = 1.0, + ctc_weight: float = 0.0, + ): + super().__init__() + if encoder is not None: + assert isinstance(encoder, TorchModel) + if isinstance(rnnt_decoder, dict): + if encoder is not None: + rnnt_decoder["in_feats"] = encoder.out_shape()[-1] + rnnt_decoder = RNNTransducerDecoder(**rnnt_decoder) + else: + assert isinstance(rnnt_decoder, RNNTransducerDecoder) + + self.encoder = encoder + self.rnnt_decoder = rnnt_decoder + + def forward( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: k2.RaggedTensor, + ) -> RNNTransducerOutput: + """ + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + y: ragged tensor with 2 axes [utt][label]. It contains labels of each + utterance. + Returns: + - Token logits with shape = (N, vocab_size) + - RNN-T loss. + """ + assert x.ndim == 3, x.shape + assert x_lengths.ndim == 1, x_lengths.shape + assert y.num_axes == 2, y.num_axes + + assert x.size(0) == x_lengths.size(0) == y.dim0 + assert torch.all( + x_lengths[:-1] >= x_lengths[1:] + ), f"x_lengths={x_lengths}" # check x_lengths are sorted + + if self.encoder is not None: + x, x_lengths = self.encoder(x, x_lengths) + assert torch.all(x_lengths > 0) + + dec_output = self.rnnt_decoder(x, x_lengths, y) + output = RNNTransducerOutput(*dec_output) + return output + + def infer( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + decoding_method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000, + ) -> List[List[int]]: + """ + ASR tokens inference + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search + max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. + max_sym_per_utt: maximimum number of symbols in a single utterance. + Returns: + List of list of integer indexes of the recognizer's symbols. 
+ """ + assert x.ndim == 3, x.shape + assert x_lengths.ndim == 1, x_lengths.shape + assert x.size(0) == x_lengths.size(0) + + if self.encoder is not None: + x, x_lengths = self.encoder(x, x_lengths) + assert torch.all(x_lengths > 0) + + batch_size = x.size(0) + y = [] + for i in range(batch_size): + x_i = x[i : i + 1, : x_lengths[i]] + y_i = self.rnnt_decoder.decode( + x_i, + method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt, + ) + y.append(y_i) + + return y + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + else: + raise ValueError(f"invalid train_mode={mode}") + + self._train_mode = mode + + def _train(self, train_mode: str): + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return ["full", "frozen"] + + def get_config(self): + if self.encoder is None: + enc_cfg = None + else: + enc_cfg = self.encoder.get_config() + del enc_cfg["class_name"] + + dec_cfg = self.rnnt_decoder.get_config() + del dec_cfg["class_name"] + config = { + "encoder": enc_cfg, + "rnnt_decoder": dec_cfg, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def filter_args(**kwargs): + # get arguments for pooling + args = {} + rnnt_decoder_args = RNNTransducerDecoder.filter_args(**kwargs["rnnt_decoder"]) + args["rnnt_decoder"] = rnnt_decoder_args + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNTransducerDecoder.add_class_args(parser, prefix="rnnt_decoder") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + def change_config( + self, + rnnt_decoder: Dict, + ): + logging.info("changing rnnt_decoder config") + self.rnnt_decoder.change_config(**rnnt_decoder) + + @staticmethod + def filter_finetune_args(**kwargs): + args = {} + rnnt_decoder_args = RNNTransducerDecoder.filter_finetune_args( + **kwargs["rnnt_decoder"] + ) + args["rnnt_decoder"] = rnnt_decoder_args + return args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNTransducerDecoder.add_finetune_args(parser, prefix="rnnt_decoder") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def add_infer_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--decoding-method", + default="time_sync_beam_search", + choices=[ + "greedy", + "time_sync_beam_search", + "align_length_sync_beam_search", + ], + ) + + parser.add_argument( + "--beam-width", default=5, type=int, help="beam width for beam search" + ) + parser.add_argument( + "--max-sym-per-frame", + default=3, + type=int, + help="max symbols RNN-T can emit in 1 frame", + ) + parser.add_argument( + "--max-sym-per-utt", + default=1000, + type=int, + help="max symbols RNN-T can emit in 1 frame", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_infer_args(**kwargs): + return filter_func_args(RNNTransducer.infer, 
kwargs) diff --git a/hyperion/torch/models/transducer/subsampling0.py b/hyperion/torch/models/transducer/subsampling0.py new file mode 100644 index 00000000..542fb036 --- /dev/null +++ b/hyperion/torch/models/transducer/subsampling0.py @@ -0,0 +1,161 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch +import torch.nn as nn + + +class Conv2dSubsampling(nn.Module): + """Convolutional 2D subsampling (to 1/4 length). + + Convert an input of shape (N, T, idim) to an output + with shape (N, T', odim), where + T' = ((T-1)//2 - 1)//2, which approximates T' == T//4 + + It is based on + https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/subsampling.py # noqa + """ + + def __init__(self, idim: int, odim: int) -> None: + """ + Args: + idim: + Input dim. The input shape is (N, T, idim). + Caution: It requires: T >=7, idim >=7 + odim: + Output dim. The output shape is (N, ((T-1)//2 - 1)//2, odim) + """ + assert idim >= 7 + super().__init__() + self.conv = nn.Sequential( + nn.Conv2d( + in_channels=1, out_channels=odim, kernel_size=3, stride=2 + ), + nn.ReLU(), + nn.Conv2d( + in_channels=odim, out_channels=odim, kernel_size=3, stride=2 + ), + nn.ReLU(), + ) + self.out = nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Subsample x. + + Args: + x: + Its shape is (N, T, idim). + + Returns: + Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim) + """ + # On entry, x is (N, T, idim) + x = x.unsqueeze(1) # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W) + x = self.conv(x) + # Now x is of shape (N, odim, ((T-1)//2 - 1)//2, ((idim-1)//2 - 1)//2) + b, c, t, f = x.size() + x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) + # Now x is of shape (N, ((T-1)//2 - 1))//2, odim) + return x + + +class VggSubsampling(nn.Module): + """Trying to follow the setup described in the following paper: + https://arxiv.org/pdf/1910.09799.pdf + + This paper is not 100% explicit so I am guessing to some extent, + and trying to compare with other VGG implementations. + + Convert an input of shape (N, T, idim) to an output + with shape (N, T', odim), where + T' = ((T-1)//2 - 1)//2, which approximates T' = T//4 + """ + + def __init__(self, idim: int, odim: int) -> None: + """Construct a VggSubsampling object. + + This uses 2 VGG blocks with 2 Conv2d layers each, + subsampling its input by a factor of 4 in the time dimensions. + + Args: + idim: + Input dim. The input shape is (N, T, idim). + Caution: It requires: T >=7, idim >=7 + odim: + Output dim. 
The output shape is (N, ((T-1)//2 - 1)//2, odim) + """ + super().__init__() + + cur_channels = 1 + layers = [] + block_dims = [32, 64] + + # The decision to use padding=1 for the 1st convolution, then padding=0 + # for the 2nd and for the max-pooling, and ceil_mode=True, was driven by + # a back-compatibility concern so that the number of frames at the + # output would be equal to: + # (((T-1)//2)-1)//2. + # We can consider changing this by using padding=1 on the + # 2nd convolution, so the num-frames at the output would be T//4. + for block_dim in block_dims: + layers.append( + torch.nn.Conv2d( + in_channels=cur_channels, + out_channels=block_dim, + kernel_size=3, + padding=1, + stride=1, + ) + ) + layers.append(torch.nn.ReLU()) + layers.append( + torch.nn.Conv2d( + in_channels=block_dim, + out_channels=block_dim, + kernel_size=3, + padding=0, + stride=1, + ) + ) + layers.append( + torch.nn.MaxPool2d( + kernel_size=2, stride=2, padding=0, ceil_mode=True + ) + ) + cur_channels = block_dim + + self.layers = nn.Sequential(*layers) + + self.out = nn.Linear( + block_dims[-1] * (((idim - 1) // 2 - 1) // 2), odim + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Subsample x. + + Args: + x: + Its shape is (N, T, idim). + + Returns: + Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim) + """ + x = x.unsqueeze(1) + x = self.layers(x) + b, c, t, f = x.size() + x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) + return x
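An editorial sanity check of the T' = ((T-1)//2 - 1)//2 output length both subsampling modules advertise; the import path comes from the diff header.

```python
import torch
from hyperion.torch.models.transducer.subsampling0 import Conv2dSubsampling, VggSubsampling

x = torch.randn(2, 100, 80)  # (N, T, idim)
for module in (Conv2dSubsampling(80, 256), VggSubsampling(80, 256)):
    y = module(x)
    assert y.shape == (2, ((100 - 1) // 2 - 1) // 2, 256)  # (N, 24, odim)
```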
diff --git a/hyperion/torch/models/transducer/transducer0.py b/hyperion/torch/models/transducer/transducer0.py new file mode 100644 index 00000000..938149ec --- /dev/null +++ b/hyperion/torch/models/transducer/transducer0.py @@ -0,0 +1,254 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Note we use `rnnt_loss` from torchaudio, which exists only in +torchaudio >= v0.10.0. It also means you have to use torch >= v1.10.0 +""" +from typing import Tuple + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +try: + import k2 +except ModuleNotFoundError: + from ...utils import dummy_k2 as k2 + +import logging + +import torchaudio +import torchaudio.functional + +import torch +import torch.nn as nn +from hyperion.utils.text import add_sos + +from ...torch_model import TorchModel +# from .conformer import Conformer +from .decoder import Decoder +from .encoder_interface import EncoderInterface +from .joiner import Joiner + + +class Transducer(TorchModel): + """It implements https://arxiv.org/pdf/1211.3711.pdf + "Sequence Transduction with Recurrent Neural Networks" + """ + + def __init__( + self, + vocab_size, + blank_id, + # conformer_enc, + decoder, + joiner, + ): + """ + Args: + encoder: + It is the transcription network in the paper. It accepts + two inputs: `x` of (N, T, C) and `x_lens` of shape (N,). + It returns two tensors: `logits` of shape (N, T, C) and + `logit_lens` of shape (N,). + decoder: + It is the prediction network in the paper. Its input shape + is (N, U) and its output shape is (N, U, C). It should contain + one attribute: `blank_id`. + joiner: + It has two inputs with shapes: (N, T, C) and (N, U, C). Its + output shape is (N, T, U, C). Note that its output contains + unnormalized probs, i.e., not processed by log-softmax. + """ + super().__init__() + # assert isinstance(encoder, EncoderInterface) + # assert hasattr(decoder, "blank_id") + + decoder["blank_id"] = blank_id + decoder["vocab_size"] = vocab_size + joiner["out_dims"] = vocab_size + + self.vocab_size = vocab_size + self.blank_id = blank_id + self.decoder = Decoder(**decoder) + self.joiner = Joiner(**joiner) + + def forward( + self, + x: torch.Tensor, + x_lens: torch.Tensor, + y: k2.RaggedTensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x: + A 3-D tensor of shape (N, T, C). + x_lens: + A 1-D tensor of shape (N,). It contains the number of frames in `x` + before padding. + y: + A ragged tensor with 2 axes [utt][label]. It contains labels of each + utterance. + Returns: + Return the joiner logits and the transducer loss. + """ + assert x.ndim == 3, x.shape + assert x_lens.ndim == 1, x_lens.shape + assert y.num_axes == 2, y.num_axes + + assert x.size(0) == x_lens.size(0) == y.dim0 + + # wav2vec2 works as encoder + # encoder_out, x_lens = self.encoder(x, x_lens) + assert torch.all(x_lens > 0) + + encoder_out = x + # Now for the decoder, i.e., the prediction network + row_splits = y.shape.row_splits(1) + y_lens = row_splits[1:] - row_splits[:-1] + + blank_id = self.decoder.blank_id + sos_y = add_sos(y, sos_id=blank_id) + + sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id) + sos_y_padded = sos_y_padded.to(torch.int64) + + decoder_out, _ = self.decoder(sos_y_padded) + + logits = self.joiner(encoder_out, decoder_out) + + # rnnt_loss requires 0 padded targets + # Note: y does not start with SOS + y_padded = y.pad(mode="constant", padding_value=0) + + assert hasattr(torchaudio.functional, "rnnt_loss"), ( + f"Current torchaudio version: {torchaudio.__version__}\n" + "Please install a version >= 0.10.0") + + x_lens = x_lens.to(torch.int32) + + loss = torchaudio.functional.rnnt_loss( + logits=logits, + targets=y_padded.to(torch.int32), + logit_lengths=x_lens, + target_lengths=y_lens, + blank=blank_id, + reduction="sum", + ) + + return logits, loss + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode == "ft-embed-affine": + self.unfreeze() + self.freeze_preembed_layers() + else: + raise ValueError(f"invalid train_mode={mode}") + + self._train_mode = mode + + def _train(self, train_mode: str): + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return ["full", "frozen", "ft-embed-affine"] + + def get_config(self): + dec_cfg = self.decoder.get_config() + join_cfg = self.joiner.get_config() + + config = { + "blank_id": self.blank_id, + "vocab_size": self.vocab_size, + "decoder": dec_cfg, + "joiner": join_cfg, + } + + # base_config = super().get_config() + return dict(list(config.items())) + + @staticmethod + def filter_args(**kwargs): + + decoder_args = Decoder.filter_args(**kwargs["decoder"]) + joiner_args = Joiner.filter_args(**kwargs["joiner"]) + + valid_args = () + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + args["decoder"] = decoder_args
+ args["joiner"] = joiner_args + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + Decoder.add_class_args(parser, prefix="decoder") + Joiner.add_class_args(parser, prefix="joiner") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + def change_config( + self, + decoder, + # joiner, + ): + logging.info("changing transducer config") + self.decoder.change_config(**decoder) + # self.joiner.change_config(**joiner) + + @staticmethod + def filter_finetune_args(**kwargs): + decoder_args = Decoder.filter_finetune_args(**kwargs["decoder"]) + # joiner_args = Joiner.filter_finetune_args(**kwargs["joiner"]) + + valid_args = () + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + args["decoder"] = decoder_args + # args["joiner"] = joiner_args + return args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + Decoder.add_finetune_args(parser, prefix="decoder") + # Joiner.add_finetune_args(parser, prefix="joiner") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + add_argparse_args = add_class_args + add_argparse_finetune_args = add_finetune_args diff --git a/hyperion/torch/models/transducer/transformer0.py b/hyperion/torch/models/transducer/transformer0.py new file mode 100644 index 00000000..0beb405f --- /dev/null +++ b/hyperion/torch/models/transducer/transformer0.py @@ -0,0 +1,417 @@ +# Copyright 2021 University of Chinese Academy of Sciences (author: Han Zhu) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import math +from typing import Optional, Tuple + +import torch +import torch.nn as nn + +from hyperion.utils.text import make_pad_mask + +from .encoder_interface import EncoderInterface +from .subsampling0 import Conv2dSubsampling, VggSubsampling + + +class Transformer(EncoderInterface): + def __init__( + self, + num_features: int, + output_dim: int, + subsampling_factor: int = 4, + d_model: int = 256, + nhead: int = 4, + dim_feedforward: int = 2048, + num_encoder_layers: int = 12, + dropout: float = 0.1, + normalize_before: bool = True, + vgg_frontend: bool = False, + ) -> None: + """ + Args: + num_features: + The input dimension of the model. + output_dim: + The output dimension of the model. + subsampling_factor: + Number of output frames is num_in_frames // subsampling_factor. + Currently, subsampling_factor MUST be 4. + d_model: + Attention dimension. + nhead: + Number of heads in multi-head attention. + Must satisfy d_model % nhead == 0. + dim_feedforward: + The output dimension of the feedforward layers in encoder. + num_encoder_layers: + Number of encoder layers. + dropout: + Dropout in encoder.
+ normalize_before: + If True, use pre-layer norm; False to use post-layer norm. + vgg_frontend: + True to use vgg style frontend for subsampling. + """ + super().__init__() + + self.num_features = num_features + self.output_dim = output_dim + self.subsampling_factor = subsampling_factor + if subsampling_factor != 4: + raise NotImplementedError("Support only 'subsampling_factor=4'.") + + # self.encoder_embed converts the input of shape (N, T, num_features) + # to the shape (N, T//subsampling_factor, d_model). + # That is, it does two things simultaneously: + # (1) subsampling: T -> T//subsampling_factor + # (2) embedding: num_features -> d_model + if vgg_frontend: + self.encoder_embed = VggSubsampling(num_features, d_model) + else: + self.encoder_embed = Conv2dSubsampling(num_features, d_model) + + self.encoder_pos = PositionalEncoding(d_model, dropout) + + encoder_layer = TransformerEncoderLayer( + d_model=d_model, + nhead=nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + normalize_before=normalize_before, + ) + + if normalize_before: + encoder_norm = nn.LayerNorm(d_model) + else: + encoder_norm = None + + self.encoder = nn.TransformerEncoder( + encoder_layer=encoder_layer, + num_layers=num_encoder_layers, + norm=encoder_norm, + ) + + # TODO(fangjun): remove dropout + self.encoder_output_layer = nn.Sequential( + nn.Dropout(p=dropout), nn.Linear(d_model, output_dim) + ) + + def forward( + self, x: torch.Tensor, x_lens: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x: + The input tensor. Its shape is (batch_size, seq_len, feature_dim). + x_lens: + A tensor of shape (batch_size,) containing the number of frames in + `x` before padding. + Returns: + Return a tuple containing 2 tensors: + - logits, its shape is (batch_size, output_seq_len, output_dim) + - logit_lens, a tensor of shape (batch_size,) containing the number + of frames in `logits` before padding. + """ + x = self.encoder_embed(x) + x = self.encoder_pos(x) + x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C) + + # Caution: We assume the subsampling factor is 4! + lengths = ((x_lens - 1) // 2 - 1) // 2 + assert x.size(0) == lengths.max().item() + + mask = make_pad_mask(lengths) + x = self.encoder(x, src_key_padding_mask=mask) # (T, N, C) + + logits = self.encoder_output_layer(x) + logits = logits.permute(1, 0, 2) # (T, N, C) ->(N, T, C) + + return logits, lengths + + +class TransformerEncoderLayer(nn.Module): + """ + Modified from torch.nn.TransformerEncoderLayer. + Add support of normalize_before, + i.e., use layer_norm before the first block. + + Args: + d_model: + the number of expected features in the input (required). + nhead: + the number of heads in the multiheadattention models (required). + dim_feedforward: + the dimension of the feedforward network model (default=2048). + dropout: + the dropout value (default=0.1). + activation: + the activation function of intermediate layer, relu or + gelu (default=relu). + normalize_before: + whether to use layer_norm before the first block. 
+ + Examples:: + >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8) + >>> src = torch.rand(10, 32, 512) + >>> out = encoder_layer(src) + """ + + def __init__( + self, + d_model: int, + nhead: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + activation: str = "relu", + normalize_before: bool = True, + ) -> None: + super(TransformerEncoderLayer, self).__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=0.0) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + + self.normalize_before = normalize_before + + def __setstate__(self, state): + if "activation" not in state: + state["activation"] = nn.functional.relu + super(TransformerEncoderLayer, self).__setstate__(state) + + def forward( + self, + src: torch.Tensor, + src_mask: Optional[torch.Tensor] = None, + src_key_padding_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Pass the input through the encoder layer. + + Args: + src: the sequence to the encoder layer (required). + src_mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional) + + Shape: + src: (S, N, E). + src_mask: (S, S). + src_key_padding_mask: (N, S). + S is the source sequence length, T is the target sequence length, + N is the batch size, E is the feature number + """ + residual = src + if self.normalize_before: + src = self.norm1(src) + src2 = self.self_attn( + src, + src, + src, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask, + )[0] + src = residual + self.dropout1(src2) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src2) + if not self.normalize_before: + src = self.norm2(src) + return src + + +def _get_activation_fn(activation: str): + if activation == "relu": + return nn.functional.relu + elif activation == "gelu": + return nn.functional.gelu + + raise RuntimeError("activation should be relu/gelu, not {}".format(activation))
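A quick editorial check that the `exp`/`log` trick used in `extend_pe` of the `PositionalEncoding` class below matches the direct form of the sinusoidal encoding written in its docstring:

```python
import math

import torch

d_model, T = 8, 5
position = torch.arange(0, T, dtype=torch.float32).unsqueeze(1)  # (T, 1)
two_i = torch.arange(0, d_model, 2, dtype=torch.float32)         # the "2i" values
div_term = torch.exp(two_i * -(math.log(10000.0) / d_model))     # as in extend_pe
direct = position / (10000.0 ** (two_i / d_model))
assert torch.allclose(position * div_term, direct, atol=1e-5)
```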
+ + +class PositionalEncoding(nn.Module): + """This class implements the positional encoding + proposed in the following paper: + + - Attention Is All You Need: https://arxiv.org/pdf/1706.03762.pdf + + PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) + PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)) + + Note:: + + 1 / (10000^(2i/d_model)) = exp(-log(10000^(2i/d_model))) + = exp(-1 * 2i / d_model * log(10000)) + = exp(2i * -(log(10000) / d_model)) + """ + + def __init__(self, d_model: int, dropout: float = 0.1) -> None: + """ + Args: + d_model: + Embedding dimension. + dropout: + Dropout probability to be applied to the output of this module. + """ + super().__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = nn.Dropout(p=dropout) + # not doing: self.pe = None because of errors thrown by torchscript + self.pe = torch.zeros(1, 0, self.d_model, dtype=torch.float32) + + def extend_pe(self, x: torch.Tensor) -> None: + """Extend the time t in the positional encoding if required. + + The shape of `self.pe` is (1, T1, d_model). The shape of the input x + is (N, T, d_model). If T > T1, then we change the shape of self.pe + to (N, T, d_model). Otherwise, nothing is done. + + Args: + x: + It is a tensor of shape (N, T, C). + Returns: + Return None. + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model, dtype=torch.float32) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) + * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + # Now pe is of shape (1, T, d_model), where T is x.size(1) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Add positional encoding. + + Args: + x: + Its shape is (N, T, C) + + Returns: + Return a tensor of shape (N, T, C) + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1), :] + return self.dropout(x) + + +class Noam(object): + """ + Implements Noam optimizer. + + Proposed in + "Attention Is All You Need", https://arxiv.org/pdf/1706.03762.pdf + + Modified from + https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/optimizer.py # noqa + + Args: + params: + iterable of parameters to optimize or dicts defining parameter groups + model_size: + attention dimension of the transformer model + factor: + learning rate factor + warm_step: + warmup steps + """ + + def __init__( + self, + params, + model_size: int = 256, + factor: float = 10.0, + warm_step: int = 25000, + weight_decay=0, + ) -> None: + """Construct a Noam object.""" + self.optimizer = torch.optim.Adam( + params, lr=0, betas=(0.9, 0.98), eps=1e-9, weight_decay=weight_decay + ) + self._step = 0 + self.warmup = warm_step + self.factor = factor + self.model_size = model_size + self._rate = 0 + + @property + def param_groups(self): + """Return param_groups.""" + return self.optimizer.param_groups + + def step(self): + """Update parameters and rate.""" + self._step += 1 + rate = self.rate() + for p in self.optimizer.param_groups: + p["lr"] = rate + self._rate = rate + self.optimizer.step() + + def rate(self, step=None): + """Compute the learning rate for the given step.""" + if step is None: + step = self._step + return ( + self.factor + * self.model_size ** (-0.5) + * min(step ** (-0.5), step * self.warmup ** (-1.5)) + ) + + def zero_grad(self): + """Reset gradient.""" + self.optimizer.zero_grad() + + def state_dict(self): + """Return state_dict.""" + return { + "_step": self._step, + "warmup": self.warmup, + "factor": self.factor, + "model_size": self.model_size, + "_rate": self._rate, + "optimizer": self.optimizer.state_dict(), + } + + def load_state_dict(self, state_dict): + """Load state_dict.""" + for key, value in state_dict.items(): + if key == "optimizer": + self.optimizer.load_state_dict(state_dict["optimizer"]) + else: + setattr(self, key, value) diff --git a/hyperion/torch/models/tvector/__init__.py b/hyperion/torch/models/tvector/__init__.py deleted file mode 100644 index 98db2561..00000000 --- a/hyperion/torch/models/tvector/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -""" - Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -# t-vectors -from .tvector import TVector -from .resnet_tvector import ResNetTVector diff --git
a/hyperion/torch/models/tvector/resnet_tvector.py b/hyperion/torch/models/tvector/resnet_tvector.py deleted file mode 100644 index d74272aa..00000000 --- a/hyperion/torch/models/tvector/resnet_tvector.py +++ /dev/null @@ -1,196 +0,0 @@ -""" - Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import logging -from argparse import Namespace - -import torch -import torch.nn as nn - -from .xvector import XVector -from ..narchs import ResNetFactory as RNF - - -class ResNetXVector(XVector): - def __init__( - self, - in_feats, - num_classes, - resnet_cfg=Namespace( - resnet_type="resnet34", - in_channels=1, - conv_channels=64, - base_channels=64, - in_kernel_size=7, - in_stride=1, - zero_init_residual=False, - groups=1, - replace_stride_with_dilation=None, - do_maxpool=False, - hid_act={"name": "relu", "inplace": True}, - dropout_rate=0, - norm_layer=None, - use_norm=True, - norm_before=True, - in_norm=False, - se_r=16, - res2net_scale=4, - res2net_width_factor=1, - ), - conformer_cfg=Namespace( - d_model=256, - num_heads=4, - num_blocks=6, - attype="scaled-dot-prod-v1", - atcontext=25, - conv_repeats=1, - conv_kernel_sizes=31, - conv_strides=1, - ff_type="linear", - d_ff=2048, - ff_kernel_size=1, - dropourate=0.1, - pos_dropourate=0.1, - att_dropout_rate=0.0, - in_layer_type="conv2d-sub", - rel_pos_enc=True, - causal_pos_enc=False, - no_pos_enc=False, - hid_act="swish", - conv_norm_layer=None, - se_r=None, - ff_macaron=True, - red_lnorms=False, - concat_after=False, - ), - pool_net="mean+stddev", - head_cfg=Namespace( - embed_dim=256, - num_embed_layers=1, - head_hid_act={"name": "relu", "inplace": True}, - loss_type="arc-softmax", - s=64, - margin=0.3, - margin_warmup_epochs=0, - num_subcenters=2, - norm_layer=None, - use_norm=True, - norm_before=True, - dropout_rate=0, - embed_layer=0, - ), - ): - - logging.info("making %s encoder network" % (resnet_type)) - if isinstance(resnet_cfg, Namespace): - resnet_cfg = var(resnet_cfg) - - self.resnet_type = resnet_cfg["resnet_type"] - encoder_net = RNF.create(**resnet_cfg) - - super().__init__( - encoder_net, - num_classes, - conformer_cfg=conformer_cfg, - pool_net=pool_net, - head_cfg=head_cfg, - in_feats=in_feats, - proj_feats=None, - ) - - @property - def in_channels(self): - return self.encoder_net.in_channels - - @property - def conv_channels(self): - return self.encoder_net.conv_channels - - @property - def base_channels(self): - return self.encoder_net.base_channels - - @property - def in_kernel_size(self): - return self.encoder_net.in_kernel_size - - @property - def in_stride(self): - return self.encoder_net.in_stride - - @property - def zero_init_residual(self): - return self.encoder_net.zero_init_residual - - @property - def groups(self): - return self.encoder_net.groups - - @property - def replace_stride_with_dilation(self): - return self.encoder_net.replace_stride_with_dilation - - @property - def do_maxpool(self): - return self.encoder_net.do_maxpool - - @property - def in_norm(self): - return self.encoder_net.in_norm - - @property - def se_r(self): - return self.encoder_net.se_r - - @property - def res2net_scale(self): - return self.encoder_net.res2net_scale - - @property - def res2net_width_factor(self): - return self.encoder_net.res2net_width_factor - - def get_config(self): - - base_config = super().get_config() - del base_config["encoder_cfg"] - enc_cfg = self.encoder_net.get_config() - del enc_cfg["block"] - del enc_cfg["out_units"] - del 
enc_cfg["out_act"] - enc_cfg["resnet_type"] = self.resnet_type - - base_config["resnet_cfg"] = enc_cfg - - return base_config - - @classmethod - def load(cls, file_path=None, cfg=None, state_dict=None): - - cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) - - model = cls(**cfg) - if state_dict is not None: - model.load_state_dict(state_dict) - - return model - - def filter_args(prefix=None, **kwargs): - - base_args = XVector.filter_args(prefix, **kwargs) - child_args = RNF.filter_args(prefix, **kwargs) - - base_args.update(child_args) - return base_args - - @staticmethod - def add_argparse_args(parser, prefix=None): - - XVector.add_argparse_args(parser, prefix) - if prefix is None: - prefix = "resnet" - else: - prefix = prefix + "-resnet" - RNF.add_argparse_args(parser, prefix) diff --git a/hyperion/torch/models/tvector/tvector.py b/hyperion/torch/models/tvector/tvector.py deleted file mode 100644 index 8a3758fb..00000000 --- a/hyperion/torch/models/tvector/tvector.py +++ /dev/null @@ -1,566 +0,0 @@ -""" - Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import logging -from jsonargparse import ArgumentParser, ActionParser - -import torch -import torch.nn as nn - -from ..layers import GlobalPool1dFactory as PF -from ..layer_blocks import TDNNBlock -from ...narchs import ClassifHead, ConformerEncoderV1, TorchNALoader -from ..torch_model import TorchModel -from ..utils import eval_nnet_by_chunks - - -class TXVector(TorchModel): - """x-Vector base class""" - - def __init__( - self, - encoder_net, - num_classes, - conformer_net={}, - pool_net="mean+stddev", - classif_net={}, - in_feats=None, - ): - - super().__init__() - - # encoder network - self.encoder_net = encoder_net - - # infer input and output shapes of encoder network - in_shape = self.encoder_net.in_shape() - if len(in_shape) == 3: - # encoder based on 1d conv or transformer - in_feats = in_shape[1] - out_shape = self.encoder_net.out_shape(in_shape) - enc_feats = out_shape[1] - elif len(in_shape) == 4: - # encoder based in 2d convs - assert ( - in_feats is not None - ), "in_feats dimension must be given to calculate pooling dimension" - in_shape = list(in_shape) - in_shape[2] = in_feats - out_shape = self.encoder_net.out_shape(tuple(in_shape)) - enc_feats = out_shape[1] * out_shape[2] - - self.in_feats = in_feats - - logging.info("encoder input shape={}".format(in_shape)) - logging.info("encoder output shape={}".format(out_shape)) - - # create conformer net - if isinstance(conformer_net, nn.Module): - self.conformer_net = conformer_net - else: - logging.info("making conformer net") - conformer_net["in_layer_type"] = "linear" - self.conformer_net = ConformerEncoderV1( - enc_feats, in_time_dim=1, out_time_dim=1, **conformer_net - ) - - d_model = self.conformer_net.d_model - self.pool_net = self._make_pool_net(pool_cfg, d_model) - pool_feats = int(d_model * self.pool_net.size_multiplier) - logging.info("infer pooling dimension %d", pool_feats) - - # create classification head - if isinstance(classif_net, nn.Module): - self.classif_net = classif_net - else: - logging.info("making classification head net") - self.classif_net = ClassifHead(pool_feats, num_classes, **head_cfg) - - @property - def pool_feats(self): - return self.classif_net.in_feats - - @property - def num_classes(self): - return self.classif_net.num_classes - - @property - def embed_dim(self): - return self.classif_net.embed_dim - - @property - def num_embed_layers(self): 
- return self.classif_net.num_embed_layers - - @property - def s(self): - return self.classif_net.s - - @property - def margin(self): - return self.classif_net.margin - - @property - def margin_warmup_epochs(self): - return self.classif_net.margin_warmup_epochs - - @property - def num_subcenters(self): - return self.classif_net.num_subcenters - - @property - def loss_type(self): - return self.classif_net.loss_type - - def _make_pool_net(self, pool_net, enc_feats=None): - """Makes the pooling block - - Args: - pool_net: str or dict to pass to the pooling factory create function - enc_feats: dimension of the features coming from the encoder - - Returns: - GlobalPool1d object - """ - if isinstance(pool_net, str): - pool_net = {"pool_type": pool_net} - - if isinstance(pool_net, dict): - if enc_feats is not None: - pool_net["in_feats"] = enc_feats - - return PF.create(**pool_net) - elif isinstance(pool_net, nn.Module): - return pool_net - else: - raise Exception("Invalid pool_net argument") - - def update_loss_margin(self, epoch): - """Updates the value of the margin in AAM/AM-softmax losses - given the epoch number - - Args: - epoch: epoch which is about to start - """ - self.classif_net.update_margin(epoch) - - def _pre_enc(self, x): - if self.encoder_net.in_dim() == 4 and x.dim() == 3: - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - return x - - def _post_enc(self, x): - if self.encoder_net.out_dim() == 4: - x = x.view(x.size(0), -1, x.size(-1)) - - if self.proj is not None: - x = self.proj(x) - - return x - - def forward( - self, - x, - y=None, - enc_layers=None, - classif_layers=None, - return_output=True, - use_amp=False, - ): - if enc_layers is None and classif_layers is None: - return self.forward_output(x, y) - - h = self.forward_hid_feats(x, y, enc_layers, classif_layers, return_output) - output = {} - if enc_layers is not None: - if classif_layers is None: - output["h_enc"] = h - else: - output["h_enc"] = h[0] - else: - output["h_enc"] = [] - if classif_layers is not None: - output["h_classif"] = h[1] - else: - output["h_classif"] = [] - if return_output: - output["output"] = h[2] - return output - - def forward_output(self, x, y=None): - """Forward function - - Args: - x: input features tensor with shape=(batch, in_feats, time) - y: target classes torch.long tensor with shape=(batch,) - - Returns: - class posteriors tensor with shape=(batch, num_classes) - """ - if self.encoder_net.in_dim() == 4 and x.dim() == 3: - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - - x = self.encoder_net(x) - x = self.conformer_net(x) - - if self.encoder_net.out_dim() == 4: - x = x.view(x.size(0), -1, x.size(-1)) - - p = self.pool_net(x) - y = self.classif_net(p, y) - return y - - def forward_hid_feats( - self, - x, - y=None, - enc_layers=None, - conf_layers=None, - classif_layers=None, - return_output=False, - ): - """forwards hidden representations in the x-vector network""" - - if self.encoder_net.in_dim() == 4 and x.dim() == 3: - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - - h_enc, x = self.encoder_net.forward_hid_feats(x, enc_layers, return_output=True) - - h_conf, x = self.conformer_net.forward_hid_feats( - x, conf_layers, return_output=True - ) - - if not return_output and classif_layers is None: - return h_enc - - if self.encoder_net.out_dim() == 4: - x = x.view(x.size(0), -1, x.size(-1)) - - if self.proj is not None: - x = self.proj(x) - - p = self.pool_net(x) - h_classif = self.classif_net.forward_hid_feats( - p, y, classif_layers, return_output=return_output - ) - if return_output: 
- h_classif, y = h_classif - return h_enc, h_classif, y - - return h_enc, h_classif - - def extract_embed(self, x, chunk_length=0, embed_layer=None, detach_chunks=False): - if embed_layer is None: - embed_layer = self.embed_layer - - x = self._pre_enc(x) - # if self.encoder_net.in_dim() == 4 and x.dim() == 3: - # x = x.view(x.size(0), 1, x.size(1), x.size(2)) - x = eval_nnet_by_chunks( - x, self.encoder_net, chunk_length, detach_chunks=detach_chunks - ) - - if x.device != self.device: - x = x.to(self.device) - - x = self._post_enc(x) - - # if self.encoder_net.out_dim() == 4: - # x = x.view(x.size(0), -1, x.size(-1)) - - # if self.proj is not None: - # x = self.proj(x) - - p = self.pool_net(x) - y = self.classif_net.extract_embed(p, embed_layer) - return y - - def extract_embed_slidwin( - self, - x, - win_length, - win_shift, - snip_edges=False, - feat_frame_length=None, - feat_frame_shift=None, - chunk_length=0, - embed_layer=None, - detach_chunks=False, - ): - - if feat_frame_shift is not None: - # assume win_length/shift are in secs, transform to frames - # pass feat times from msecs to secs - feat_frame_shift = feat_frame_shift / 1000 - feat_frame_length = feat_frame_length / 1000 - - # get length and shift in number of feature frames - win_shift = win_shift / feat_frame_shift # this can be a float - win_length = ( - win_length - feat_frame_length + feat_frame_shift - ) / feat_frame_shift - assert win_shift > 0.5, "win-length should be longer than feat-frame-length" - - if embed_layer is None: - embed_layer = self.embed_layer - - in_time = x.size(-1) - x = self._pre_enc(x) - x = eval_nnet_by_chunks( - x, self.encoder_net, chunk_length, detach_chunks=detach_chunks - ) - - if x.device != self.device: - x = x.to(self.device) - - x = self._post_enc(x) - pin_time = x.size(-1) # time dim before pooling - downsample_factor = float(pin_time) / in_time - p = self.pool_net.forward_slidwin( - x, - downsample_factor * win_length, - downsample_factor * win_shift, - snip_edges=snip_edges, - ) - # (batch, pool_dim, time) - - p = p.transpose(1, 2).contiguous().view(-1, p.size(1)) - y = ( - self.classif_net.extract_embed(p, embed_layer) - .view(x.size(0), -1, self.embed_dim) - .transpose(1, 2) - .contiguous() - ) - - return y - - def compute_slidwin_timestamps( - self, - num_windows, - win_length, - win_shift, - snip_edges=False, - feat_frame_length=25, - feat_frame_shift=10, - feat_snip_edges=False, - ): - - P = self.compute_slidwin_left_padding( - win_length, - win_shift, - snip_edges, - feat_frame_length, - feat_frame_shift, - feat_snip_edges, - ) - - tstamps = ( - torch.as_tensor( - [ - [i * win_shift, i * win_shift + win_length] - for i in range(num_windows) - ] - ) - - P - ) - tstamps[tstamps < 0] = 0 - return tstamps - - def compute_slidwin_left_padding( - self, - win_length, - win_shift, - snip_edges=False, - feat_frame_length=25, - feat_frame_shift=10, - feat_snip_edges=False, - ): - - # pass feat times from msecs to secs - feat_frame_shift = feat_frame_shift / 1000 - feat_frame_length = feat_frame_length / 1000 - - # get length and shift in number of feature frames - H = win_shift / feat_frame_shift - L = (win_length - feat_frame_length + feat_frame_shift) / feat_frame_shift - assert L > 0.5, "win-length should be longer than feat-frame-length" - - # compute left padding in case of snip_edges is False - if snip_edges: - P1 = 0 - else: - Q = ( - L - H - ) / 2 # left padding in frames introduced by x-vector sliding window - P1 = ( - Q * feat_frame_shift - ) # left padding in secs introduced by 
x-vector sliding window - - if feat_snip_edges: - # left padding introduced when computing acoustic feats - P2 = 0 - else: - P2 = (feat_frame_length - feat_frame_shift) / 2 - - # total left padding - return P1 + P2 - - def get_config(self): - - enc_cfg = self.encoder_net.get_config() - pool_cfg = PF.get_config(self.pool_net) - conformer_cfg = self.conformer_net.get_config() - classif_cfg = self.classif_net.get_config() - - config = { - "encoder_cfg": enc_cfg, - "num_classes": self.num_classes, - "conformer_net": self.conformer_cfg, - "pool_net": pool_cfg, - "classif_net": self.classif_cfg, - "in_feats": self.in_feats, - } - - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) - encoder_net = TorchNALoader.load_from_cfg(cfg=cfg["encoder_cfg"]) - - for k in "encoder_cfg": - del cfg[k] - - model = cls(encoder_net, **cfg) - if state_dict is not None: - model.load_state_dict(state_dict) - - return model - - def rebuild_output_layer( - self, - num_classes=None, - loss_type="arc-softmax", - s=64, - margin=0.3, - margin_warmup_epochs=10, - ): - if (self.num_classes is not None and self.num_classes != num_classes) or ( - self.loss_type != loss_type - ): - # if we change the number of classes or the loss-type - # we need to reinitiate the last layer - self.classif_net.rebuild_output_layer( - num_classes, loss_type, s, margin, margin_warmup_epochs - ) - return - - # otherwise we just change the values of s, margin and margin_warmup - self.classif_net.set_margin(margin) - self.classif_net.set_margin_warmup_epochs(margin_warmup_epochs) - self.classif_net.set_s(s) - - def freeze_preembed_layers(self): - self.encoder_net.freeze() - if self.proj is not None: - self.proj.freeze() - - for param in self.pool_net.parameters(): - param.requires_grad = False - - layer_list = [l for l in range(self.embed_layer)] - self.classif_net.freeze_layers(layer_list) - - def train_mode(self, mode="ft-embed-affine"): - if mode == "ft-full" or mode == "train": - self.train() - return - - self.encoder_net.eval() - self.conformer_net.eval() - self.pool_net.eval() - self.classif_net.train() - layer_list = [l for l in range(self.embed_layer)] - self.classif_net.put_layers_in_eval_mode(layer_list) - - @staticmethod - def filter_args(**kwargs): - - valid_args = ("num_classes", "in_feats") - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - # get arguments for conformer - conformer_args = ConformerEncoderV1.filter_args(**kwargs["conformer_net"]) - args["corformer_net"] = conformer_args - # get arguments for pooling - pool_args = PF.filter_args(**kwargs["pool_net"]) - args["pool_net"] = pool_args - # get arguments for classif head - classif_args = ClassifHead.filter_args(**kwargs["classif_net"]) - args["classif_net"] = classif_args - - return args - - @staticmethod - def add_class_args(parser, prefix=None): - if prefix is not None: - outer_parser = parser - parser = ArgumentParser(prog="") - - CoformerEncoderV1.add_class_args(parser, prefix="conformer_net") - PF.add_class_args( - parser, prefix="pool_net", skip=["dim", "in_feats", "keepdim"] - ) - ClassifHead.add_class_args(parser, prefix="classif_net") - if prefix is not None: - outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), - help="xvector options", - ) - - @staticmethod - def filter_finetune_args(**kwargs): - valid_args = 
("loss_type", "s", "margin", "margin_warmup_epochs") - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - return args - - @staticmethod - def add_finetune_args(parser, prefix=None): - if prefix is not None: - outer_parser = parser - parser = ArgumentParser(prog="") - - parser.add_argument( - "--loss-type", - default="arc-softmax", - choices=["softmax", "arc-softmax", "cos-softmax", "subcenter-arc-softmax"], - help="loss type: softmax, arc-softmax, cos-softmax, subcenter-arc-softmax", - ) - - parser.add_argument("--s", default=64, type=float, help="scale for arcface") - - parser.add_argument( - "--margin", default=0.3, type=float, help="margin for arcface, cosface,..." - ) - - parser.add_argument( - "--margin-warmup-epochs", - default=10, - type=float, - help="number of epoch until we set the final margin", - ) - - parser.add_argument( - "--num-subcenters", - default=2, - type=float, - help="number of subcenters in subcenter losses", - ) - - if prefix is not None: - outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/vae/vae.py b/hyperion/torch/models/vae/vae.py index 32239718..86938bf2 100644 --- a/hyperion/torch/models/vae/vae.py +++ b/hyperion/torch/models/vae/vae.py @@ -6,13 +6,13 @@ import logging import torch -import torch.nn as nn import torch.distributions as pdf +import torch.nn as nn -from ...torch_model import TorchModel -from ...narchs import TorchNALoader -from ...layers import tensor2pdf as t2pdf from ...layers import pdf_storage +from ...layers import tensor2pdf as t2pdf +from ...narchs import TorchNALoader +from ...torch_model import TorchModel class VAE(TorchModel): diff --git a/hyperion/torch/models/vae/vq_vae.py b/hyperion/torch/models/vae/vq_vae.py index 9fcc22a0..e86cd04f 100644 --- a/hyperion/torch/models/vae/vq_vae.py +++ b/hyperion/torch/models/vae/vq_vae.py @@ -6,13 +6,13 @@ import logging import torch -import torch.nn as nn import torch.distributions as pdf +import torch.nn as nn -from ...torch_model import TorchModel -from ...narchs import TorchNALoader from ...layers import tensor2pdf as t2pdf from ...layers import vq +from ...narchs import TorchNALoader +from ...torch_model import TorchModel class VQVAE(TorchModel): diff --git a/hyperion/torch/models/wav2transducer/__init__.py b/hyperion/torch/models/wav2transducer/__init__.py new file mode 100644 index 00000000..e57b36ff --- /dev/null +++ b/hyperion/torch/models/wav2transducer/__init__.py @@ -0,0 +1,12 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" + +# from .hf_wav2vec2_transducer import HFWav2Vec2Transducer +from .hf_wav2vec2conformer_v1_rnn_transducer import HFWav2Vec2ConformerV1RNNTransducer +from .hf_wav2vec2rnn_rnn_transducer import HFWav2Vec2RNNRNNTransducer +from .hf_wav2vec2rnn_transducer import HFWav2Vec2RNNTransducer +from .wav2conformer_v1_rnn_transducer import Wav2ConformerV1RNNTransducer +from .wav2rnn_rnn_transducer import Wav2RNNRNNTransducer diff --git a/hyperion/torch/models/wav2transducer/beam_search.py b/hyperion/torch/models/wav2transducer/beam_search.py new file mode 100644 index 00000000..b23a0769 --- /dev/null +++ b/hyperion/torch/models/wav2transducer/beam_search.py @@ -0,0 +1,232 @@ +# Copyright 2021 Xiaomi Corp. 
(authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch + +from .hf_wav2transducer import HFWav2Transducer + + +def greedy_search(model: HFWav2Transducer, + encoder_out: torch.Tensor) -> List[int]: + """ + Args: + model: + An instance of `Transducer`. + encoder_out: + A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. + Returns: + Return the decoded result. + """ + assert encoder_out.ndim == 3 + + # support only batch_size == 1 for now + assert encoder_out.size(0) == 1, encoder_out.size(0) + blank_id = model.transducer.decoder.blank_id + device = model.device + + sos = torch.tensor([blank_id], device=device, + dtype=torch.int64).reshape(1, 1) + decoder_out, (h, c) = model.transducer.decoder(sos) + T = encoder_out.size(1) + t = 0 + hyp = [] + + sym_per_frame = 0 + sym_per_utt = 0 + + max_sym_per_utt = 1000 + max_sym_per_frame = 3 + + while t < T and sym_per_utt < max_sym_per_utt: + # fmt: off + current_encoder_out = encoder_out[:, t:t + 1, :] + # fmt: on + logits = model.transducer.joiner(current_encoder_out, decoder_out) + # logits is (1, 1, 1, vocab_size) + + log_prob = logits.log_softmax(dim=-1) + # log_prob is (1, 1, 1, vocab_size) + # TODO: Use logits.argmax() + y = log_prob.argmax() + if y != blank_id: + hyp.append(y.item()) + y = y.reshape(1, 1) + decoder_out, (h, c) = model.transducer.decoder(y, (h, c)) + + sym_per_utt += 1 + sym_per_frame += 1 + + if y == blank_id or sym_per_frame > max_sym_per_frame: + sym_per_frame = 0 + t += 1 + + return hyp + + +@dataclass +class Hypothesis: + ys: List[int] # the predicted sequences so far + log_prob: float # The log prob of ys + + # Optional decoder state. We assume it is LSTM for now, + # so the state is a tuple (h, c) + decoder_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + + +def beam_search( + model: HFWav2Transducer, + encoder_out: torch.Tensor, + beam: int = 5, +) -> List[int]: + """ + It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf + espnet/nets/beam_search_transducer.py#L247 is used as a reference. + Args: + model: + An instance of `Transducer`. + encoder_out: + A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. + beam: + Beam size. + Returns: + Return the decoded result. 
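+    Example (an illustrative sketch; assumes a trained `model` and a
+    1-utterance batch of encoder outputs `encoder_out` of shape (1, T, C)):
+      >>> hyp = beam_search(model, encoder_out, beam=5)
+      >>> # hyp is a list of integer token ids, ready to be mapped back
+      >>> # to text with the tokenizer used during training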
+ """ + assert encoder_out.ndim == 3 + + # support only batch_size == 1 for now + assert encoder_out.size(0) == 1, encoder_out.size(0) + blank_id = model.transducer.decoder.blank_id + device = model.device + + sos = torch.tensor([blank_id], device=device).reshape(1, 1) + decoder_out, (h, c) = model.transducer.decoder(sos) + T = encoder_out.size(1) + t = 0 + B = [Hypothesis(ys=[blank_id], log_prob=0.0, decoder_state=None)] + max_u = 20000 # terminate after this number of steps + u = 0 + + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, + torch.Tensor]]] = {} + + while t < T and u < max_u: + # fmt: off + current_encoder_out = encoder_out[:, t:t + 1, :] + # fmt: on + A = B + B = [] + # for hyp in A: + # for h in A: + # if h.ys == hyp.ys[:-1]: + # # update the score of hyp + # decoder_input = torch.tensor( + # [h.ys[-1]], device=device + # ).reshape(1, 1) + # decoder_out, _ = model.decoder( + # decoder_input, h.decoder_state + # ) + # logits = model.joiner(current_encoder_out, decoder_out) + # log_prob = logits.log_softmax(dim=-1) + # log_prob = log_prob.squeeze() + # hyp.log_prob += h.log_prob + log_prob[hyp.ys[-1]].item() + + while u < max_u: + y_star = max(A, key=lambda hyp: hyp.log_prob) + A.remove(y_star) + + # Note: y_star.ys is unhashable, i.e., cannot be used + # as a key into a dict + cached_key = "_".join(map(str, y_star.ys)) + + if cached_key not in cache: + decoder_input = torch.tensor([y_star.ys[-1]], + device=device).reshape(1, 1) + + decoder_out, decoder_state = model.transducer.decoder( + decoder_input, + y_star.decoder_state, + ) + cache[cached_key] = (decoder_out, decoder_state) + else: + decoder_out, decoder_state = cache[cached_key] + + logits = model.transducer.joiner(current_encoder_out, decoder_out) + log_prob = logits.log_softmax(dim=-1) + # log_prob is (1, 1, 1, vocab_size) + log_prob = log_prob.squeeze() + # Now log_prob is (vocab_size,) + + # If we choose blank here, add the new hypothesis to B. 
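+            # (B collects hypotheses whose next decision is blank, i.e. they
+            # are finished with frame t; A keeps hypotheses that may still
+            # emit more non-blank symbols at the current frame.)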
+ # Otherwise, add the new hypothesis to A + + # First, choose blank + skip_log_prob = log_prob[blank_id] + new_y_star_log_prob = y_star.log_prob + skip_log_prob.item() + # print("tuAB0", t, u, len(y_star.ys), y_star.log_prob, + # skip_log_prob.item(), new_y_star_log_prob) + # ys[:] returns a copy of ys + new_y_star = Hypothesis( + ys=y_star.ys[:], + log_prob=new_y_star_log_prob, + # Caution: Use y_star.decoder_state here + decoder_state=y_star.decoder_state, + ) + B.append(new_y_star) + + topk_log_prob = log_prob.topk(beam, dim=-1) + + # Second, choose other labels + #for i, v in enumerate(log_prob.tolist()): + for v, i in zip(*topk_log_prob): + v = v.item() + i = i.item() + if i == blank_id: + continue + new_ys = y_star.ys + [i] + new_log_prob = y_star.log_prob + v + new_hyp = Hypothesis( + ys=new_ys, + log_prob=new_log_prob, + decoder_state=decoder_state, + ) + A.append(new_hyp) + u += 1 + # check whether B contains more than "beam" elements more probable + # than the most probable in A + A_most_probable = max(A, key=lambda hyp: hyp.log_prob) + #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + B = sorted( + [hyp for hyp in B if hyp.log_prob > A_most_probable.log_prob], + key=lambda hyp: hyp.log_prob, + reverse=True, + ) + # print("tuAB2", + # t, + # u, + # len(A), + # A_most_probable.log_prob, + # len(B), + # flush=True) + if len(B) >= beam: + B = B[:beam] + break + t += 1 + best_hyp = max(B, key=lambda hyp: hyp.log_prob / len(hyp.ys[1:])) + ys = best_hyp.ys[1:] # [1:] to remove the blank + return ys diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py new file mode 100644 index 00000000..c4f65ba6 --- /dev/null +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py @@ -0,0 +1,373 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import contextlib +import logging +from dataclasses import dataclass +from typing import Dict, List, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...torch_model import TorchModel +from ...utils import remove_silence +from ..transducer import RNNTransducer, RNNTransducerOutput + + +class HFWav2RNNTransducer(TorchModel): + """Abstract Base class for RNN-T transducer models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + transducer: transducer model object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. 
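+
+    A minimal construction sketch (illustrative; the nested options are
+    assumptions and HFWav2Vec2RNNTransducer is one concrete subclass):
+      >>> model = HFWav2Vec2RNNTransducer(
+      ...     hf_feats={...},                     # wav2vec2 wrapper options
+      ...     transducer={"decoder": {...}},      # encoder is set to None,
+      ...     feat_fusion_method="weighted-avg",  # wav2vec feats feed the decoder
+      ... )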
+ """ + + def __init__( + self, + hf_feats: TorchModel, + transducer: Union[Dict, TorchModel], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + + super().__init__() + self.hf_feats = hf_feats + if isinstance(transducer, dict): + transducer["decoder"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + + transducer["encoder"] = None + transducer = RNNTransducer(**transducer) + else: + assert isinstance(transducer, RNNTransducer) + if transducer.encoder is None: + assert transducer.decoder.in_feats == hf_feats.hidden_size + + self.transducer = transducer + self.feat_fusion_start = feat_fusion_start + self.feat_fusion_method = feat_fusion_method + self._hf_context = contextlib.nullcontext() + self._make_fuser() + + def _make_fuser(self): + if self.feat_fusion_method == "last": + self.feat_fuser = None + return + + num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + layer_dim = self.hf_feats.hidden_size + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif self.feat_fusion_method == "linear": + self.feat_fuser = nn.Linear(num_layers, 1, bias=False) + self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers + elif self.feat_fusion_method == "cat": + self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) + + def _fuse_hid_feats(self, hid_feats): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. + + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + hid_feats = hid_feats[self.feat_fusion_start :] + if self.feat_fusion_method == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats) + elif self.feat_fusion_method == "last": + feats = hid_feats[-1] + + return feats + + def forward_feats( + self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False + ): + return_hid_states = ( + False + if return_feat_layers is None and self.feat_fusion_method == "last" + else True + ) + with self._hf_context: + hf_output = self.hf_feats( + x, + x_lengths, + return_hid_states=return_hid_states, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) + feat_lengths = hf_output["hidden_states_lengths"] + if return_hid_states: + hid_feats = hf_output["hidden_states"] + feats = self._fuse_hid_feats(hid_feats) + else: + hid_feats = None + feats = hf_output["last_hidden_state"] + + feats = feats.transpose(1, 2) + if return_feat_layers is not None: + # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) + # as the hidden features of the x-vector encoder. + hid_feats = [ + f.transpose(1, 2) + for i, f in enumerate(hid_feats) + if i in return_feat_layers + ] + else: + hid_feats = None + + return feats, hid_feats, feat_lengths + + def forward( + self, + x, + x_lengths=None, + y=None, + return_feat_layers=None, + # return_enc_layers=None, + return_logits=True, + ): + """Forward function. 
It returns the transducer losses.
+        It can also return the hidden representations of the wav2vec
+        feature extractor; in that case, they are added to the output
+        dataclass.
+
+        Args:
+          x: input features tensor with shape=(batch, in_feats, time)
+          x_lengths: time lengths of the features with shape=(batch,)
+          y: target token sequences as a k2.RaggedTensor.
+          return_feat_layers: list of integers indicating which wav2vec layers
+            we should return. If None, no wav2vec layers are returned.
+          return_enc_layers: list of integers indicating which encoder layers
+            we should return (currently unused in the signature). If None,
+            no encoder layers are returned.
+          return_logits: if True, it adds the logits to the output dictionary.
+        Returns:
+          Dataclass with losses, "h_enc" (list of hidden encoder layers) and
+          "h_feats" (wav2vec features).
+        """
+        feats, hid_feats, feat_lengths = self.forward_feats(
+            x, x_lengths, return_feat_layers
+        )
+
+        feats = feats.permute(0, 2, 1)  # (N, C, T) ->(N, T, C)
+        output = self.transducer(
+            feats,
+            feat_lengths,
+            y,
+        )
+
+        if return_feat_layers:
+            output.h_feats = hid_feats
+
+        return output
+
+    def infer(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor,
+        decoding_method="time_sync_beam_search",
+        beam_width: int = 5,
+        max_sym_per_frame: int = 3,
+        max_sym_per_utt: int = 1000,
+    ):
+        """
+        ASR token decoding.
+        Args:
+          x: input features with shape=(N, T, C)
+          x_lengths: number of valid feature frames with shape=(N,)
+          decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search
+          beam_width: beam size for the beam-search decoding methods.
+          max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame.
+          max_sym_per_utt: maximum number of symbols in a single utterance.
+        Returns:
+          List of lists with the integer indexes of the recognized symbols.
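+
+        Example (illustrative; assumes `model` is a trained subclass and
+        `x`, `x_lengths` hold a batch of waveforms):
+          >>> tokens = model.infer(x, x_lengths, decoding_method="greedy")
+          >>> # tokens[0] holds the token ids decoded for the first utterance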
+ """ + + feats, _, feat_lengths = self.forward_feats(x, x_lengths) + + feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + y = self.transducer.infer( + feats, + feat_lengths, + decoding_method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt, + ) + return y + + def freeze_feat_fuser(self): + if self.feat_fuser is None: + return + + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser.requires_grad = False + return + + for param in self.feat_fuser.parameters(): + param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode in ["ft-transducer", "ft-transducer-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_feat_fuser() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + else: + raise ValueError(f"invalid train_mode={mode}") + + logging.info("train mode set to %s", mode) + + if "nograd" in mode: + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext() + + self._train_mode = mode + + def _train(self, train_mode: str): + + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode in [ + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ]: + self.hf_feats.train() + self.transducer._train("full") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "hf_feats", + "transducer", + "feat_fusion_start", + "feat_fusion_method", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + def get_config(self): + hf_cfg = self.hf_feats.get_config() + tran_cfg = self.transducer.get_config() + del hf_cfg["class_name"] + del tran_cfg["class_name"] + config = { + "hf_feats": hf_cfg, + "transducer": tran_cfg, + "feat_fusion_start": self.feat_fusion_start, + "feat_fusion_method": self.feat_fusion_method, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def change_config(self, hf_feats, transducer): + logging.info("changing hf wav2transducer config") + self.hf_feats.change_config(**hf_feats) + self.transducer.change_config(**transducer) + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--feat-fusion-start", + default=0, + type=int, + help=""" + the input to x-vector model will fuse the wav2vec + layers from feat_fusion_start to + the wav2vec num_layers""", + ) + parser.add_argument( + "--feat-fusion-method", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=( + "method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, 
last]" + ), + ) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) + + @staticmethod + def add_infer_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNTransducer.add_infer_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_infer_args(**kwargs): + return RNNTransducer.filter_infer_args(**kwargs) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py new file mode 100644 index 00000000..4f1c500d --- /dev/null +++ b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py @@ -0,0 +1,386 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import contextlib +import logging + +from jsonargparse import ActionParser, ArgumentParser + +import torch +import torch.nn as nn + +from ...torch_model import TorchModel +from ...utils import remove_silence + +# import torch.nn.functional as nnf + +# from ..wav2xvectors.hf_wav2xvector import HFWav2XVector + + +class HFWav2Transducer(TorchModel): + """Abstract Base class for x-vector models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + transducer: transducer model object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + def __init__(self, + hf_feats, + transducer, + feat_fusion_start=0, + feat_fusion_method="weighted-avg"): + + super().__init__() + self.hf_feats = hf_feats + self.transducer = transducer + self.feat_fusion_start = feat_fusion_start + self.feat_fusion_method = feat_fusion_method + self._hf_context = contextlib.nullcontext() + self._make_fuser() + + def _make_fuser(self): + if self.feat_fusion_method == "last": + self.feat_fuser = None + return + + num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + layer_dim = self.hf_feats.hidden_size + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif self.feat_fusion_method == "linear": + self.feat_fuser = nn.Linear(num_layers, 1, bias=False) + self.feat_fuser.weight.data = torch.ones(1, + num_layers) / num_layers + elif self.feat_fusion_method == "cat": + self.feat_fuser = nn.Linear(num_layers * layer_dim, + layer_dim, + bias=False) + + def _fuse_hid_feats(self, hid_feats): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. 
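+            Each tensor is expected to have shape (batch, time, channels),
+            as produced by the Hugging Face wrapper.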
+ + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + hid_feats = hid_feats[self.feat_fusion_start:] + if self.feat_fusion_method == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats) + elif self.feat_fusion_method == "last": + feats = hid_feats[-1] + + return feats + + def compute_prototype_affinity(self): + return self.transducer.compute_prototype_affinity() + + # def update_loss_margin(self, epoch): + # """Updates the value of the margin in AAM/AM-softmax losses + # given the epoch number + + # Args: + # epoch: epoch which is about to start + # """ + # self.transducer.update_loss_margin(epoch) + + def rebuild_output_layer( + self, + num_classes=None, + loss_type="arc-softmax", + cos_scale=64, + margin=0.3, + margin_warmup_epochs=10, + intertop_k=5, + intertop_margin=0.0, + num_subcenters=2, + ): + self.transducer.rebuild_output_layer( + num_classes=num_classes, + loss_type=loss_type, + cos_scale=cos_scale, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, + ) + + def forward_feats(self, + x, + x_lengths, + return_feat_layers=None, + chunk_length=0, + detach_chunks=False): + return_hid_states = (False if return_feat_layers is None + and self.feat_fusion_method == "last" else True) + with self._hf_context: + hf_output = self.hf_feats( + x, + x_lengths, + return_hid_states=return_hid_states, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) + feat_lengths = hf_output["hidden_states_lengths"] + if return_hid_states: + hid_feats = hf_output["hidden_states"] + feats = self._fuse_hid_feats(hid_feats) + else: + hid_feats = None + feats = hf_output["last_hidden_state"] + + feats = feats.transpose(1, 2) + if return_feat_layers is not None: + # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) + # as the hidden features of the x-vector encoder. + hid_feats = [ + f.transpose(1, 2) for i, f in enumerate(hid_feats) + if i in return_feat_layers + ] + else: + hid_feats = None + + return feats, hid_feats, feat_lengths + + def forward( + self, + x, + x_lengths=None, + y=None, + return_feat_layers=None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, + ): + """Forward function. If returns the logits posteriors of the classes. + It can also returns the hidden representations in the wav2vec feature extractor, + the x-vector encoder and the + classification head. In this case the ouput variable is a dictionary. + + Args: + x: input features tensor with shape=(batch, in_feats, time) + x_lengths: time lengths of the features with shape=(batch,) + y: target classes torch.long tensor with shape=(batch,) + return_feat_layers: list of integers indicating, which wav2vec layers + we should return. If None, no wav2vec layers are returned. + return_enc_layers: list of integers indicating, which encoder layers + we should return. If None, no encoder layers are returned. 
return_classif_layers: list of integers indicating which classification head
+            layers we should return. If None, no head layers are returned.
+          return_logits: if True, it adds the logits to the output dictionary.
+        Returns:
+          Tuple (output, loss) with the transducer output and its loss. If
+          hidden layers are requested, output is a dictionary that also
+          contains "h_feats" (wav2vec features).
+        """
+        feats, hid_feats, feat_lengths = self.forward_feats(
+            x, x_lengths, return_feat_layers)
+
+        feats = feats.permute(0, 2, 1)  # (N, C, T) ->(N, T, C)
+
+        output, loss = self.transducer(
+            feats,
+            feat_lengths,
+            y,
+        )
+
+        if not return_feat_layers:
+            return output, loss
+
+        if not isinstance(output, dict):
+            # if the transducer just returned the logits, we put them into a
+            # dictionary to append the hidden feats later.
+            output = {"logits": output}
+
+        output["h_feats"] = hid_feats
+        return output, loss
+
+    def extract_embed(
+        self,
+        x,
+        x_lengths=None,
+        vad_samples=None,
+        hf_chunk_length=0,
+        xvec_chunk_length=0,
+        embed_layer=None,
+        detach_chunks=False,
+    ):
+
+        if vad_samples is not None:
+            x, x_lengths = remove_silence(x, x_lengths)
+
+        feats, _, feat_lengths = self.forward_feats(
+            x,
+            x_lengths,
+            chunk_length=hf_chunk_length,
+            detach_chunks=detach_chunks)
+        xvec_chunk_length = int(xvec_chunk_length *
+                                self.hf_feats.sample_frequency *
+                                feats.size(-1) // x.size(-1))
+        return self.transducer.extract_embed(feats, feat_lengths,
+                                             xvec_chunk_length, embed_layer,
+                                             detach_chunks)
+
+    def freeze_feat_fuser(self):
+        if self.feat_fuser is None:
+            return
+
+        if self.feat_fusion_method == "weighted-avg":
+            self.feat_fuser.requires_grad = False
+            return
+
+        for param in self.feat_fuser.parameters():
+            param.requires_grad = False
+
+    def freeze_hf_feats(self):
+        self.hf_feats.freeze()
+
+    def freeze_hf_feature_encoder(self):
+        self.hf_feats.freeze_feature_encoder()
+
+    def set_train_mode(self, mode):
+        if mode == self._train_mode:
+            return
+
+        if mode == "full":
+            self.unfreeze()
+        elif mode == "frozen":
+            self.freeze()
+        # elif mode == "ft-embed-affine":
+        #     self.unfreeze()
+        #     self.freeze_feat_fuser()
+        #     self.freeze_hf_feats()
+        #     self.transducer.freeze_preembed_layers()
+        elif mode in ["ft-transducer", "ft-transducer-nograd"]:
+            self.unfreeze()
+            self.freeze_hf_feats()
+            self.freeze_feat_fuser()
+        elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]:
+            self.unfreeze()
+            self.freeze_hf_feats()
+        elif mode == "hf-feat-extractor-frozen":
+            self.unfreeze()
+            self.freeze_hf_feature_encoder()
+        else:
+            raise ValueError(f"invalid train_mode={mode}")
+
+        logging.info("train mode set to %s", mode)
+
+        if "nograd" in mode:
+            logging.info("using torch.no_grad for hf_feats")
+            self._hf_context = torch.no_grad()
+        else:
+            self._hf_context = contextlib.nullcontext()
+
+        self._train_mode = mode
+
+    def _train(self, train_mode: str):
+
+        if train_mode in ["full", "frozen"]:
+            super()._train(train_mode)
+        # elif train_mode == "ft-embed-affine":
+        #     self.hf_feats.train()
+        #     self.transducer._train("ft-embed_affine")
+        elif train_mode in [
+                "ft-transducer",
+                "hf-feats-frozen",
+                "ft-transducer-nograd",
+                "hf-feats-frozen-nograd",
+                "hf-feat-extractor-frozen",
+        ]:
+            self.hf_feats.train()
+            self.transducer._train("full")
+        else:
+            raise ValueError(f"invalid train_mode={train_mode}")
+
+    @staticmethod
+    def valid_train_modes():
+        return [
+            "full",
+            "frozen",
+            "ft-embed-affine",
+            "ft-transducer",
+            "hf-feats-frozen",
+            # the "-nograd" variants also wrap the hf_feats forward pass in
+            # torch.no_grad() to save activation memory
+            "ft-transducer-nograd",
+            "hf-feats-frozen-nograd",
+            "hf-feat-extractor-frozen",
+        ]
+
+    @staticmethod
+    def filter_args(**kwargs):
+        valid_args = (
+            "hf_feats",
+            "transducer",
+            "feat_fusion_start",
+            "feat_fusion_method",
+        )
+        args = dict((k, kwargs[k]) for k in valid_args if k in kwargs)
+        return args
+
+    def get_config(self):
+        hf_cfg = self.hf_feats.get_config()
+        tran_cfg = self.transducer.get_config()
+        del hf_cfg["class_name"]
+        # del tran_cfg["class_name"]
+        config = {
+            "hf_feats": hf_cfg,
+            "transducer": tran_cfg,
+            "feat_fusion_start": self.feat_fusion_start,
+            "feat_fusion_method": self.feat_fusion_method,
+        }
+
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def change_config(self, hf_feats, transducer):
+        logging.info("changing hf wav2transducer config")
+        self.hf_feats.change_config(**hf_feats)
+        self.transducer.change_config(**transducer)
+
+    @staticmethod
+    def add_class_args(parser, prefix=None, skip=set()):
+
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        parser.add_argument(
+            "--feat-fusion-start",
+            default=0,
+            type=int,
+            help=("the input to the transducer will fuse the wav2vec layers "
+                  "from feat_fusion_start to the wav2vec num_layers"),
+        )
+        parser.add_argument(
+            "--feat-fusion-method",
+            default="weighted-avg",
+            choices=["weighted-avg", "linear", "cat", "last"],
+            help=("method to fuse the hidden layers from the wav2vec model "
+                  "in [weighted-avg, linear, cat, last]"),
+        )
+
+        if prefix is not None:
+            outer_parser.add_argument(
+                "--" + prefix,
+                action=ActionParser(parser=parser),
+            )
diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py
new file mode 100644
index 00000000..123c9de8
--- /dev/null
+++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py
@@ -0,0 +1,101 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+from typing import Dict, Optional, Union
+
+from jsonargparse import ActionParser, ArgumentParser
+
+import torch
+import torch.nn as nn
+
+from ...tpm import HFWav2Vec2
+from ..transducer import Transducer
+from .hf_wav2transducer import HFWav2Transducer
+
+
+class HFWav2Vec2Transducer(HFWav2Transducer):
+    """Class for transducer ASR models that use Wav2Vec2 features as input.
+
+    Attributes:
+      hf_feats: HFWav2Vec configuration dictionary or object.
+        This is a wrapper over the Hugging Face Wav2Vec model.
+      transducer: Transducer configuration dictionary or object.
+      feat_fusion_start: the input to the transducer will fuse the wav2vec layers from "feat_fusion_start" to
+        the wav2vec "num_layers".
+      feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more
+        than one layer is used.
+ """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, Transducer], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + if isinstance(transducer, dict): + transducer["decoder"]["in_feats"] = hf_feats.hidden_size + transducer["joiner"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + transducer = Transducer(**transducer) + else: + assert isinstance(transducer, Transducer) + assert transducer.decoder.in_feats == hf_feats.hidden_size + assert transducer.joiner.in_feats == hf_feats.hidden_size + + super().__init__(hf_feats, transducer, feat_fusion_start, feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2Transducer.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = Transducer.filter_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + Transducer.add_class_args(parser, prefix="transducer") + HFWav2Transducer.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = Transducer.filter_finetune_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + Transducer.add_finetune_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2conformer_v1_rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2conformer_v1_rnn_transducer.py new file mode 100644 index 00000000..3b18de3a --- /dev/null +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2conformer_v1_rnn_transducer.py @@ -0,0 +1,106 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Dict, Optional, Union + +from jsonargparse import ActionParser, ArgumentParser + +import torch +import torch.nn as nn + +from ...tpm import HFWav2Vec2 +from ..transducer import ConformerV1RNNTransducer +from .hf_wav2rnn_transducer import HFWav2RNNTransducer + + +class HFWav2Vec2ConformerV1RNNTransducer(HFWav2RNNTransducer): + """Class for Conformer based RNN-T with Wav2Vec2 features + + Attributes: + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + transducer: Transducer configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". 
+ feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, ConformerV1RNNTransducer], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + if isinstance(transducer, dict): + transducer["encoder"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + + transducer = ConformerV1RNNTransducer(**transducer) + else: + assert isinstance(transducer, ConformerV1RNNTransducer) + + super().__init__(hf_feats, transducer, feat_fusion_start, + feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2RNNTransducer.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ConformerV1RNNTransducer.filter_args( + **kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + ConformerV1RNNTransducer.add_class_args(parser, + prefix="transducer", + skip={"in_feats"}) + HFWav2RNNTransducer.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ConformerV1RNNTransducer.filter_finetune_args( + **kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + ConformerV1RNNTransducer.add_finetune_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py new file mode 100644 index 00000000..d9eeaebe --- /dev/null +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py @@ -0,0 +1,105 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Dict, Optional, Union + +from jsonargparse import ActionParser, ArgumentParser + +import torch +import torch.nn as nn + +from ...tpm import HFWav2Vec2 +from ..transducer import RNNRNNTransducer +from .hf_wav2rnn_transducer import HFWav2RNNTransducer + + +class HFWav2Vec2RNNRNNTransducer(HFWav2RNNTransducer): + """Class for RNN-T with Wav2Vec2 features + + Attributes: + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + transducer: Transducer configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". 
+ feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, RNNRNNTransducer], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + if isinstance(transducer, dict): + transducer["encoder"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + + transducer = RNNRNNTransducer(**transducer) + else: + assert isinstance(transducer, RNNRNNTransducer) + + super().__init__(hf_feats, transducer, feat_fusion_start, + feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2RNNTransducer.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNRNNTransducer.filter_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + RNNRNNTransducer.add_class_args(parser, + prefix="transducer", + skip={"in_feats"}) + HFWav2RNNTransducer.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNRNNTransducer.filter_finetune_args( + **kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + RNNRNNTransducer.add_finetune_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py new file mode 100644 index 00000000..dac8c776 --- /dev/null +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py @@ -0,0 +1,90 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFWav2Vec2 +from ..transducer import RNNTransducer +from .hf_wav2rnn_transducer import HFWav2RNNTransducer + + +class HFWav2Vec2RNNTransducer(HFWav2RNNTransducer): + """Class for RNN-T with Wav2Vec2 features + + Attributes: + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + transducer: Transducer configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". 
+ feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, RNNTransducer], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + super().__init__(hf_feats, transducer, feat_fusion_start, feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2RNNTransducer.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNTransducer.filter_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + RNNTransducer.add_class_args(parser, prefix="transducer") + HFWav2RNNTransducer.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNTransducer.filter_finetune_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + RNNTransducer.add_finetune_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2transducer/wav2conformer_v1_rnn_transducer.py b/hyperion/torch/models/wav2transducer/wav2conformer_v1_rnn_transducer.py new file mode 100644 index 00000000..330aea3b --- /dev/null +++ b/hyperion/torch/models/wav2transducer/wav2conformer_v1_rnn_transducer.py @@ -0,0 +1,73 @@ +""" + Copyright 2024 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFWav2Vec2 +from ..transducer import ConformerV1RNNTransducer +from .wav2rnn_transducer import Wav2RNNTransducer + + +class Wav2ConformerV1RNNTransducer(Wav2RNNTransducer): + """Class for RNN-T with ConformerV1 Encoder and acoustic feature input + + Attributes: + Attributes: + feats: feature extractor object of class AudioFeatsMVN or dictionary of options to instantiate AudioFeatsMVN object. + transducer: Transducer configuration dictionary or object. 
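+
+    A minimal construction sketch (illustrative; the nested option names are
+    assumptions, not a complete configuration):
+      >>> model = Wav2ConformerV1RNNTransducer(
+      ...     feats={"audio_feats": {...}, "mvn": {...}},  # AudioFeatsMVN options
+      ...     transducer={"encoder": {...}, "decoder": {...}},
+      ... )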
+ """ + + def __init__( + self, + feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, ConformerV1RNNTransducer], + ): + + if isinstance(transducer, dict): + if "class_name" in transducer: + del transducer["class_name"] + + transducer = ConformerV1RNNTransducer(**transducer) + else: + assert isinstance(transducer, ConformerV1RNNTransducer) + + super().__init__(feats, transducer) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + Wav2RNNTransducer.add_class_args(parser) + ConformerV1RNNTransducer.add_class_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = ConformerV1RNNTransducer.filter_finetune_args( + **kwargs["transducer"] + ) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ConformerV1RNNTransducer.add_finetune_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2transducer/wav2rnn_rnn_transducer.py b/hyperion/torch/models/wav2transducer/wav2rnn_rnn_transducer.py new file mode 100644 index 00000000..25890d78 --- /dev/null +++ b/hyperion/torch/models/wav2transducer/wav2rnn_rnn_transducer.py @@ -0,0 +1,71 @@ +""" + Copyright 2024 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFWav2Vec2 +from ..transducer import RNNRNNTransducer +from .wav2rnn_transducer import Wav2RNNTransducer + + +class Wav2RNNRNNTransducer(Wav2RNNTransducer): + """Class for RNN-T LSTM encoder and acoustic feature input + + Attributes: + Attributes: + feats: feature extractor object of class AudioFeatsMVN or dictionary of options to instantiate AudioFeatsMVN object. + transducer: Transducer configuration dictionary or object. 
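+
+    Training call sketch (illustrative; `y` must be a k2.RaggedTensor of
+    token ids, as required by Wav2RNNTransducer.forward):
+      >>> output = model(x, x_lengths, y)  # acoustic feats are computed inside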
+ """ + + def __init__( + self, + feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, RNNRNNTransducer], + ): + + if isinstance(transducer, dict): + if "class_name" in transducer: + del transducer["class_name"] + + transducer = RNNRNNTransducer(**transducer) + else: + assert isinstance(transducer, RNNRNNTransducer) + + super().__init__(feats, transducer) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + Wav2RNNTransducer.add_class_args(parser) + RNNRNNTransducer.add_class_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = RNNRNNTransducer.filter_finetune_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNRNNTransducer.add_finetune_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py b/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py new file mode 100644 index 00000000..bce8e368 --- /dev/null +++ b/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py @@ -0,0 +1,109 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from typing import Dict, Optional, Tuple, Union + +try: + import k2 +except ModuleNotFoundError: + from ...utils import dummy_k2 as k2 + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...narchs import AudioFeatsMVN +from ...torch_model import TorchModel +from ...utils import remove_silence + + +class Wav2RNNTransducer(TorchModel): + """Base class for models that integrate the acoustic feature extractor and and + RNN-T Transducer that takes acoustic features as input + + Attributes: + feats: feature extractor object of class AudioFeatsMVN or dictionary of options to instantiate AudioFeatsMVN object. 
+ transducer: RNN-T transducer model + """ + + def __init__(self, feats, transducer): + + super().__init__() + + if isinstance(feats, dict): + feats = AudioFeatsMVN.filter_args(**feats) + feats["trans"] = False + feats = AudioFeatsMVN(**feats) + else: + assert isinstance(feats, AudioFeatsMVN) + + self.feats = feats + self.transducer = transducer + + def forward( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: k2.RaggedTensor, + vad_samples: Optional[torch.Tensor] = None, + vad_feats: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + + if vad_samples is not None: + x, x_lengths = remove_silence(x, x_lengths) + feats, feat_lengths = self.feats(x, x_lengths) + if vad_feats is not None: + feats, feat_lengths = remove_silence(feats, feat_lengths) + + return self.transducer(feats, feat_lengths, y) + + def set_train_mode(self, mode): + self.transducer.set_train_mode(mode) + + def get_config(self): + feat_cfg = self.feats.get_config() + xvector_cfg = self.transducer.get_config() + config = { + "feats": feat_cfg, + "transducer": xvector_cfg, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def filter_args(**kwargs): + """Filters Wav2XVector class arguments from arguments dictionary. + + Args: + kwargs: Arguments dictionary. + + Returns: + Dictionary with SpecAugment options. + """ + valid_args = ( + "feats", + "transducer", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + """Adds Wav2XVector options common to all child classes to parser. + + Args: + parser: Arguments parser + prefix: Options prefix. + """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + AudioFeatsMVN.add_class_args(parser, prefix="feats") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/__init__.py b/hyperion/torch/models/wav2xvectors/__init__.py new file mode 100644 index 00000000..6bafd26d --- /dev/null +++ b/hyperion/torch/models/wav2xvectors/__init__.py @@ -0,0 +1,21 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" + +from .hf_hubert2conformer_v1_xvector import HFHubert2ConformerV1XVector +from .hf_hubert2resnet1d_xvector import HFHubert2ResNet1dXVector +from .hf_wav2vec2conformer_v1_xvector import HFWav2Vec2ConformerV1XVector +from .hf_wav2vec2resnet1d_xvector import HFWav2Vec2ResNet1dXVector +from .hf_wavlm2conformer_v1_xvector import HFWavLM2ConformerV1XVector +from .hf_wavlm2resnet1d_xvector import HFWavLM2ResNet1dXVector +from .wav2conformer_v1_xvector import Wav2ConformerV1XVector + +# from .wav2efficient_net_xvector import Wav2EfficientNetXVector +# from .wav2transformer_xvector_v1 import Wav2TransformerXVectorV1 +# from .wav2spinenet_xvector import Wav2SpineNetXVector +from .wav2resnet1d_xvector import Wav2ResNet1dXVector + +# from .wav2tdnn_xvector import Wav2TDNNXVector +from .wav2resnet_xvector import Wav2ResNetXVector diff --git a/hyperion/torch/models/wav2xvectors/hf_hubert2conformer_v1_xvector.py b/hyperion/torch/models/wav2xvectors/hf_hubert2conformer_v1_xvector.py new file mode 100644 index 00000000..2dc37052 --- /dev/null +++ b/hyperion/torch/models/wav2xvectors/hf_hubert2conformer_v1_xvector.py @@ -0,0 +1,94 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + 
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+from typing import Dict, Optional, Union
+
+import torch
+import torch.nn as nn
+from jsonargparse import ActionParser, ArgumentParser
+
+from ...narchs import FeatFuserMVN
+from ...tpm import HFHubert
+from ..xvectors import ConformerV1XVector
+from .hf_wav2xvector import HFWav2XVector
+
+
+class HFHubert2ConformerV1XVector(HFWav2XVector):
+    """Class extracting Hubert + ConformerV1 x-vectors from waveform.
+
+    Attributes:
+      hf_feats: HFHubert configuration dictionary or object.
+          This is a wrapper over the Hugging Face Hubert model.
+      feat_fuser: FeatFuserMVN configuration dictionary or object.
+      xvector: ConformerV1XVector configuration dictionary or object.
+      feat_fusion_start: the input to x-vector model will fuse the Hubert layers from "feat_fusion_start" to
+          the Hubert "num_layers".
+    """
+
+    def __init__(
+        self,
+        hf_feats: Union[Dict, HFHubert],
+        feat_fuser: Union[Dict, FeatFuserMVN],
+        xvector: Union[Dict, ConformerV1XVector],
+        feat_fusion_start: int = 0,
+    ):
+        if isinstance(hf_feats, dict):
+            hf_feats = HFHubert(**hf_feats)
+        else:
+            assert isinstance(hf_feats, HFHubert)
+
+        if isinstance(xvector, dict):
+            xvector["encoder"]["in_feats"] = hf_feats.hidden_size
+            xvector = ConformerV1XVector(**xvector)
+        else:
+            assert isinstance(xvector, ConformerV1XVector)
+            assert xvector.encoder_net.in_feats == hf_feats.hidden_size
+
+        super().__init__(hf_feats, feat_fuser, xvector, feat_fusion_start)
+
+    @staticmethod
+    def filter_args(**kwargs):
+        base_args = HFWav2XVector.filter_args(**kwargs)
+        child_args = HFHubert.filter_args(**kwargs["hf_feats"])
+        base_args["hf_feats"] = child_args
+        child_args = ConformerV1XVector.filter_args(**kwargs["xvector"])
+        base_args["xvector"] = child_args
+        return base_args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        HFHubert.add_class_args(parser, prefix="hf_feats")
+        ConformerV1XVector.add_class_args(parser, prefix="xvector")
+        HFWav2XVector.add_class_args(parser)
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
+
+    @staticmethod
+    def filter_finetune_args(**kwargs):
+        base_args = {}
+        child_args = HFHubert.filter_finetune_args(**kwargs["hf_feats"])
+        base_args["hf_feats"] = child_args
+        child_args = ConformerV1XVector.filter_finetune_args(**kwargs["xvector"])
+        base_args["xvector"] = child_args
+        return base_args
+
+    @staticmethod
+    def add_finetune_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        HFHubert.add_finetune_args(parser, prefix="hf_feats")
+        ConformerV1XVector.add_finetune_args(parser, prefix="xvector")
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
diff --git a/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py
new file mode 100644
index 00000000..a9495ba5
--- /dev/null
+++ b/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py
@@ -0,0 +1,94 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+from typing import Dict, Optional, Union
+
+import torch
+import torch.nn as nn
+from jsonargparse import ActionParser, ArgumentParser
+
+from ...narchs import FeatFuserMVN
+from ...tpm import HFHubert
+from ..xvectors import ResNet1dXVector
+from .hf_wav2xvector import HFWav2XVector
+
+
+class HFHubert2ResNet1dXVector(HFWav2XVector):
+    """Class extracting Hubert + ResNet1d x-vectors from waveform.
+
+    Attributes:
+      hf_feats: HFHubert configuration dictionary or object.
+          This is a wrapper over the Hugging Face Hubert model.
+      feat_fuser: FeatFuserMVN configuration dictionary or object.
+      xvector: ResNet1dXVector configuration dictionary or object.
+      feat_fusion_start: the input to x-vector model will fuse the Hubert layers from "feat_fusion_start" to
+          the Hubert "num_layers".
+    """
+
+    def __init__(
+        self,
+        hf_feats: Union[Dict, HFHubert],
+        feat_fuser: Union[Dict, FeatFuserMVN],
+        xvector: Union[Dict, ResNet1dXVector],
+        feat_fusion_start: int = 0,
+    ):
+        if isinstance(hf_feats, dict):
+            hf_feats = HFHubert(**hf_feats)
+        else:
+            assert isinstance(hf_feats, HFHubert)
+
+        if isinstance(xvector, dict):
+            xvector["resnet_enc"]["in_feats"] = hf_feats.hidden_size
+            xvector = ResNet1dXVector(**xvector)
+        else:
+            assert isinstance(xvector, ResNet1dXVector)
+            assert xvector.encoder_net.in_feats == hf_feats.hidden_size
+
+        super().__init__(hf_feats, feat_fuser, xvector, feat_fusion_start)
+
+    @staticmethod
+    def filter_args(**kwargs):
+        base_args = HFWav2XVector.filter_args(**kwargs)
+        child_args = HFHubert.filter_args(**kwargs["hf_feats"])
+        base_args["hf_feats"] = child_args
+        child_args = ResNet1dXVector.filter_args(**kwargs["xvector"])
+        base_args["xvector"] = child_args
+        return base_args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        HFHubert.add_class_args(parser, prefix="hf_feats")
+        ResNet1dXVector.add_class_args(parser, prefix="xvector")
+        HFWav2XVector.add_class_args(parser)
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
+
+    @staticmethod
+    def filter_finetune_args(**kwargs):
+        base_args = {}
+        child_args = HFHubert.filter_finetune_args(**kwargs["hf_feats"])
+        base_args["hf_feats"] = child_args
+        child_args = ResNet1dXVector.filter_finetune_args(**kwargs["xvector"])
+        base_args["xvector"] = child_args
+        return base_args
+
+    @staticmethod
+    def add_finetune_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        HFHubert.add_finetune_args(parser, prefix="hf_feats")
+        ResNet1dXVector.add_finetune_args(parser, prefix="xvector")
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2vec2conformer_v1_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2vec2conformer_v1_xvector.py
new file mode 100644
index 00000000..1526c467
--- /dev/null
+++ b/hyperion/torch/models/wav2xvectors/hf_wav2vec2conformer_v1_xvector.py
@@ -0,0 +1,97 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+from typing import Dict, Optional, Union
+
+import torch
+import torch.nn as nn
+from jsonargparse import ActionParser, ArgumentParser
+
+from ...narchs import FeatFuserMVN
+from ...tpm import HFWav2Vec2
+from ..xvectors import ConformerV1XVector
+from .hf_wav2xvector import HFWav2XVector
+
+
+class HFWav2Vec2ConformerV1XVector(HFWav2XVector):
+    """Class extracting Wav2Vec2 + ConformerV1 x-vectors from waveform.
+
+    Attributes:
+      hf_feats: HFWav2Vec2 configuration dictionary or object.
+          This is a wrapper over the Hugging Face Wav2Vec2 model.
+      feat_fuser: FeatFuserMVN configuration dictionary or object.
+      xvector: ConformerV1XVector configuration dictionary or object.
+      feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to
+          the wav2vec "num_layers".
+    """
+
+    def __init__(
+        self,
+        hf_feats: Union[Dict, HFWav2Vec2],
+        feat_fuser: Union[Dict, FeatFuserMVN],
+        xvector: Union[Dict, ConformerV1XVector],
+        feat_fusion_start: int = 0,
+    ):
+        if isinstance(hf_feats, dict):
+            if "class_name" in hf_feats:
+                del hf_feats["class_name"]
+            hf_feats = HFWav2Vec2(**hf_feats)
+        else:
+            assert isinstance(hf_feats, HFWav2Vec2)
+
+        if isinstance(xvector, dict):
+            xvector["encoder"]["in_feats"] = hf_feats.hidden_size
+            if "class_name" in xvector:
+                del xvector["class_name"]
+            xvector = ConformerV1XVector(**xvector)
+        else:
+            assert isinstance(xvector, ConformerV1XVector)
+            assert xvector.encoder_net.in_feats == hf_feats.hidden_size
+
+        super().__init__(hf_feats, feat_fuser, xvector, feat_fusion_start)
+
+    @staticmethod
+    def filter_args(**kwargs):
+        base_args = HFWav2XVector.filter_args(**kwargs)
+        child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"])
+        base_args["hf_feats"] = child_args
+        child_args = ConformerV1XVector.filter_args(**kwargs["xvector"])
+        base_args["xvector"] = child_args
+        return base_args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        HFWav2Vec2.add_class_args(parser, prefix="hf_feats")
+        ConformerV1XVector.add_class_args(parser, prefix="xvector")
+        HFWav2XVector.add_class_args(parser)
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
+
+    @staticmethod
+    def filter_finetune_args(**kwargs):
+        base_args = {}
+        child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"])
+        base_args["hf_feats"] = child_args
+        child_args = ConformerV1XVector.filter_finetune_args(**kwargs["xvector"])
+        base_args["xvector"] = child_args
+        return base_args
+
+    @staticmethod
+    def add_finetune_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats")
+        ConformerV1XVector.add_finetune_args(parser, prefix="xvector")
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py
new file mode 100644
index 00000000..3709e980
--- /dev/null
+++ b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py
@@ -0,0 +1,98 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+from typing import Dict, Optional, Union
+
+import torch
+import torch.nn as nn
+from jsonargparse import ActionParser, ArgumentParser
+
+from ...narchs import FeatFuserMVN
+from ...tpm import HFWav2Vec2
+from ..xvectors import ResNet1dXVector
+from .hf_wav2xvector import HFWav2XVector
+
+
+class HFWav2Vec2ResNet1dXVector(HFWav2XVector):
+    """Class extracting Wav2Vec2 + ResNet1d x-vectors from waveform.
+
+    Attributes:
+      hf_feats: HFWav2Vec2 configuration dictionary or object.
+          This is a wrapper over the Hugging Face Wav2Vec2 model.
+      feat_fuser: FeatFuserMVN configuration dictionary or object.
+      xvector: ResNet1dXVector configuration dictionary or object.
+      feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to
+          the wav2vec "num_layers".
+    """
+
+    def __init__(
+        self,
+        hf_feats: Union[Dict, HFWav2Vec2],
+        feat_fuser: Union[Dict, FeatFuserMVN],
+        xvector: Union[Dict, ResNet1dXVector],
+        feat_fusion_start: int = 0,
+    ):
+        if isinstance(hf_feats, dict):
+            if "class_name" in hf_feats:
+                del hf_feats["class_name"]
+            hf_feats = HFWav2Vec2(**hf_feats)
+        else:
+            assert isinstance(hf_feats, HFWav2Vec2)
+
+        if isinstance(xvector, dict):
+            xvector["resnet_enc"]["in_feats"] = hf_feats.hidden_size
+            if "class_name" in xvector:
+                del xvector["class_name"]
+            xvector = ResNet1dXVector(**xvector)
+        else:
+            assert isinstance(xvector, ResNet1dXVector)
+            assert xvector.encoder_net.in_feats == hf_feats.hidden_size
+
+        super().__init__(hf_feats, feat_fuser, xvector, feat_fusion_start)
+
+    @staticmethod
+    def filter_args(**kwargs):
+        base_args = HFWav2XVector.filter_args(**kwargs)
+        child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"])
+        base_args["hf_feats"] = child_args
+        child_args = ResNet1dXVector.filter_args(**kwargs["xvector"])
+        base_args["xvector"] = child_args
+        return base_args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        HFWav2Vec2.add_class_args(parser, prefix="hf_feats")
+        ResNet1dXVector.add_class_args(parser, prefix="xvector")
+        HFWav2XVector.add_class_args(parser)
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
+
+    @staticmethod
+    def filter_finetune_args(**kwargs):
+        base_args = {}
+        child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"])
+        base_args["hf_feats"] = child_args
+        child_args = ResNet1dXVector.filter_finetune_args(**kwargs["xvector"])
+        base_args["xvector"] = child_args
+        return base_args
+
+    @staticmethod
+    def add_finetune_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats")
+        ResNet1dXVector.add_finetune_args(parser, prefix="xvector")
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py
new file mode 100644
index 00000000..9a939346
--- /dev/null
+++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py
@@ -0,0 +1,474 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import contextlib
+import logging
+
+import torch
+import torch.nn as nn
+from jsonargparse import ActionParser, ArgumentParser
+
+from ...narchs import FeatFuserMVN
+from ...torch_model import TorchModel
+from ...utils import remove_silence
+
+
+class HFWav2XVector(TorchModel):
+    """Abstract base class for x-vector models that use a Hugging Face model as feature extractor.
+
+    Attributes:
+      hf_feats: hugging face model wrapper object.
+      feat_fuser: Dictionary to build feature fuser object.
+      xvector: x-vector model object.
+      feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to
+          the wav2vec "num_layers".
+      feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more
+          than one layer is used (deprecated).
+    """
+
+    def __init__(
+        self,
+        hf_feats,
+        feat_fuser,
+        xvector,
+        feat_fusion_start=0,
+    ):
+        super().__init__()
+        self.hf_feats = hf_feats
+        self.xvector = xvector
+        self.feat_fusion_start = feat_fusion_start
+        self._hf_context = contextlib.nullcontext()
+        self._make_fuser(feat_fuser)
+
+    def _make_fuser(self, feat_fuser):
+        num_feats = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start
+        feat_dim = self.hf_feats.hidden_size
+        feat_fuser["feat_fuser"]["num_feats"] = num_feats
+        feat_fuser["feat_fuser"]["feat_dim"] = feat_dim
+        self.feat_fuser = FeatFuserMVN(**feat_fuser)
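As a concrete illustration of `_make_fuser` above, a hedged sketch with hypothetical sizes (only the `feat_fuser` config key is taken from the patch; the numbers are assumptions for a wav2vec2-base-like model):
```
# Suppose the HF wrapper reports num_encoder_layers=12 and hidden_size=768.
# With feat_fusion_start=0, the fuser stacks the CNN front-end output plus
# all 12 transformer layers:
feat_fuser = {"feat_fuser": {}}
feat_fuser["feat_fuser"]["num_feats"] = 12 + 1 - 0   # -> 13 hidden states to fuse
feat_fuser["feat_fuser"]["feat_dim"] = 768           # dimension of each hidden state
# FeatFuserMVN(**feat_fuser) then learns how to combine the 13 layers.
```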
+
+    @property
+    def sample_frequency(self):
+        return self.hf_feats.sample_frequency
+
+    def compute_prototype_affinity(self):
+        return self.xvector.compute_prototype_affinity()
+
+    def update_loss_margin(self, epoch):
+        """Updates the value of the margin in AAM/AM-softmax losses
+        given the epoch number
+
+        Args:
+          epoch: epoch which is about to start
+        """
+        self.xvector.update_loss_margin(epoch)
+
+    def rebuild_output_layer(
+        self,
+        num_classes=None,
+        loss_type="arc-softmax",
+        cos_scale=64,
+        margin=0.3,
+        margin_warmup_epochs=10,
+        intertop_k=5,
+        intertop_margin=0.0,
+        num_subcenters=2,
+    ):
+        self.xvector.rebuild_output_layer(
+            num_classes=num_classes,
+            loss_type=loss_type,
+            cos_scale=cos_scale,
+            margin=margin,
+            margin_warmup_epochs=margin_warmup_epochs,
+            intertop_k=intertop_k,
+            intertop_margin=intertop_margin,
+            num_subcenters=num_subcenters,
+        )
+
+    def forward_feats(
+        self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False
+    ):
+        return_hid_states = (
+            False
+            if return_feat_layers is None and self.feat_fuser.fuser_type == "last"
+            else True
+        )
+        with self._hf_context:
+            hf_output = self.hf_feats(
+                x,
+                x_lengths,
+                return_hid_states=return_hid_states,
+                chunk_length=chunk_length,
+                detach_chunks=detach_chunks,
+            )
+        feat_lengths = hf_output["hidden_states_lengths"]
+        if return_hid_states:
+            hid_feats = hf_output["hidden_states"]
+            hid_feats = hid_feats[self.feat_fusion_start :]
+        else:
+            hid_feats = [hf_output["last_hidden_state"]]
+
+        feats, feat_lengths = self.feat_fuser(hid_feats, feat_lengths)
+        feats = feats.transpose(1, 2)
+        if return_feat_layers is not None:
+            # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time)
+            # as the hidden features of the x-vector encoder.
+            hid_feats = [
+                f.transpose(1, 2)
+                for i, f in enumerate(hid_feats)
+                if i in return_feat_layers
+            ]
+        else:
+            hid_feats = None
+
+        return feats, hid_feats, feat_lengths
+
+    def forward(
+        self,
+        x,
+        x_lengths=None,
+        y=None,
+        return_feat_layers=None,
+        return_enc_layers=None,
+        return_classif_layers=None,
+        return_logits=True,
+    ):
+        """Forward function. Returns the class logits/posteriors.
+        It can also return the hidden representations of the wav2vec feature
+        extractor, the x-vector encoder and the classification head. In that
+        case the output variable is a dictionary.
+
+        Args:
+          x: input waveform tensor with shape=(batch, time)
+          x_lengths: time lengths of the waveforms with shape=(batch,)
+          y: target classes torch.long tensor with shape=(batch,)
+          return_feat_layers: list of integers indicating which wav2vec layers
+            we should return. If None, no wav2vec layers are returned.
+          return_enc_layers: list of integers indicating which encoder layers
+            we should return. If None, no encoder layers are returned.
+          return_classif_layers: list of integers indicating which classification head layers
+            we should return. If None, no head layers are returned.
+          return_logits: if True, it adds the logits to the output dictionary.
+        Returns:
+          Tensor with class logits with shape=(batch, num_classes) or
+          Dictionary with "logits", "h_enc" (list of hidden encoder layers),
+          "h_classif" (list of hidden classification head layers), "h_feats" (wav2vec features)
+        """
+        feats, hid_feats, feat_lengths = self.forward_feats(
+            x, x_lengths, return_feat_layers
+        )
+        output = self.xvector(
+            feats,
+            feat_lengths,
+            y,
+            return_enc_layers=return_enc_layers,
+            return_classif_layers=return_classif_layers,
+            return_logits=return_logits,
+        )
+
+        if not return_feat_layers:
+            return output
+
+        if not isinstance(output, dict):
+            # if the xvector just returned the logits, we pack them into a
+            # dictionary to append the hidden feats later.
+ output["logits"] = output + + output["h_feats"] = hid_feats + return output + + def extract_embed( + self, + x, + x_lengths=None, + vad_samples=None, + hf_chunk_length=0, + xvec_chunk_length=0, + embed_layer=None, + detach_chunks=False, + ): + if vad_samples is not None: + x, x_lengths = remove_silence(x, vad_samples, x_lengths) + + feats, _, feat_lengths = self.forward_feats( + x, x_lengths, chunk_length=hf_chunk_length, detach_chunks=detach_chunks + ) + xvec_chunk_length = int( + xvec_chunk_length + * self.hf_feats.sample_frequency + * feats.size(-1) + // x.size(-1) + ) + return self.xvector.extract_embed( + feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks + ) + + def freeze_feat_fuser(self): + self.feat_fuser.freeze() + # if self.feat_fuser is None: + # return + + # if self.feat_fusion_method == "weighted-avg": + # self.feat_fuser.requires_grad = False + # return + + # for param in self.feat_fuser.parameters(): + # param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def freeze_hf_except_lora(self, bias=None): + self.hf_feats.freeze_except_lora(bias) + + def has_param_groups(self): + return self.hf_feats.has_param_groups() or self.xvector.has_param_groups() + + def trainable_param_groups(self): + if not self.has_param_groups(): + return [{"params": self.trainable_parameters()}] + + param_groups = self.hf_feats.trainable_param_groups() + param_groups.append({"params": self.feat_fuser.trainable_parameters()}) + param_groups.extend(self.xvector.trainable_param_groups()) + return param_groups + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode == "ft-embed-affine": + self.unfreeze() + self.freeze_feat_fuser() + self.freeze_hf_feats() + self.xvector.freeze_preembed_layers() + elif mode in ["ft-xvector", "ft-xvector-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_feat_fuser() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + elif mode == "hf-lora": + self.unfreeze() + self.freeze_hf_except_lora() + elif mode == "hf-all-bias-lora": + self.unfreeze() + self.freeze_hf_except_lora(bias="all") + elif mode == "hf-lora-with-bias": + self.unfreeze() + self.freeze_hf_except_lora(bias="lora_only") + else: + raise ValueError(f"invalid train_mode={mode}") + + if self.xvector.head_type == "dino": + self.xvector.classif_net.freeze_output_g() + + logging.info("train mode set to %s", mode) + + if "nograd" in mode or mode == "ft-embed-affine": + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext() + + self._train_mode = mode + + def _train(self, train_mode: str): + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode == "ft-embed-affine": + self.hf_feats.train() + self.feat_fuser.train() + self.xvector._train("ft-embed_affine") + elif train_mode in [ + "ft-xvector", + "hf-feats-frozen", + "ft-xvector-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + "hf-lora", + "hf-all-bias-lora", + "hf-lora-with-bias", + ]: + self.hf_feats.train() + self.feat_fuser.train() + self.xvector._train("full") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + 
+    @staticmethod
+    def valid_train_modes():
+        return [
+            "full",
+            "frozen",
+            "ft-embed-affine",
+            "ft-xvector",
+            "hf-feats-frozen",
+            "ft-xvector-nograd",
+            "hf-feats-frozen-nograd",
+            "hf-feat-extractor-frozen",
+            "hf-lora",
+            "hf-all-bias-lora",
+            "hf-lora-with-bias",
+        ]
+
+    @staticmethod
+    def filter_args(**kwargs):
+        valid_args = (
+            "hf_feats",
+            "feat_fuser",
+            "xvector",
+            "feat_fusion_start",
+        )
+        args = dict((k, kwargs[k]) for k in valid_args if k in kwargs)
+        return args
+
+    def get_config(self):
+        hf_cfg = self.hf_feats.get_config()
+        fuser_cfg = self.feat_fuser.get_config()
+        xvec_cfg = self.xvector.get_config()
+        del hf_cfg["class_name"]
+        del fuser_cfg["class_name"]
+        del xvec_cfg["class_name"]
+        config = {
+            "hf_feats": hf_cfg,
+            "feat_fuser": fuser_cfg,
+            "xvector": xvec_cfg,
+            "feat_fusion_start": self.feat_fusion_start,
+        }
+
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def change_config(self, hf_feats, xvector):
+        logging.info("changing hf wav2xvector config")
+        self.hf_feats.change_config(**hf_feats)
+        self.xvector.change_config(**xvector)
+
+    @staticmethod
+    def add_class_args(parser, prefix=None, skip=set()):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        FeatFuserMVN.add_class_args(parser, prefix="feat_fuser")
+
+        parser.add_argument(
+            "--feat-fusion-start",
+            default=0,
+            type=int,
+            help=(
+                "the input to x-vector model will fuse the wav2vec layers from feat_fusion_start to "
+                "the wav2vec num_layers"
+            ),
+        )
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
diff --git a/hyperion/torch/models/wav2xvectors/hf_wavlm2conformer_v1_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wavlm2conformer_v1_xvector.py
new file mode 100644
index 00000000..bcf82bba
--- /dev/null
+++ b/hyperion/torch/models/wav2xvectors/hf_wavlm2conformer_v1_xvector.py
@@ -0,0 +1,94 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+from typing import Dict, Optional, Union
+
+import torch
+import torch.nn as nn
+from jsonargparse import ActionParser, ArgumentParser
+
+from ...narchs import FeatFuserMVN
+from ...tpm import HFWavLM
+from ..xvectors import ConformerV1XVector
+from .hf_wav2xvector import HFWav2XVector
+
+
+class HFWavLM2ConformerV1XVector(HFWav2XVector):
+    """Class extracting WavLM + ConformerV1 x-vectors from waveform.
+
+    Attributes:
+      hf_feats: HFWavLM configuration dictionary or object.
+          This is a wrapper over the Hugging Face WavLM model.
+      feat_fuser: FeatFuserMVN configuration dictionary or object.
+      xvector: ConformerV1XVector configuration dictionary or object.
+      feat_fusion_start: the input to x-vector model will fuse the WavLM layers from "feat_fusion_start" to
+          the WavLM "num_layers".
+ """ + + def __init__( + self, + hf_feats: Union[Dict, HFWavLM], + feat_fuser: Union[Dict, FeatFuserMVN], + xvector: Union[Dict, ConformerV1XVector], + feat_fusion_start: int = 0, + ): + if isinstance(hf_feats, dict): + hf_feats = HFWavLM(**hf_feats) + else: + assert isinstance(hf_feats, HFWavLM) + + if isinstance(xvector, dict): + xvector["encoder"]["in_feats"] = hf_feats.hidden_size + xvector = ConformerV1XVector(**xvector) + else: + assert isinstance(xvector, ConformerV1XVector) + assert xvector.encoder_net.in_feats == hf_feats.hidden_size + + super().__init__(hf_feats, feat_fuser, xvector, feat_fusion_start) + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2XVector.filter_args(**kwargs) + child_args = HFWavLM.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ConformerV1XVector.filter_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWavLM.add_class_args(parser, prefix="hf_feats") + ConformerV1XVector.add_class_args(parser, prefix="xvector") + HFWav2XVector.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWavLM.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ConformerV1XVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWavLM.add_finetune_args(parser, prefix="hf_feats") + ConformerV1XVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py new file mode 100644 index 00000000..30ace453 --- /dev/null +++ b/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py @@ -0,0 +1,94 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...narchs import FeatFuserMVN +from ...tpm import HFWavLM +from ..xvectors import ResNet1dXVector +from .hf_wav2xvector import HFWav2XVector + + +class HFWavLM2ResNet1dXVector(HFWav2XVector): + """Class extracting WavLM + ResNet1d x-vectors from waveform. + + Attributes: + Attributes: + hf_feats: HFWavLM configuration dictionary or object. + This is a warpper over Hugging Face WavLM model. + xvector: ResNet1dXVector configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the WavLM layers from "feat_fusion_start" to + the WavLM "num_layers". + feat_fusion_method: method to fuse the hidden layers from the WavLM model, when more + than one layer is used. 
+ """ + + def __init__( + self, + hf_feats: Union[Dict, HFWavLM], + feat_fuser: Union[Dict, FeatFuserMVN], + xvector: Union[Dict, ResNet1dXVector], + feat_fusion_start: int = 0, + ): + if isinstance(hf_feats, dict): + hf_feats = HFWavLM(**hf_feats) + else: + assert isinstance(hf_feats, HFWavLM) + + if isinstance(xvector, dict): + xvector["resnet_enc"]["in_feats"] = hf_feats.hidden_size + xvector = ResNet1dXVector(**xvector) + else: + assert isinstance(xvector, ResNet1dXVector) + assert xvector.encoder_net.in_feats == hf_feats.hidden_size + + super().__init__(hf_feats, feat_fuser, xvector, feat_fusion_start) + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2XVector.filter_args(**kwargs) + child_args = HFWavLM.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ResNet1dXVector.filter_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWavLM.add_class_args(parser, prefix="hf_feats") + ResNet1dXVector.add_class_args(parser, prefix="xvector") + HFWav2XVector.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWavLM.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ResNet1dXVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWavLM.add_finetune_args(parser, prefix="hf_feats") + ResNet1dXVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2conformer_v1_xvector.py b/hyperion/torch/models/wav2xvectors/wav2conformer_v1_xvector.py new file mode 100644 index 00000000..3f6acf02 --- /dev/null +++ b/hyperion/torch/models/wav2xvectors/wav2conformer_v1_xvector.py @@ -0,0 +1,88 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ..xvectors import ConformerV1XVector +from .wav2xvector import Wav2XVector + + +class Wav2ConformerV1XVector(Wav2XVector): + """Class extracting ConformerV1 x-vectors from waveform. + It contains acoustic feature extraction, feature normalization and + ConformerV1XVector extractor. + + Attributes: + Attributes: + feats: feature extractor object of class AudioFeatsMVN or dictionary of options to instantiate AudioFeatsMVN object. + xvector: ConformerV1XVector configuration dictionary or object. + """ + + def __init__(self, feats, xvector): + if isinstance(xvector, dict): + xvector = ConformerV1XVector.filter_args(**xvector) + xvector = ConformerV1XVector(**xvector) + else: + assert isinstance(xvector, ConformerV1XVector) + + super().__init__(feats, xvector) + + @staticmethod + def add_class_args(parser, prefix=None): + """Adds Wav2ConformerV1XVector options to parser. + + Args: + parser: Arguments parser + prefix: Options prefix. 
+ """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + Wav2XVector.add_class_args(parser) + ConformerV1XVector.add_class_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = ConformerV1XVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ConformerV1XVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + base_args = {} + child_args = ConformerV1XVector.filter_dino_teacher_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ConformerV1XVector.add_dino_teacher_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py new file mode 100644 index 00000000..5a8b14b8 --- /dev/null +++ b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py @@ -0,0 +1,87 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ..xvectors import ResNet1dXVector +from .wav2xvector import Wav2XVector + + +class Wav2ResNet1dXVector(Wav2XVector): + """Class extracting ResNet1d x-vectors from waveform. + It contains acoustic feature extraction, feature normalization and + ResNet1dXVector extractor. + + Attributes: + feats: feature extractor object of class AudioFeatsMVN or dictionary of options to instantiate AudioFeatsMVN object. + xvector: ResNet1dXVector configuration dictionary or object. + """ + + def __init__(self, feats, xvector): + if isinstance(xvector, dict): + xvector = ResNet1dXVector.filter_args(**xvector) + xvector = ResNet1dXVector(**xvector) + else: + assert isinstance(xvector, ResNet1dXVector) + + super().__init__(feats, xvector) + + @staticmethod + def add_class_args(parser, prefix=None): + """Adds Wav2ResNet1dXVector options to parser. + + Args: + parser: Arguments parser + prefix: Options prefix. 
+ """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + Wav2XVector.add_class_args(parser) + ResNet1dXVector.add_class_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = ResNet1dXVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNet1dXVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + base_args = {} + child_args = ResNet1dXVector.filter_dino_teacher_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNet1dXVector.add_dino_teacher_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py new file mode 100644 index 00000000..642c282d --- /dev/null +++ b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py @@ -0,0 +1,88 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ..xvectors import ResNetXVector +from .wav2xvector import Wav2XVector + + +class Wav2ResNetXVector(Wav2XVector): + """Class extracting ResNet x-vectors from waveform. + It contains acoustic feature extraction, feature normalization and + ResNetXVector extractor. + + Attributes: + Attributes: + feats: feature extractor object of class AudioFeatsMVN or dictionary of options to instantiate AudioFeatsMVN object. + xvector: ResNetXVector configuration dictionary or object. + """ + + def __init__(self, feats, xvector): + if isinstance(xvector, dict): + xvector = ResNetXVector.filter_args(**xvector) + xvector = ResNetXVector(**xvector) + else: + assert isinstance(xvector, ResNetXVector) + + super().__init__(feats, xvector) + + @staticmethod + def add_class_args(parser, prefix=None): + """Adds Wav2ResNet1dXVector options to parser. + + Args: + parser: Arguments parser + prefix: Options prefix. 
+ """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + Wav2XVector.add_class_args(parser) + ResNetXVector.add_class_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = ResNetXVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNetXVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + base_args = {} + child_args = ResNetXVector.filter_dino_teacher_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNetXVector.add_dino_teacher_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2xvector.py b/hyperion/torch/models/wav2xvectors/wav2xvector.py new file mode 100644 index 00000000..69e7b3ca --- /dev/null +++ b/hyperion/torch/models/wav2xvectors/wav2xvector.py @@ -0,0 +1,237 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import contextlib +import logging + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...narchs import AudioFeatsMVN +from ...torch_model import TorchModel +from ...utils import remove_silence + + +class Wav2XVector(TorchModel): + """Base class for models that integrate the acoustic feature extractor and and x-vector model that takes acoustic features as input. + + Attributes: + feats: feature extractor object of class AudioFeatsMVN or dictionary of options to instantiate AudioFeatsMVN object. + xvector: x-vector model object. 
+ """ + + def __init__(self, feats, xvector): + super().__init__() + + if isinstance(feats, dict): + feats = AudioFeatsMVN.filter_args(**feats) + feats["trans"] = True + feats = AudioFeatsMVN(**feats) + else: + assert isinstance(feats, AudioFeatsMVN) + + self.feats = feats + self.xvector = xvector + self._feats_context = contextlib.nullcontext() + + @property + def sample_frequency(self): + return self.feats.sample_frequency + + # def clone(self): + # # weight normalized layers cannot be copied with deepcopy, + # # we remove them to clone and put them back later + # modules, cloned_modules = self.xvector.before_cloning() + # new_self = super().clone() + # self.xvector.after_cloning(*modules) + # new_self.xvector.after_cloning(*cloned_modules) + # return new_self + + def compute_prototype_affinity(self): + return self.xvector.compute_prototype_affinity() + + def update_loss_margin(self, epoch): + """Updates the value of the margin in AAM/AM-softmax losses + given the epoch number + + Args: + epoch: epoch which is about to start + """ + self.xvector.update_loss_margin(epoch) + + def rebuild_output_layer( + self, + num_classes=None, + loss_type="arc-softmax", + cos_scale=64, + margin=0.3, + margin_warmup_epochs=10, + intertop_k=5, + intertop_margin=0.0, + num_subcenters=2, + ): + self.xvector.rebuild_output_layer( + num_classes=num_classes, + loss_type=loss_type, + cos_scale=cos_scale, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, + ) + + def change_config(self, xvector): + logging.info("changing wav2xvector config") + self.xvector.change_config(**xvector) + + def cancel_output_layer_grads(self): + self.xvector.cancel_output_layer_grads() + + def forward( + self, + x, + x_lengths=None, + y=None, + vad_samples=None, + vad_feats=None, + enc_layers=None, + classif_layers=None, + return_output=True, + ): + with self._feats_context: + if vad_samples is not None: + x, x_lengths = remove_silence(x, vad_samples, x_lengths) + + feats, feat_lengths = self.feats(x, x_lengths) + if vad_feats is not None: + feats, feat_lengths = remove_silence(feats, vad_feats, feat_lengths) + + n = torch.sum(~torch.isfinite(feats)) + if n > 0: + print( + "feats", + n, + torch.sum(torch.isnan(feats)), + torch.sum(torch.any(torch.isnan(x), dim=-1)), + x.dtype, + feats.dtype, + flush=True, + ) + return self.xvector( + feats, feat_lengths, y, enc_layers, classif_layers, return_output + ) + + def extract_embed( + self, + x, + x_lengths=None, + vad_samples=None, + vad_feats=None, + chunk_length=0, + embed_layer=None, + detach_chunks=False, + ): + with self._feats_context: + if vad_samples is not None: + x, x_lengths = remove_silence(x, vad_samples, x_lengths) + + feats, feat_lengths = self.feats(x, x_lengths) + if vad_feats is not None: + feats, feat_lengths = remove_silence(feats, vad_feats, feat_lengths) + + chunk_length = int(chunk_length * feats.shape[1] / x.shape[-1]) + + return self.xvector.extract_embed( + feats, feat_lengths, chunk_length, embed_layer, detach_chunks + ) + + def trainable_param_groups(self): + param_groups = self.xvector.trainable_param_groups() + return param_groups + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + logging.info("setting Wav2XVector train mode to %s", mode) + if mode == "full-feats-grad": + self._feats_context = contextlib.nullcontext() + xvector_mode = "full" + else: + logging.info("using torch.no_grad for feats") + self._feats_context = 
torch.no_grad() + xvector_mode = mode + + logging.info( + "setting Wav2XVector XVector object train mode to %s", xvector_mode + ) + self.xvector.set_train_mode(xvector_mode) + self._train_mode = mode + + def _train(self, train_mode: str): + self.feats.train() + if train_mode in ["frozen"]: + super()._train(train_mode) + elif train_mode in ["full-feats-grad", "full"]: + self.xvector._train("full") + elif train_mode == "ft-embed-affine": + self.xvector._train(train_mode) + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "full-feats-grad", + ] + + def get_config(self): + feat_cfg = self.feats.get_config() + xvector_cfg = self.xvector.get_config() + config = { + "feats": feat_cfg, + "xvector": xvector_cfg, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def filter_args(**kwargs): + """Filters Wav2XVector class arguments from arguments dictionary. + + Args: + kwargs: Arguments dictionary. + + Returns: + Dictionary with SpecAugment options. + """ + valid_args = ( + "feats", + "xvector", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + """Adds Wav2XVector options common to all child classes to parser. + + Args: + parser: Arguments parser + prefix: Options prefix. + """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + AudioFeatsMVN.add_class_args(parser, prefix="feats") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/__init__.py b/hyperion/torch/models/xvectors/__init__.py new file mode 100644 index 00000000..92e69a5d --- /dev/null +++ b/hyperion/torch/models/xvectors/__init__.py @@ -0,0 +1,14 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" + +from .conformer_v1_xvector import ConformerV1XVector +from .efficient_net_xvector import EfficientNetXVector +from .resnet1d_xvector import ResNet1dXVector +from .resnet_xvector import ResNetXVector +from .spinenet_xvector import SpineNetXVector +from .tdnn_xvector import TDNNXVector +from .transformer_xvector_v1 import TransformerXVectorV1 +from .xvector import XVector diff --git a/hyperion/torch/models/xvectors/conformer_v1_xvector.py b/hyperion/torch/models/xvectors/conformer_v1_xvector.py new file mode 100644 index 00000000..896cad77 --- /dev/null +++ b/hyperion/torch/models/xvectors/conformer_v1_xvector.py @@ -0,0 +1,207 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...narchs import ConformerEncoderV1 as Encoder +from .xvector import XVector + + +class ConformerV1XVector(XVector): + def __init__( + self, + encoder, + num_classes, + pool_net="mean+stddev", + embed_dim=256, + num_embed_layers=1, + hid_act={"name": "relu", "inplace": True}, + loss_type="arc-softmax", + cos_scale=64, + margin=0.3, + margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, + num_subcenters=2, + dropout_rate=0, + norm_layer=None, + head_norm_layer=None, + use_norm=True, + norm_before=True, + head_use_norm=True, + head_use_in_norm=False, + head_hid_dim=2048, + 
+        head_bottleneck_dim=256,
+        proj_head_use_norm=True,
+        proj_head_norm_before=True,
+        embed_layer=0,
+        proj_feats=None,
+        head_type="x-vector",
+        bias_weight_decay=None,
+    ):
+        if isinstance(encoder, dict):
+            logging.info(f"making conformer encoder network={encoder}")
+            encoder["in_time_dim"] = 2
+            encoder["out_time_dim"] = 2
+            encoder = Encoder(**encoder)
+        else:
+            encoder.in_time_dim = 2
+            encoder.out_time_dim = 2
+
+        super().__init__(
+            encoder,
+            num_classes,
+            pool_net=pool_net,
+            embed_dim=embed_dim,
+            num_embed_layers=num_embed_layers,
+            hid_act=hid_act,
+            loss_type=loss_type,
+            cos_scale=cos_scale,
+            margin=margin,
+            margin_warmup_epochs=margin_warmup_epochs,
+            intertop_k=intertop_k,
+            intertop_margin=intertop_margin,
+            num_subcenters=num_subcenters,
+            norm_layer=norm_layer,
+            head_norm_layer=head_norm_layer,
+            use_norm=use_norm,
+            norm_before=norm_before,
+            head_use_norm=head_use_norm,
+            head_use_in_norm=head_use_in_norm,
+            head_hid_dim=head_hid_dim,
+            head_bottleneck_dim=head_bottleneck_dim,
+            proj_head_use_norm=proj_head_use_norm,
+            proj_head_norm_before=proj_head_norm_before,
+            dropout_rate=dropout_rate,
+            embed_layer=embed_layer,
+            proj_feats=proj_feats,
+            head_type=head_type,
+            bias_weight_decay=bias_weight_decay,
+        )
+
+    def get_config(self):
+        base_config = super().get_config()
+        del base_config["encoder_cfg"]
+        del base_config["in_feats"]
+
+        encoder_cfg = self.encoder_net.get_config()
+        del encoder_cfg["class_name"]
+        config = {
+            "encoder": encoder_cfg,
+        }
+
+        config.update(base_config)
+        return config
+
+    def change_config(
+        self,
+        encoder,
+        override_output=False,
+        override_dropouts=False,
+        dropout_rate=0,
+        num_classes=None,
+        loss_type="arc-softmax",
+        cos_scale=64,
+        margin=0.3,
+        margin_warmup_epochs=10,
+        intertop_k=5,
+        intertop_margin=0,
+        num_subcenters=2,
+    ):
+        super().change_config(
+            override_output,
+            False,
+            dropout_rate,
+            num_classes,
+            loss_type,
+            cos_scale,
+            margin,
+            margin_warmup_epochs,
+            intertop_k,
+            intertop_margin,
+            num_subcenters,
+        )
+        if override_dropouts:
+            logging.info("changing x-vector head dropouts")
+            self.classif_net.change_dropouts(dropout_rate)
+
+        self.encoder_net.change_config(**encoder)
+
+    @classmethod
+    def load(cls, file_path=None, cfg=None, state_dict=None):
+        cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict)
+        try:
+            del cfg["in_feats"]
+        except KeyError:
+            pass
+
+        model = cls(**cfg)
+        if state_dict is not None:
+            model.load_state_dict(state_dict)
+
+        return model
+
+    @staticmethod
+    def filter_args(**kwargs):
+        base_args = XVector.filter_args(**kwargs)
+        child_args = Encoder.filter_args(**kwargs["encoder"])
+
+        base_args["encoder"] = child_args
+        return base_args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        XVector.add_class_args(parser, skip=set(["in_feats"]))
+        Encoder.add_class_args(parser, prefix="encoder", skip=set())
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
+
+    add_argparse_args = add_class_args
+
+    @staticmethod
+    def filter_finetune_args(**kwargs):
+        base_args = XVector.filter_finetune_args(**kwargs)
+        child_args = Encoder.filter_finetune_args(**kwargs["encoder"])
+        base_args["encoder"] = child_args
+        return base_args
+
+    @staticmethod
+    def add_finetune_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        XVector.add_finetune_args(parser)
Encoder.add_finetune_args(parser, prefix="encoder", skip=set()) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + base_args = XVector.filter_dino_teacher_args(**kwargs) + child_args = Encoder.filter_finetune_args(**kwargs["encoder"]) + base_args["encoder"] = child_args + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_dino_teacher_args(parser) + Encoder.add_finetune_args(parser, prefix="encoder", skip=set()) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/efficient_net_xvector.py b/hyperion/torch/models/xvectors/efficient_net_xvector.py index 606a9554..923be8eb 100644 --- a/hyperion/torch/models/xvectors/efficient_net_xvector.py +++ b/hyperion/torch/models/xvectors/efficient_net_xvector.py @@ -4,13 +4,13 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser -from .xvector import XVector from ...narchs import EfficientNet as EN +from .xvector import XVector class EfficientNetXVector(XVector): @@ -42,6 +42,8 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, drop_connect_rate=0.2, dropout_rate=0, @@ -49,11 +51,18 @@ def __init__( head_norm_layer=None, use_norm=True, norm_before=True, + head_use_norm=True, + head_use_in_norm=False, + head_hid_dim=2048, + head_bottleneck_dim=256, + proj_head_use_norm=True, + proj_head_norm_before=True, embed_layer=0, proj_feats=None, + head_type="x-vector", + bias_weight_decay=None, ): - - logging.info("making %s encoder network" % (effnet_type)) + logging.info("making %s encoder network", effnet_type) encoder_net = EN( effnet_type, in_channels, @@ -88,15 +97,25 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, num_subcenters=num_subcenters, norm_layer=norm_layer, head_norm_layer=head_norm_layer, use_norm=use_norm, norm_before=norm_before, + head_use_norm=head_use_norm, + head_use_in_norm=head_use_in_norm, + head_hid_dim=head_hid_dim, + head_bottleneck_dim=head_bottleneck_dim, + proj_head_use_norm=proj_head_use_norm, + proj_head_norm_before=proj_head_norm_before, dropout_rate=dropout_rate, embed_layer=embed_layer, in_feats=in_feats, proj_feats=proj_feats, + head_type=head_type, + bias_weight_decay=bias_weight_decay, ) @property @@ -172,7 +191,6 @@ def time_se(self): return self.encoder_net.time_se def get_config(self): - base_config = super().get_config() del base_config["encoder_cfg"] @@ -200,9 +218,24 @@ def get_config(self): config.update(base_config) return config + def change_config( + self, + override_output=False, + override_dropouts=False, + dropout_rate=0, + drop_connect_rate=0, + **kwargs + ): + xvec_args = XVector.filter_finetune_args(**kwargs) + xvec_args["override_dropouts"] = False + super().change_config(**xvec_args) + + if override_dropouts: + self.encoder_net.change_dropouts(dropout_rate, drop_connect_rate) + self.classif_net.change_dropouts(dropout_rate) + @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) model = 
cls(**cfg) @@ -211,8 +244,8 @@ def load(cls, file_path=None, cfg=None, state_dict=None): return model + @staticmethod def filter_args(**kwargs): - base_args = XVector.filter_args(**kwargs) child_args = EN.filter_args(**kwargs) @@ -232,6 +265,45 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='xvector options') add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = XVector.filter_finetune_args(**kwargs) + child_args = EN.filter_finetune_args(**kwargs) + + base_args.update(child_args) + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + EN.add_finetune_args(parser) + XVector.add_finetune_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + base_args = XVector.filter_dino_teacher_args(**kwargs) + child_args = EN.filter_finetune_args(**kwargs) + + base_args.update(child_args) + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + EN.add_finetune_args(parser) + XVector.add_dino_teacher_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/resnet1d_xvector.py b/hyperion/torch/models/xvectors/resnet1d_xvector.py index 8db9a073..d305bb6a 100644 --- a/hyperion/torch/models/xvectors/resnet1d_xvector.py +++ b/hyperion/torch/models/xvectors/resnet1d_xvector.py @@ -4,13 +4,13 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser -from .xvector import XVector from ...narchs import ResNet1dEncoder as Encoder +from .xvector import XVector class ResNet1dXVector(XVector): @@ -26,17 +26,25 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, dropout_rate=0, norm_layer=None, head_norm_layer=None, use_norm=True, norm_before=True, - in_norm=False, + head_use_norm=True, + head_use_in_norm=False, + head_hid_dim=2048, + head_bottleneck_dim=256, + proj_head_use_norm=True, + proj_head_norm_before=True, embed_layer=0, proj_feats=None, + head_type="x-vector", + bias_weight_decay=None, ): - if isinstance(resnet_enc, dict): logging.info("making %s resnet1d encoder network", resnet_enc["resb_type"]) resnet_enc = Encoder(**resnet_enc) @@ -52,70 +60,27 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, num_subcenters=num_subcenters, norm_layer=norm_layer, head_norm_layer=head_norm_layer, use_norm=use_norm, norm_before=norm_before, + head_use_norm=head_use_norm, + head_use_in_norm=head_use_in_norm, + head_hid_dim=head_hid_dim, + head_bottleneck_dim=head_bottleneck_dim, + proj_head_use_norm=proj_head_use_norm, + proj_head_norm_before=proj_head_norm_before, dropout_rate=dropout_rate, embed_layer=embed_layer, proj_feats=proj_feats, + head_type=head_type, + bias_weight_decay=bias_weight_decay, ) - # @property - # def in_channels(self): - # return self.encoder_net.in_channels - - # @property - # def conv_channels(self): - # return 
self.encoder_net.conv_channels - - # @property - # def base_channels(self): - # return self.encoder_net.base_channels - - # @property - # def in_kernel_size(self): - # return self.encoder_net.in_kernel_size - - # @property - # def in_stride(self): - # return self.encoder_net.in_stride - - # @property - # def zero_init_residual(self): - # return self.encoder_net.zero_init_residual - - # @property - # def groups(self): - # return self.encoder_net.groups - - # @property - # def replace_stride_with_dilation(self): - # return self.encoder_net.replace_stride_with_dilation - - # @property - # def do_maxpool(self): - # return self.encoder_net.do_maxpool - - # @property - # def in_norm(self): - # return self.encoder_net.in_norm - - # @property - # def se_r(self): - # return self.encoder_net.se_r - - # @property - # def res2net_scale(self): - # return self.encoder_net.res2net_scale - - # @property - # def res2net_width_factor(self): - # return self.encoder_net.res2net_width_factor - def get_config(self): - base_config = super().get_config() del base_config["encoder_cfg"] del base_config["in_feats"] @@ -129,24 +94,56 @@ def get_config(self): config.update(base_config) return config + def change_config( + self, + resnet_enc, + override_output=False, + override_dropouts=False, + dropout_rate=0, + num_classes=None, + loss_type="arc-softmax", + cos_scale=64, + margin=0.3, + margin_warmup_epochs=10, + intertop_k=5, + intertop_margin=0, + num_subcenters=2, + ): + super().change_config( + override_output, + False, + dropout_rate, + num_classes, + loss_type, + cos_scale, + margin, + margin_warmup_epochs, + intertop_k, + intertop_margin, + num_subcenters, + ) + if override_dropouts: + logging.info("changing x-vector head dropouts") + self.classif_net.change_dropouts(dropout_rate) + + self.encoder_net.change_config(**resnet_enc) + @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) - try: del cfg["in_feats"] except: pass - print(cfg, flush=True) + model = cls(**cfg) if state_dict is not None: model.load_state_dict(state_dict) return model + @staticmethod def filter_args(**kwargs): - base_args = XVector.filter_args(**kwargs) child_args = Encoder.filter_args(**kwargs["resnet_enc"]) @@ -161,11 +158,49 @@ def add_class_args(parser, prefix=None): XVector.add_class_args(parser, skip=set(["in_feats"])) Encoder.add_class_args(parser, prefix="resnet_enc", skip=set(["head_channels"])) - # parser.link_arguments("in_feats", "resnet_enc.in_feats", apply_on="parse") - # parser.link_arguments("norm_layer", "encoder_net.norm_layer", apply_on="parse") - if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='xvector options') add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = XVector.filter_finetune_args(**kwargs) + child_args = Encoder.filter_finetune_args(**kwargs["resnet_enc"]) + base_args["resnet_enc"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_finetune_args(parser) + Encoder.add_finetune_args( + parser, prefix="resnet_enc", skip=set(["head_channels"]) + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + base_args = XVector.filter_dino_teacher_args(**kwargs) +
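# the DINO teacher reuses the encoder's fine-tuning argument filters +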
child_args = Encoder.filter_finetune_args(**kwargs["resnet_enc"]) + base_args["resnet_enc"] = child_args + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_dino_teacher_args(parser) + Encoder.add_finetune_args( + parser, prefix="resnet_enc", skip=set(["head_channels"]) + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/resnet_xvector.py b/hyperion/torch/models/xvectors/resnet_xvector.py index 58a34c94..a639bdb8 100644 --- a/hyperion/torch/models/xvectors/resnet_xvector.py +++ b/hyperion/torch/models/xvectors/resnet_xvector.py @@ -4,13 +4,13 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser -from .xvector import XVector from ...narchs import ResNetFactory as RNF +from .xvector import XVector class ResNetXVector(XVector): @@ -36,6 +36,8 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, dropout_rate=0, norm_layer=None, @@ -43,13 +45,21 @@ def __init__( use_norm=True, norm_before=True, in_norm=False, + head_use_norm=True, + head_use_in_norm=False, + head_hid_dim=2048, + head_bottleneck_dim=256, + proj_head_use_norm=True, + proj_head_norm_before=True, embed_layer=0, proj_feats=None, + head_type="x-vector", se_r=16, res2net_scale=4, res2net_width_factor=1, + freq_pos_enc=False, + bias_weight_decay=None, ): - logging.info("making %s encoder network", resnet_type) encoder_net = RNF.create( resnet_type, @@ -71,6 +81,7 @@ def __init__( in_feats=in_feats, res2net_scale=res2net_scale, res2net_width_factor=res2net_width_factor, + freq_pos_enc=freq_pos_enc, ) super().__init__( @@ -84,15 +95,25 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, num_subcenters=num_subcenters, norm_layer=norm_layer, head_norm_layer=head_norm_layer, use_norm=use_norm, norm_before=norm_before, + head_use_norm=head_use_norm, + head_use_in_norm=head_use_in_norm, + head_hid_dim=head_hid_dim, + head_bottleneck_dim=head_bottleneck_dim, + proj_head_use_norm=proj_head_use_norm, + proj_head_norm_before=proj_head_norm_before, dropout_rate=dropout_rate, embed_layer=embed_layer, in_feats=in_feats, proj_feats=proj_feats, + head_type=head_type, + bias_weight_decay=bias_weight_decay, ) self.resnet_type = resnet_type @@ -149,13 +170,13 @@ def res2net_scale(self): def res2net_width_factor(self): return self.encoder_net.res2net_width_factor - def get_config(self): + @property + def freq_pos_enc(self): + return self.encoder_net.freq_pos_enc + def get_config(self): base_config = super().get_config() del base_config["encoder_cfg"] - - pool_cfg = self.pool_net.get_config() - config = { "resnet_type": self.resnet_type, "in_channels": self.in_channels, @@ -171,6 +192,7 @@ def get_config(self): "se_r": self.se_r, "res2net_scale": self.res2net_scale, "res2net_width_factor": self.res2net_width_factor, + "freq_pos_enc": self.freq_pos_enc, } config.update(base_config) @@ -178,7 +200,6 @@ def get_config(self): @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) model = cls(**cfg) @@ -187,8 +208,8 @@ def load(cls, file_path=None, cfg=None, 
state_dict=None): return model + @staticmethod def filter_args(**kwargs): - base_args = XVector.filter_args(**kwargs) child_args = RNF.filter_args(**kwargs) @@ -206,6 +227,45 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='xvector options') add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = XVector.filter_finetune_args(**kwargs) + child_args = RNF.filter_finetune_args(**kwargs) + + base_args.update(child_args) + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_finetune_args(parser) + RNF.add_finetune_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + base_args = XVector.filter_dino_teacher_args(**kwargs) + child_args = RNF.filter_finetune_args(**kwargs) + + base_args.update(child_args) + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_dino_teacher_args(parser) + RNF.add_finetune_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/spinenet_xvector.py b/hyperion/torch/models/xvectors/spinenet_xvector.py index 1b7401a4..bf829b64 100644 --- a/hyperion/torch/models/xvectors/spinenet_xvector.py +++ b/hyperion/torch/models/xvectors/spinenet_xvector.py @@ -3,14 +3,14 @@ Copyright 2020 Magdalena Rybicka Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - import logging import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser -from .xvector import XVector from ...narchs import SpineNetFactory as SNF +from .xvector import XVector class SpineNetXVector(XVector): @@ -40,6 +40,8 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, dropout_rate=0, norm_layer=None, @@ -47,13 +49,20 @@ def __init__( use_norm=True, norm_before=True, in_norm=False, + head_use_norm=True, + head_use_in_norm=False, + head_hid_dim=2048, + head_bottleneck_dim=256, + proj_head_use_norm=True, + proj_head_norm_before=True, embed_layer=0, proj_feats=None, + head_type="x-vector", se_r=16, res2net_scale=4, res2net_width_factor=1, + bias_weight_decay=None, ): - logging.info("making %s encoder network", spinenet_type) encoder_net = SNF.create( spinenet_type, @@ -92,15 +101,25 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, num_subcenters=num_subcenters, norm_layer=norm_layer, head_norm_layer=head_norm_layer, use_norm=use_norm, norm_before=norm_before, + head_use_norm=head_use_norm, + head_use_in_norm=head_use_in_norm, + head_hid_dim=head_hid_dim, + head_bottleneck_dim=head_bottleneck_dim, + proj_head_use_norm=proj_head_use_norm, + proj_head_norm_before=proj_head_norm_before, dropout_rate=dropout_rate, embed_layer=embed_layer, in_feats=in_feats, proj_feats=proj_feats, + head_type=head_type, + bias_weight_decay=bias_weight_decay, ) self.spinenet_type = spinenet_type @@ -174,7 +193,6 @@ def res2net_width_factor(self): return self.encoder_net.res2net_width_factor def 
get_config(self): - base_config = super().get_config() del base_config["encoder_cfg"] @@ -206,7 +224,6 @@ def get_config(self): @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) model = cls(**cfg) @@ -215,8 +232,8 @@ def load(cls, file_path=None, cfg=None, state_dict=None): return model + @staticmethod def filter_args(**kwargs): - base_args = XVector.filter_args(**kwargs) child_args = SNF.filter_args(**kwargs) @@ -236,3 +253,43 @@ def add_class_args(parser, prefix=None): outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = XVector.filter_finetune_args(**kwargs) + child_args = SNF.filter_finetune_args(**kwargs) + + base_args.update(child_args) + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_finetune_args(parser) + SNF.add_finetune_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + base_args = XVector.filter_dino_teacher_args(**kwargs) + child_args = SNF.filter_finetune_args(**kwargs) + + base_args.update(child_args) + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_dino_teacher_args(parser) + SNF.add_finetune_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/tdnn_xvector.py b/hyperion/torch/models/xvectors/tdnn_xvector.py index a0211f87..19c075b6 100644 --- a/hyperion/torch/models/xvectors/tdnn_xvector.py +++ b/hyperion/torch/models/xvectors/tdnn_xvector.py @@ -4,13 +4,13 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser -from .xvector import XVector from ...narchs import TDNNFactory as TF +from .xvector import XVector class TDNNXVector(XVector): @@ -33,6 +33,8 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, dropout_rate=0, norm_layer=None, @@ -40,11 +42,18 @@ def __init__( use_norm=True, norm_before=False, in_norm=False, + head_use_norm=True, + head_use_in_norm=False, + head_hid_dim=2048, + head_bottleneck_dim=256, + proj_head_use_norm=True, + proj_head_norm_before=True, embed_layer=0, proj_feats=None, + head_type="x-vector", + bias_weight_decay=None, ): - - logging.info("making %s encoder network" % (tdnn_type)) + logging.info("making %s encoder network", tdnn_type) encoder_net = TF.create( tdnn_type, num_enc_blocks, @@ -73,15 +82,25 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, num_subcenters=num_subcenters, norm_layer=norm_layer, head_norm_layer=head_norm_layer, use_norm=use_norm, norm_before=norm_before, + head_use_norm=head_use_norm, + head_use_in_norm=head_use_in_norm, + head_hid_dim=head_hid_dim, + head_bottleneck_dim=head_bottleneck_dim, + proj_head_use_norm=proj_head_use_norm, + proj_head_norm_before=proj_head_norm_before, dropout_rate=dropout_rate, 
embed_layer=embed_layer, in_feats=None, proj_feats=proj_feats, + head_type=head_type, + bias_weight_decay=bias_weight_decay, ) self.tdnn_type = tdnn_type @@ -118,7 +137,6 @@ def in_norm(self): return self.encoder_net.in_norm def get_config(self): - base_config = super().get_config() del base_config["encoder_cfg"] @@ -149,8 +167,8 @@ def load(cls, file_path=None, cfg=None, state_dict=None): return model + @staticmethod def filter_args(**kwargs): - base_args = XVector.filter_args(**kwargs) child_args = TF.filter_args(**kwargs) @@ -168,6 +186,45 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='xvector options') add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = XVector.filter_finetune_args(**kwargs) + child_args = TF.filter_finetune_args(**kwargs) + + base_args.update(child_args) + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_finetune_args(parser) + TF.add_finetune_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + base_args = XVector.filter_dino_teacher_args(**kwargs) + child_args = TF.filter_finetune_args(**kwargs) + + base_args.update(child_args) + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_dino_teacher_args(parser) + TF.add_finetune_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/transformer_xvector_v1.py b/hyperion/torch/models/xvectors/transformer_xvector_v1.py index 1eaa03b6..00f54af7 100644 --- a/hyperion/torch/models/xvectors/transformer_xvector_v1.py +++ b/hyperion/torch/models/xvectors/transformer_xvector_v1.py @@ -4,13 +4,13 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser -from .xvector import XVector from ...narchs import TransformerEncoderV1 as TE +from .xvector import XVector class TransformerXVectorV1(XVector): @@ -46,7 +46,6 @@ class TransformerXVectorV1(XVector): use_norm: if True use batch/layer norm norm_before: if True, use layer norm before layers, otherwise after - in_norm: add batchnorm at the input embed_layer: which layer to use to extract x-vectors proj_feats: add linear projection layer after the encoder to project feature dimension to proj_feats """ @@ -73,6 +72,8 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, dropout_rate=0.1, pos_dropout_rate=0.1, @@ -81,11 +82,17 @@ def __init__( head_norm_layer=None, use_norm=True, norm_before=False, - in_norm=False, + head_use_norm=True, + head_use_in_norm=False, + head_hid_dim=2048, + head_bottleneck_dim=256, + proj_head_use_norm=True, + proj_head_norm_before=True, embed_layer=0, proj_feats=None, + head_type="x-vector", + bias_weight_decay=None, ): - logging.info("making transformer-v1 encoder network") encoder_net = TE( in_feats, @@ -118,15 +125,25 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + 
intertop_k=intertop_k, + intertop_margin=intertop_margin, num_subcenters=num_subcenters, norm_layer=norm_layer, head_norm_layer=head_norm_layer, use_norm=use_norm, norm_before=norm_before, + head_use_norm=head_use_norm, + head_use_in_norm=head_use_in_norm, + head_hid_dim=head_hid_dim, + head_bottleneck_dim=head_bottleneck_dim, + proj_head_use_norm=proj_head_use_norm, + proj_head_norm_before=proj_head_norm_before, dropout_rate=dropout_rate, embed_layer=embed_layer, in_feats=None, proj_feats=proj_feats, + head_type=head_type, + bias_weight_decay=bias_weight_decay, ) @property @@ -181,10 +198,6 @@ def enc_concat_after(self): def enc_ff_type(self): return self.encoder_net.ff_type - # @property - # def in_norm(self): - # return self.encoder_net.in_norm - def get_config(self): """Gets network config Returns: @@ -210,7 +223,6 @@ def get_config(self): "in_layer_type": self.in_layer_type, "enc_concat_after": self.enc_concat_after, } - #'in_norm': self.in_norm } config.update(base_config) return config @@ -361,3 +373,99 @@ def add_class_args(parser, prefix=None): # help='xvector options') add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + """Filters arguments correspondin to TransformerXVector + from args dictionary + + Args: + kwargs: args dictionary + + Returns: + args dictionary + """ + base_args = XVector.filter_finetune_args(**kwargs) + + valid_args = ( + "pos_dropout_rate", + "att_dropout_rate", + ) + + child_args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + base_args.update(child_args) + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + """Adds TransformerXVector config parameters for finetuning to argparser + + Args: + parser: argparse object + prefix: prefix string to add to the argument names + """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_finetune_args(parser) + parser.add_argument( + "--pos-dropout-rate", + default=0.1, + type=float, + help="positional encoder dropout", + ) + parser.add_argument( + "--att-dropout-rate", default=0, type=float, help="self-att dropout" + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + """Filters arguments correspondin to TransformerXVector + from args dictionary + + Args: + kwargs: args dictionary + + Returns: + args dictionary + """ + base_args = XVector.filter_dino_teacher_args(**kwargs) + + valid_args = ( + "pos_dropout_rate", + "att_dropout_rate", + ) + + child_args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + base_args.update(child_args) + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + """Adds TransformerXVector config parameters for finetuning to argparser + + Args: + parser: argparse object + prefix: prefix string to add to the argument names + """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_dino_teacher_args(parser) + parser.add_argument( + "--pos-dropout-rate", + default=0.1, + type=float, + help="positional encoder dropout", + ) + parser.add_argument( + "--att-dropout-rate", default=0, type=float, help="self-att dropout" + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index 21932491..17d77116 100644 --- 
a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -2,17 +2,42 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging -from jsonargparse import ArgumentParser, ActionParser +from dataclasses import dataclass +from enum import Enum +from typing import List, Optional import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from ...layers import GlobalPool1dFactory as PF +from ....utils import HypDataClass +from ....utils.misc import filter_func_args from ...layer_blocks import TDNNBlock -from ...narchs import ClassifHead, TorchNALoader +from ...layers import GlobalPool1dFactory as PF +from ...narchs import ClassifHead, DINOHead, ProjHead, TorchNALoader from ...torch_model import TorchModel -from ...utils import eval_nnet_by_chunks +from ...utils import eval_nnet_by_chunks, scale_seq_lengths + + +class XVectorHeadType(str, Enum): + XVECTOR = "x-vector" + DINO = "dino" + + @staticmethod + def choices(): + return [o.value for o in XVectorHeadType] + + +@dataclass +class XVectorOutput(HypDataClass): + loss: torch.Tensor + logits: torch.Tensor + xvector: torch.Tensor + h_enc: Optional[List[torch.Tensor]] = None + h_classif: Optional[List[torch.Tensor]] = None + h_feats: Optional[List[torch.Tensor]] = None class XVector(TorchModel): @@ -30,18 +55,27 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, norm_layer=None, head_norm_layer=None, use_norm=True, norm_before=True, + head_use_norm=True, + head_use_in_norm=False, + head_hid_dim=2048, + head_bottleneck_dim=256, + proj_head_use_norm=True, + proj_head_norm_before=True, dropout_rate=0, embed_layer=0, in_feats=None, proj_feats=None, + head_type=XVectorHeadType.XVECTOR, + bias_weight_decay=None, ): - - super().__init__() + super().__init__(bias_weight_decay=bias_weight_decay) # encoder network self.encoder_net = encoder_net @@ -95,73 +129,155 @@ def __init__( # if head_norm_layer is none we use the global norm_layer if head_norm_layer is None and norm_layer is not None: - if norm_layer == "instance-norm" or norm_layer == "instance-norm-affine": + if norm_layer in ("instance-norm", "instance-norm-affine"): head_norm_layer = "batch-norm" else: head_norm_layer = norm_layer # create classification head logging.info("making classification head net") - self.classif_net = ClassifHead( - pool_feats, - num_classes, - embed_dim=embed_dim, - num_embed_layers=num_embed_layers, - hid_act=hid_act, - loss_type=loss_type, - cos_scale=cos_scale, - margin=margin, - margin_warmup_epochs=margin_warmup_epochs, - num_subcenters=num_subcenters, - norm_layer=head_norm_layer, - use_norm=use_norm, - norm_before=norm_before, - dropout_rate=dropout_rate, - ) - + self.embed_dim = embed_dim + self.num_embed_layers = num_embed_layers + self.head_type = head_type self.hid_act = hid_act self.norm_layer = norm_layer - self.head_norm_layer = head_norm_layer self.use_norm = use_norm self.norm_before = norm_before + self.head_use_in_norm = head_use_in_norm + self.head_use_norm = head_use_norm + self.head_norm_layer = head_norm_layer + self.head_hid_dim = head_hid_dim + self.head_bottleneck_dim = head_bottleneck_dim + self.proj_head_use_norm = proj_head_use_norm + self.proj_head_norm_before = proj_head_norm_before self.dropout_rate = dropout_rate self.embed_layer = embed_layer + if self.head_type == XVectorHeadType.XVECTOR: + 
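# x-vector head: ClassifHead produces both the embedding and the margin-based class logits directly from the pooled features, so no separate projection head is needed +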
self.proj_head_net = None + self.classif_net = ClassifHead( + pool_feats, + num_classes, + embed_dim=embed_dim, + num_embed_layers=num_embed_layers, + hid_act=hid_act, + loss_type=loss_type, + cos_scale=cos_scale, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, + norm_layer=head_norm_layer, + use_norm=head_use_norm, + norm_before=norm_before, + dropout_rate=dropout_rate, + use_in_norm=head_use_in_norm, + ) + elif self.head_type == XVectorHeadType.DINO: + self.proj_head_net = ProjHead( + pool_feats, + embed_dim, + norm_layer=head_norm_layer, + use_norm=proj_head_use_norm, + norm_before=proj_head_norm_before, + ) + self.classif_net = DINOHead( + embed_dim, + num_classes, + hid_feats=head_hid_dim, + bottleneck_feats=head_bottleneck_dim, + num_hid_layers=num_embed_layers, + hid_act=hid_act, + output_type=loss_type, + norm_layer=head_norm_layer, + use_norm=head_use_norm, + norm_before=norm_before, + dropout_rate=dropout_rate, + use_in_norm=head_use_in_norm, + ) @property def pool_feats(self): - return self.classif_net.in_feats + if self.proj_head_net is None: + return self.classif_net.in_feats + else: + return self.proj_head_net.in_feats @property def num_classes(self): return self.classif_net.num_classes @property - def embed_dim(self): - return self.classif_net.embed_dim + def cos_scale(self): + if self.head_type == XVectorHeadType.XVECTOR: + return self.classif_net.cos_scale + elif self.head_type == XVectorHeadType.DINO: + return 1 + else: + raise ValueError @property - def num_embed_layers(self): - return self.classif_net.num_embed_layers + def margin(self): + if self.head_type == XVectorHeadType.XVECTOR: + return self.classif_net.margin + else: + return 0.0 @property - def cos_scale(self): - return self.classif_net.cos_scale + def margin_warmup_epochs(self): + if self.head_type == XVectorHeadType.XVECTOR: + return self.classif_net.margin_warmup_epochs + else: + return 0 @property - def margin(self): - return self.classif_net.margin + def intertop_k(self): + if self.head_type == XVectorHeadType.XVECTOR: + return self.classif_net.intertop_k + else: + return 0 @property - def margin_warmup_epochs(self): - return self.classif_net.margin_warmup_epochs + def intertop_margin(self): + if self.head_type == XVectorHeadType.XVECTOR: + return self.classif_net.intertop_margin + else: + return 0.0 @property def num_subcenters(self): - return self.classif_net.num_subcenters + if self.head_type == XVectorHeadType.XVECTOR: + return self.classif_net.num_subcenters + else: + return 0 @property def loss_type(self): - return self.classif_net.loss_type + if self.head_type == XVectorHeadType.XVECTOR: + return self.classif_net.loss_type + elif self.head_type == XVectorHeadType.DINO: + return self.classif_net.output_type + else: + raise ValueError() + + # def clone(self): + # # weight normalized layers cannot be copied with deepcopy, + # # we remove them to clone and put them back later + # modules, cloned_modules = self.before_cloning() + # new_self = super().clone() + # self.after_cloning(*modules) + # new_self.after_cloning(*cloned_modules) + # return new_self + + # def before_cloning(self): + # if self.head_type == XVectorHeadType.DINO: + # return self.classif_net.before_cloning() + # else: + # return None, None + + # def after_cloning(self, output): + # if self.head_type == XVectorHeadType.DINO: + # self.classif_net.after_cloning(output) def _make_pool_net(self, pool_net, enc_feats=None): """Makes the 
pooling block @@ -201,119 +317,236 @@ def _pre_enc(self, x): x = x.view(x.size(0), 1, x.size(1), x.size(2)) return x - def _post_enc(self, x): + def _post_enc(self, x, in_lengths=None, max_in_length=None): if self.encoder_net.out_dim() == 4: x = x.view(x.size(0), -1, x.size(-1)) if self.proj is not None: x = self.proj(x) - return x + if in_lengths is not None: + out_lengths = scale_seq_lengths(in_lengths, x.size(-1), max_in_length) + else: + out_lengths = None + + return x, out_lengths def forward( - self, x, y=None, enc_layers=None, classif_layers=None, return_output=True + self, + x, + x_lengths=None, + y=None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, ): + """Forward function. It returns the class logits. + It can also return the hidden representations of the encoder and + classification head; in that case, the output also contains those hidden layers. - if enc_layers is None and classif_layers is None: - return self.forward_output(x, y) + Args: + x: input features tensor with shape=(batch, in_feats, time). + x_lengths: time lengths of the features with shape=(batch,). + y: target classes torch.long tensor with shape=(batch,). + return_enc_layers: list of integers indicating which encoder layers + we should return. If None, no encoder layers are returned. + return_classif_layers: list of integers indicating which classification head layers + we should return. If None, no head layers are returned. + return_logits: if True, it adds the logits to the output. + Returns: + XVectorOutput with the class logits with shape=(batch, num_classes) and, + optionally, "h_enc" (list of hidden encoder layers) and + "h_classif" (list of hidden classification head layers). + """ - h = self.forward_hid_feats(x, y, enc_layers, classif_layers, return_output) - output = {} - if enc_layers is not None: - if classif_layers is None: - output["h_enc"] = h - else: - output["h_enc"] = h[0] - else: - output["h_enc"] = [] - if classif_layers is not None: - output["h_classif"] = h[1] - else: - output["h_classif"] = [] - if return_output: - output["output"] = h[2] - return output + if return_enc_layers is None and return_classif_layers is None: + return self.forward_logits(x, x_lengths, y) - def forward_output(self, x, y=None): + return self.forward_hid_feats( + x, x_lengths, y, return_enc_layers, return_classif_layers, return_logits + ) + + def forward_logits(self, x, x_lengths=None, y=None): """Forward function Args: - x: input features tensor with shape=(batch, in_feats, time) - y: target classes torch.long tensor with shape=(batch,) + x: input features tensor with shape=(batch, in_feats, time). + x_lengths: time lengths of the features with shape=(batch,). + y: target classes torch.long tensor with shape=(batch,). Returns: - class posteriors tensor with shape=(batch, num_classes) + XVectorOutput with the class logits tensor with shape=(batch, num_classes).
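+ Example (a sketch; assumes a constructed XVector model and 80-dim input features): + >>> x = torch.randn(4, 80, 300) # (batch, feats, time) + >>> x_lengths = torch.tensor([300, 280, 250, 200]) + >>> output = model.forward_logits(x, x_lengths) + >>> output.logits.shape # torch.Size([4, num_classes])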
""" - if self.encoder_net.in_dim() == 4 and x.dim() == 3: - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - + max_in_length = x.size(-1) + x = self._pre_enc(x) x = self.encoder_net(x) - - if self.encoder_net.out_dim() == 4: - x = x.view(x.size(0), -1, x.size(-1)) - - if self.proj is not None: - x = self.proj(x) - - p = self.pool_net(x) - y = self.classif_net(p, y) - return y + if isinstance(x, tuple): + x = x[0] + x, x_lengths = self._post_enc(x, x_lengths, max_in_length) + p = self.pool_net(x, x_lengths=x_lengths) + xvector = None + if self.proj_head_net is not None: + p = self.proj_head_net(p) + xvector = p + + logits = self.classif_net(p, y) + # return logits + output = XVectorOutput(None, logits, xvector) + return output def forward_hid_feats( - self, x, y=None, enc_layers=None, classif_layers=None, return_output=False + self, + x, + x_lengths=None, + y=None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=False, ): - """forwards hidden representations in the x-vector network""" - - if self.encoder_net.in_dim() == 4 and x.dim() == 3: - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - - h_enc, x = self.encoder_net.forward_hid_feats(x, enc_layers, return_output=True) + """forwards hidden representations in the x-vector network - if not return_output and classif_layers is None: - return h_enc - - if self.encoder_net.out_dim() == 4: - x = x.view(x.size(0), -1, x.size(-1)) - - if self.proj is not None: - x = self.proj(x) - - p = self.pool_net(x) + Args: + x: input features tensor with shape=(batch, in_feats, time). + x_lengths: time lengths of the features with shape=(batch,). + y: target classes torch.long tensor with shape=(batch,). + return_enc_layers: list of integers indicating, which encoder layers + we should return. If None, no encoder layers are returned. + return_enc_layers: list of integers indicating, which classification head layers + we should return. If None, no head layers are returned. + return_logits: if True, it adds the logits to the output dictionary. + Returns: + Dictionary with "logits", "h_enc" (list of hidden encoder layers), + "h_classif" (list hidden classification head layers). + """ + max_in_length = x.size(-1) + x = self._pre_enc(x) + h_enc, x = self.encoder_net.forward_hid_feats( + x, return_enc_layers, return_output=True + ) + output = {"h_enc": h_enc} + if not return_logits and return_classif_layers is None: + return output + + x, x_lengths = self._post_enc(x, x_lengths, max_in_length) + p = self.pool_net(x, x_lengths=x_lengths) + if self.proj_head_net is not None: + p = self.proj_head_net(p) h_classif = self.classif_net.forward_hid_feats( - p, y, classif_layers, return_output=return_output + p, y, return_classif_layers, return_logits=return_logits ) - if return_output: - h_classif, y = h_classif - return h_enc, h_classif, y + if return_logits: + h_classif, y_pred = h_classif + else: + y_pred = None - return h_enc, h_classif + if h_classif is not None: + xvector = h_classif[0] + else: + xvector = None + + output = XVectorOutput(None, y_pred, xvector, h_enc, h_classif) + return output - def extract_embed(self, x, chunk_length=0, embed_layer=None, detach_chunks=False): + # def forward_hid_feats( + # self, + # x, + # x_lengths=None, + # y=None, + # return_enc_layers=None, + # return_classif_layers=None, + # return_logits=False, + # ): + # """forwards hidden representations in the x-vector network + + # Args: + # x: input features tensor with shape=(batch, in_feats, time). + # x_lengths: time lengths of the features with shape=(batch,). 
+ # y: target classes torch.long tensor with shape=(batch,). + # return_enc_layers: list of integers indicating, which encoder layers + # we should return. If None, no encoder layers are returned. + # return_enc_layers: list of integers indicating, which classification head layers + # we should return. If None, no head layers are returned. + # return_logits: if True, it adds the logits to the output dictionary. + # Returns: + # Dictionary with "logits", "h_enc" (list of hidden encoder layers), + # "h_classif" (list hidden classification head layers). + # """ + # max_in_length = x.size(-1) + # x = self._pre_enc(x) + # h_enc, x = self.encoder_net.forward_hid_feats( + # x, return_enc_layers, return_output=True + # ) + # output = {"h_enc": h_enc} + # if not return_logits and return_classif_layers is None: + # return output + + # x, x_lengths = self._post_enc(x, x_lengths, max_in_length) + # p = self.pool_net(x, x_lengths=x_lengths) + # if self.proj_head_net is not None: + # p = self.proj_head_net(p) + # h_classif = self.classif_net.forward_hid_feats( + # p, y, return_classif_layers, return_logits=return_logits + # ) + # if return_logits: + # h_classif, y_pred = h_classif + # output["h_classif"] = h_classif + # output["logits"] = y_pred + # return output + + # output["h_classif"] = h_classif + # return output + def extract_embed_impl( + self, x, x_lengths=None, chunk_length=0, embed_layer=None, detach_chunks=False ): if embed_layer is None: embed_layer = self.embed_layer + max_in_length = x.size(-1) x = self._pre_enc(x) - # if self.encoder_net.in_dim() == 4 and x.dim() == 3: - # x = x.view(x.size(0), 1, x.size(1), x.size(2)) - x = eval_nnet_by_chunks( - x, self.encoder_net, chunk_length, detach_chunks=detach_chunks - ) - - if x.device != self.device: - x = x.to(self.device) - - x = self._post_enc(x) + if max_in_length <= chunk_length or chunk_length == 0: + x = self.encoder_net(x, x_lengths=x_lengths) + if isinstance(x, tuple): + x = x[0] + else: + x = eval_nnet_by_chunks( + x, self.encoder_net, chunk_length, detach_chunks=detach_chunks ) - # if self.encoder_net.out_dim() == 4: - # x = x.view(x.size(0), -1, x.size(-1)) + if x.device != self.device: + x = x.to(self.device) - # if self.proj is not None: - # x = self.proj(x) + x, x_lengths = self._post_enc(x, x_lengths, max_in_length) + p = self.pool_net(x, x_lengths=x_lengths) + if self.proj_head_net is not None: + return self.proj_head_net(p) - p = self.pool_net(x) y = self.classif_net.extract_embed(p, embed_layer) return y + def extract_embed( + self, x, x_lengths=None, chunk_length=0, embed_layer=None, detach_chunks=False + ): + + if x.size(-1) <= chunk_length or chunk_length == 0: + return self.extract_embed_impl(x, x_lengths, 0, embed_layer) + else: + e = [] + for i in range(x.size(0)): + x_i = x[i : i + 1] + if x_lengths is not None: + x_i = x_i[..., : x_lengths[i]] + + e_i = self.extract_embed_impl( + x_i, + chunk_length=chunk_length, + embed_layer=embed_layer, + detach_chunks=detach_chunks, + ) + e.append(e_i) + + return torch.cat(e, dim=0) + def extract_embed_slidwin( self, x, @@ -326,7 +559,6 @@ def extract_embed_slidwin( embed_layer=None, detach_chunks=False, ): - if feat_frame_shift is not None: # assume win_length/shift are in secs, transform to frames # pass feat times from msecs to secs @@ -344,7 +576,7 @@ def extract_embed_slidwin( embed_layer = self.embed_layer in_time = x.size(-1) x = self._pre_enc(x) x = eval_nnet_by_chunks( x, self.encoder_net, chunk_length, detach_chunks=detach_chunks ) @@
-383,7 +615,6 @@ def compute_slidwin_timestamps( feat_frame_shift=10, feat_snip_edges=False, ): - P = self.compute_slidwin_left_padding( win_length, win_shift, @@ -414,7 +645,6 @@ def compute_slidwin_left_padding( feat_frame_shift=10, feat_snip_edges=False, ): - # pass feat times from msecs to secs feat_frame_shift = feat_frame_shift / 1000 feat_frame_length = feat_frame_length / 1000 @@ -445,10 +675,8 @@ def compute_slidwin_left_padding( return P1 + P2 def get_config(self): - enc_cfg = self.encoder_net.get_config() pool_cfg = PF.get_config(self.pool_net) - config = { "encoder_cfg": enc_cfg, "pool_net": pool_cfg, @@ -460,15 +688,25 @@ def get_config(self): "cos_scale": self.cos_scale, "margin": self.margin, "margin_warmup_epochs": self.margin_warmup_epochs, + "intertop_k": self.intertop_k, + "intertop_margin": self.intertop_margin, "num_subcenters": self.num_subcenters, "norm_layer": self.norm_layer, - "head_norm_layer": self.head_norm_layer, "use_norm": self.use_norm, "norm_before": self.norm_before, + "head_norm_layer": self.head_norm_layer, + "head_use_norm": self.head_use_norm, + "head_use_in_norm": self.head_use_in_norm, + "head_hid_dim": self.head_hid_dim, + "head_bottleneck_dim": self.head_bottleneck_dim, + "proj_head_use_norm": self.proj_head_use_norm, + "proj_head_norm_before": self.proj_head_norm_before, "dropout_rate": self.dropout_rate, "embed_layer": self.embed_layer, "in_feats": self.in_feats, "proj_feats": self.proj_feats, + "head_type": self.head_type, + "bias_weight_decay": self.bias_weight_decay, } base_config = super().get_config() @@ -487,6 +725,40 @@ def load(cls, file_path=None, cfg=None, state_dict=None): return model + def change_config( + self, + override_output=False, + override_dropouts=False, + dropout_rate=0, + num_classes=None, + loss_type="arc-softmax", + cos_scale=64, + margin=0.3, + margin_warmup_epochs=10, + intertop_k=5, + intertop_margin=0.0, + num_subcenters=2, + head_type=XVectorHeadType.XVECTOR, + ): + logging.info("changing x-vector config") + if override_output: + self.rebuild_output_layer( + num_classes=num_classes, + loss_type=loss_type, + cos_scale=cos_scale, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, + head_type=head_type, + ) + + if override_dropouts: + logging.info("overriding x-vector dropouts") + self.encoder_net.change_dropouts(dropout_rate) + self.classif_net.change_dropouts(dropout_rate) + def rebuild_output_layer( self, num_classes=None, @@ -494,14 +766,83 @@ def rebuild_output_layer( cos_scale=64, margin=0.3, margin_warmup_epochs=10, + intertop_k=5, + intertop_margin=0.0, + num_subcenters=2, + head_type=XVectorHeadType.XVECTOR, ): - if (self.num_classes is not None and self.num_classes != num_classes) or ( - self.loss_type != loss_type + + if head_type != self.head_type: + # only from dino to x-vector + assert self.head_type == XVectorHeadType.DINO + logging.info("transforming dino head into x-vector head") + self.num_embed_layers = 1 + self.head_use_in_norm = ( + self.proj_head_use_norm and self.proj_head_norm_before + ) + self.head_use_norm = ( + self.proj_head_use_norm and not self.proj_head_norm_before + ) + self.classif_net = ClassifHead( + self.proj_head_net.in_feats, + num_classes, + embed_dim=self.proj_head_net.out_feats, + num_embed_layers=1, + hid_act=None, + loss_type=loss_type, + cos_scale=cos_scale, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + 
intertop_margin=intertop_margin, + num_subcenters=num_subcenters, + norm_layer=self.head_norm_layer, + use_norm=self.proj_head_use_norm, + norm_before=self.norm_before, + dropout_rate=self.dropout_rate, + use_in_norm=self.head_use_in_norm, + ) + + if ( + self.classif_net.fc_blocks[0].linear.bias is not None + and self.proj_head_net.proj.bias is not None + ): + self.classif_net.fc_blocks[0].linear.bias.data.copy_( + self.proj_head_net.proj.bias.data + ) + + self.classif_net.fc_blocks[0].linear.weight.data.copy_( + self.proj_head_net.proj.weight.data + ) + if self.head_use_norm: + self.classif_net.fc_blocks[0].bn1.load_state_dict( + self.proj_head_net._norm_layer.state_dict() + ) + del self.proj_head_net + self.proj_head_net = None + self.head_type = XVectorHeadType.XVECTOR + return + + if ( + (self.num_classes is not None and self.num_classes != num_classes) + or (self.loss_type != loss_type) + or ( + loss_type == "subcenter-arc-softmax" + and self.classif_net.num_subcenters != num_subcenters + ) ): # if we change the number of classes or the loss-type # we need to reinitiate the last layer + logging.info("rebuilding output layer") self.classif_net.rebuild_output_layer( - num_classes, loss_type, s, margin, margin_warmup_epochs + num_classes, + loss_type, + cos_scale, + margin, + margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, ) return @@ -509,6 +850,13 @@ def rebuild_output_layer( self.classif_net.set_margin(margin) self.classif_net.set_margin_warmup_epochs(margin_warmup_epochs) self.classif_net.set_cos_scale(cos_scale) + self.classif_net.set_intertop_k(intertop_k) + self.classif_net.set_intertop_margin(intertop_margin) + self.classif_net.set_num_subcenters(num_subcenters) + + def cancel_output_layer_grads(self): + for p in self.classif_net.output.parameters(): + p.grad = None def freeze_preembed_layers(self): self.encoder_net.freeze() @@ -521,72 +869,52 @@ def freeze_preembed_layers(self): layer_list = [l for l in range(self.embed_layer)] self.classif_net.freeze_layers(layer_list) - def train_mode(self, mode="ft-embed-affine"): - if mode == "ft-full" or mode == "train": - self.train() + def set_train_mode(self, mode): + if mode == self._train_mode: return - self.encoder_net.eval() - if self.proj is not None: - self.proj.eval() + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode == "ft-embed-affine": + self.unfreeze() + self.freeze_preembed_layers() + else: + raise ValueError(f"invalid train_mode={mode}") - self.pool_net.eval() - self.classif_net.train() - layer_list = [l for l in range(self.embed_layer)] - self.classif_net.put_layers_in_eval_mode(layer_list) + if self.head_type == XVectorHeadType.DINO: + self.classif_net.freeze_output_g() - @staticmethod - def filter_args(**kwargs): + self._train_mode = mode - # # get boolean args that are negated - # if 'pool_wo_bias' in kwargs: - # kwargs['pool_use_bias'] = not kwargs['pool_wo_bias'] - # del kwargs['pool_wo_bias'] + def _train(self, train_mode: str): + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode == "ft-embed-affine": + self.encoder_net.eval() + if self.proj is not None: + self.proj.eval() - if "wo_norm" in kwargs: - kwargs["use_norm"] = not kwargs["wo_norm"] - del kwargs["wo_norm"] + self.pool_net.eval() + self.classif_net.train() + layer_list = [l for l in range(self.embed_layer)] + self.classif_net.put_layers_in_eval_mode(layer_list) + else: + raise ValueError(f"invalid 
train_mode={train_mode}") - if "norm_after" in kwargs: - kwargs["norm_before"] = not kwargs["norm_after"] - del kwargs["norm_after"] + def compute_prototype_affinity(self): + return self.classif_net.compute_prototype_affinity() + @staticmethod + def valid_train_modes(): + return ["full", "frozen", "ft-embed-affine"] + + @staticmethod + def filter_args(**kwargs): # get arguments for pooling pool_args = PF.filter_args(**kwargs["pool_net"]) - # pool_valid_args = ( - # 'pool_type', 'pool_num_comp', 'pool_use_bias', - # 'pool_dist_pow', 'pool_d_k', 'pool_d_v', 'pool_num_heads', - # 'pool_bin_attn', 'pool_inner_feats') - # pool_args = dict((k, kwargs[k]) - # for k in pool_valid_args if k in kwargs) - - # # remove pooling prefix from arg name - # for k in pool_valid_args[1:]: - # if k in pool_args: - # k2 = k.replace('pool_','') - # pool_args[k2] = pool_args[k] - # del pool_args[k] - - valid_args = ( - "num_classes", - "embed_dim", - "num_embed_layers", - "hid_act", - "loss_type", - "s", - "margin", - "margin_warmup_epochs", - "num_subcenters", - "use_norm", - "norm_before", - "in_feats", - "proj_feats", - "dropout_rate", - "norm_layer", - "head_norm_layer", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - + args = filter_func_args(XVector.__init__, kwargs) args["pool_net"] = pool_args return args @@ -600,48 +928,12 @@ def add_class_args(parser, prefix=None, skip=set()): parser, prefix="pool_net", skip=["dim", "in_feats", "keepdim"] ) - # parser.add_argument('--pool-type', type=str.lower, - # default='mean+stddev', - # choices=['avg','mean+stddev', 'mean+logvar', - # 'lde', 'scaled-dot-prod-att-v1', 'ch-wise-att-mean-stddev'], - # help=('Pooling methods: Avg, Mean+Std, Mean+logVar, LDE, ' - # 'scaled-dot-product-attention-v1')) - - # parser.add_argument('--pool-num-comp', - # default=64, type=int, - # help=('number of components for LDE pooling')) - - # parser.add_argument('--pool-dist-pow', - # default=2, type=int, - # help=('Distace power for LDE pooling')) - - # parser.add_argument('--pool-wo-bias', - # default=False, action='store_true', - # help=('Don\' use bias in LDE')) - - # parser.add_argument( - # '--pool-num-heads', default=8, type=int, - # help=('number of attention heads')) - - # parser.add_argument( - # '--pool-d-k', default=256, type=int, - # help=('key dimension for attention')) - - # parser.add_argument( - # '--pool-d-v', default=256, type=int, - # help=('value dimension for attention')) - - # parser.add_argument( - # '--pool-bin-attn', default=False, action='store_true', - # help=('Use binary attention, i.e. 
sigmoid instead of softmax')) - - # parser.add_argument( - # '--pool-inner-feats', default=128, type=int, - # help=('inner feature size for attentive pooling')) - - # parser.add_argument('--num-classes', - # required=True, type=int, - # help=('number of classes')) + parser.add_argument( + "--head-type", + default=XVectorHeadType.XVECTOR, + choices=XVectorHeadType.choices(), + help="type of classification head in [x-vector, dino]", + ) parser.add_argument( "--embed-dim", default=256, type=int, help=("x-vector dimension") @@ -655,7 +947,7 @@ def add_class_args(parser, prefix=None, skip=set()): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass @@ -681,6 +973,16 @@ def add_class_args(parser, prefix=None, skip=set()): help="number of epoch until we set the final margin", ) + parser.add_argument( + "--intertop-k", default=5, type=int, help="K for InterTopK penalty" + ) + parser.add_argument( + "--intertop-margin", + default=0.0, + type=float, + help="margin for InterTopK penalty", + ) + parser.add_argument( "--num-subcenters", default=2, @@ -724,17 +1026,57 @@ def add_class_args(parser, prefix=None, skip=set()): pass parser.add_argument( - "--wo-norm", - default=False, - action="store_true", + "--use-norm", + default=True, + action=ActionYesNo, help="without batch normalization", ) parser.add_argument( - "--norm-after", + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalizaton before activation", + ) + + parser.add_argument( + "--head-use-norm", + default=True, + action=ActionYesNo, + help="batch normalizaton at the head", + ) + parser.add_argument( + "--head-use-in-norm", default=False, - action="store_true", - help="batch normalizaton after activation", + action=ActionYesNo, + help="batch normalizaton at the head input", + ) + + parser.add_argument( + "--head-hid-dim", + default=2048, + type=int, + help="bottleneck dim of DINO head", + ) + + parser.add_argument( + "--head-bottleneck-dim", + default=256, + type=int, + help="bottleneck dim of DINO head", + ) + + parser.add_argument( + "--proj-head-use-norm", + default=True, + action=ActionYesNo, + help="batch normalizaton at projection head", + ) + parser.add_argument( + "--proj-head-norm-before", + default=False, + action=ActionYesNo, + help="batch normalizaton at the begining of projection head", ) try: @@ -762,6 +1104,14 @@ def add_class_args(parser, prefix=None, skip=set()): "if None, there is not projection" ), ) + + parser.add_argument( + "--bias-weight-decay", + default=None, + type=float, + help="biases weight decay, if None default it is used", + ) + if prefix is not None: outer_parser.add_argument( "--" + prefix, @@ -771,9 +1121,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - valid_args = ("loss_type", "s", "margin", "margin_warmup_epochs") - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - + args = filter_func_args(XVector.change_config, kwargs) return args @staticmethod @@ -782,6 +1130,13 @@ def add_finetune_args(parser, prefix=None): outer_parser = parser parser = ArgumentParser(prog="") + parser.add_argument( + "--override-output", + default=False, + action=ActionYesNo, + help="changes the config of the output layer", + ) + parser.add_argument( "--loss-type", default="arc-softmax", @@ -804,16 +1159,74 @@ def add_finetune_args(parser, prefix=None): help="number of epoch until we set the final margin", ) + 
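# a typical fine-tuning invocation might pass (illustrative values): + # --override-output --loss-type subcenter-arc-softmax --margin 0.2 --intertop-margin 0.1 +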
parser.add_argument( + "--intertop-k", default=5, type=int, help="K for InterTopK penalty" + ) + parser.add_argument( + "--intertop-margin", + default=0.0, + type=float, + help="margin for InterTopK penalty", + ) + parser.add_argument( "--num-subcenters", default=2, - type=float, + type=int, help="number of subcenters in subcenter losses", ) + try: + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model." + ), + ) + except: + pass + + try: + parser.add_argument("--dropout-rate", default=0, type=float, help="dropout") + except: + pass + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + return XVector.filter_finetune_args(**kwargs) + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + try: + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model." + ), + ) + except: + pass + + try: + parser.add_argument("--dropout-rate", default=0, type=float, help="dropout") + except: + pass + if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='xvector finetune opts') add_argparse_args = add_class_args add_argparse_finetune_args = add_finetune_args diff --git a/hyperion/torch/narchs/__init__.py b/hyperion/torch/narchs/__init__.py index 71cd9de4..0bf7ecf4 100644 --- a/hyperion/torch/narchs/__init__.py +++ b/hyperion/torch/narchs/__init__.py @@ -3,36 +3,31 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .fcnet import FCNetV1, FCNetV2 - -from .tdnn import TDNNV1 +from .audio_feats_mvn import AudioFeatsMVN +from .classif_head import ClassifHead +from .conformer_encoder_v1 import ConformerEncoderV1 +from .dc1d_decoder import DC1dDecoder +from .dc1d_encoder import DC1dEncoder +from .dc2d_decoder import DC2dDecoder +from .dc2d_encoder import DC2dEncoder +from .dino_head import DINOHead +from .efficient_net import EfficientNet from .etdnn import ETDNNV1 +from .fcnet import FCNetV1, FCNetV2 +from .feat_fuser_mvn import FeatFuserMVN +from .proj_head import ProjHead from .resetdnn import ResETDNNV1 -from .tdnn_factory import TDNNFactory - from .resnet import * +from .resnet1d_decoder import ResNet1dDecoder +from .resnet1d_encoder import ResNet1dEncoder +from .resnet2d_decoder import ResNet2dDecoder +from .resnet2d_encoder import ResNet2dEncoder from .resnet_factory import ResNetFactory - +from .rnn_encoder import RNNEncoder +from .rnn_transducer_decoder import RNNTransducerDecoder from .spinenet import * from .spinenet_factory import SpineNetFactory - -from .transformer_encoder_v1 import TransformerEncoderV1 -from .conformer_encoder_v1 import ConformerEncoderV1 - -from .dc1d_encoder import DC1dEncoder -from .dc1d_decoder import DC1dDecoder -from .dc2d_encoder import DC2dEncoder -from .dc2d_decoder import DC2dDecoder - -from .resnet1d_encoder import ResNet1dEncoder -from .resnet1d_decoder import ResNet1dDecoder -from .resnet2d_encoder import ResNet2dEncoder -from .resnet2d_decoder import ResNet2dDecoder - -from .efficient_net import EfficientNet - -from .classif_head import ClassifHead - -from .audio_feats_mvn 
import AudioFeatsMVN - +from .tdnn import TDNNV1 +from .tdnn_factory import TDNNFactory from .torch_na_loader import TorchNALoader +from .transformer_encoder_v1 import TransformerEncoderV1 diff --git a/hyperion/torch/narchs/audio_feats_mvn.py b/hyperion/torch/narchs/audio_feats_mvn.py index 1d5cb0a3..dabf308f 100644 --- a/hyperion/torch/narchs/audio_feats_mvn.py +++ b/hyperion/torch/narchs/audio_feats_mvn.py @@ -2,9 +2,9 @@ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser - +import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layers import AudioFeatsFactory as AFF from ..layers import MeanVarianceNorm as MVN @@ -31,7 +31,12 @@ def __init__( if mvn is not None: mvn = MVN.filter_args(**mvn) self.mvn_cfg = mvn - if mvn["norm_mean"] or mvn["norm_var"]: + if ( + ("norm_mean" in mvn) + and mvn["norm_mean"] + or ("norm_var" in mvn) + and mvn["norm_var"] + ): self.mvn = MVN(**mvn) self.spec_augment = None @@ -44,6 +49,10 @@ def __init__( self.trans = trans self.aug_after_mvn = aug_after_mvn + @property + def sample_frequency(self): + return self.audio_feats.fs + @property def fs(self): return self.audio_feats.fs @@ -56,21 +65,29 @@ def frame_length(self): def frame_shift(self): return self.audio_feats.frame_shift - def forward(self, x, lengths=None): + @staticmethod + def _compute_feat_lengths(x_lengths, max_samples, max_frames): + if x_lengths is None: + return None + + return torch.div(x_lengths * max_frames, max_samples, rounding_mode="floor") + + def forward(self, x, x_lengths=None): f = self.audio_feats(x) + f_lengths = self._compute_feat_lengths(x_lengths, x.size(-1), f.size(1)) if self.spec_augment is not None and not self.aug_after_mvn: - f = self.spec_augment(f, lengths) + f = self.spec_augment(f, f_lengths) if self.mvn is not None: - f = self.mvn(f) + f = self.mvn(f, f_lengths) if self.spec_augment is not None and self.aug_after_mvn: - f = self.spec_augment(f, lengths) + f = self.spec_augment(f, f_lengths) if self.trans: f = f.transpose(1, 2).contiguous() - return f + return f, f_lengths def get_config(self): config = { @@ -99,10 +116,9 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--aug-after-mvn", default=False, - action="store_true", + action=ActionYesNo, help=("do spec augment after st-mvn," "instead of before"), ) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='feature extraction options') diff --git a/hyperion/torch/narchs/classif_head.py b/hyperion/torch/narchs/classif_head.py index e3af9f2d..a4a7e9a1 100644 --- a/hyperion/torch/narchs/classif_head.py +++ b/hyperion/torch/narchs/classif_head.py @@ -3,14 +3,18 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch.nn import Linear -from ..layers import CosLossOutput, ArcLossOutput, SubCenterArcLossOutput -from ..layers import NormLayer1dFactory as NLF +from ...utils.misc import filter_func_args from ..layer_blocks import FCBlock +from ..layers import ActivationFactory as AF +from ..layers import ArcLossOutput, CosLossOutput +from ..layers import NormLayer1dFactory as NLF +from ..layers import SubCenterArcLossOutput from .net_arch import NetArch @@ -28,10 +32,13 @@ class ClassifHead(NetArch): 
cos_scale: scale parameter for cos-softmax and arc-softmax margin: margin parameter for cos-softmax and arc-softmax margin_warmup_epochs: number of epochs to anneal the margin from 0 to margin + intertop_k: adds negative angular penalty to k largest negative scores. + intertop_margin: inter-top-k penalty. num_subcenters: number of subcenters in subcenter losses norm_layer: norm_layer object or str indicating type norm layer, if None it uses BatchNorm1d use_norm: if True it uses layer/batch-normalization norm_before: if True, layer-norm is before the activation function + use_in_norm: put batchnorm at the input """ def __init__( @@ -45,13 +52,15 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, norm_layer=None, use_norm=True, norm_before=True, dropout_rate=0, + use_in_norm=False, ): - super().__init__() assert num_embed_layers >= 1, "num_embed_layers (%d < 1)" % num_embed_layers @@ -60,6 +69,7 @@ def __init__( self.embed_dim = embed_dim self.num_classes = num_classes self.norm_layer = norm_layer + self.use_in_norm = use_in_norm if use_norm: norm_groups = None @@ -77,8 +87,14 @@ def __init__( self.cos_scale = cos_scale self.margin = margin self.margin_warmup_epochs = margin_warmup_epochs + self.intertop_k = intertop_k + self.intertop_margin = intertop_margin self.num_subcenters = num_subcenters + if self.use_in_norm: + assert not self.norm_before + self.in_norm = self._norm_layer(in_feats) + prev_feats = in_feats fc_blocks = [] for i in range(num_embed_layers - 1): @@ -100,16 +116,21 @@ def __init__( else: act = hid_act - fc_blocks.append( - FCBlock( - prev_feats, - embed_dim, - activation=act, - norm_layer=self._norm_layer, - use_norm=use_norm, - norm_before=norm_before, + if self.use_in_norm: + fc_blocks.append( + FCBlock(prev_feats, embed_dim, activation=act, use_norm=False) + ) + else: + fc_blocks.append( + FCBlock( + prev_feats, + embed_dim, + activation=act, + norm_layer=self._norm_layer, + use_norm=use_norm, + norm_before=norm_before, + ) ) - ) self.fc_blocks = nn.ModuleList(fc_blocks) @@ -123,6 +144,8 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, ) elif loss_type == "arc-softmax": self.output = ArcLossOutput( @@ -131,6 +154,8 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, ) elif loss_type == "subcenter-arc-softmax": self.output = SubCenterArcLossOutput( @@ -140,18 +165,29 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, ) def rebuild_output_layer( - self, num_classes, loss_type, s, margin, margin_warmup_epochs, num_subcenters=2 + self, + num_classes, + loss_type, + cos_scale, + margin, + margin_warmup_epochs, + intertop_k=5, + intertop_margin=0.0, + num_subcenters=2, ): - embed_dim = self.embed_dim self.num_classes = num_classes self.loss_type = loss_type self.cos_scale = cos_scale self.margin = margin self.margin_warmup_epochs = margin_warmup_epochs + self.intertop_k = intertop_k + self.intertop_margin = intertop_margin self.num_subcenters = num_subcenters if loss_type == "softmax": @@ -163,6 +199,8 @@ def rebuild_output_layer( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, ) elif loss_type
== "arc-softmax": self.output = ArcLossOutput( @@ -171,6 +209,8 @@ def rebuild_output_layer( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, ) elif loss_type == "subcenter-arc-softmax": self.output = SubCenterArcLossOutput( @@ -180,6 +220,8 @@ def rebuild_output_layer( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, ) def set_margin(self, margin): @@ -203,6 +245,27 @@ def set_cos_scale(self, cos_scale): self.cos_scale = cos_scale self.output.cos_scale = cos_scale + def set_intertop_k(self, intertop_k): + if self.loss_type == "softmax": + return + + self.intertop_k = intertop_k + self.output.intertop_k = intertop_k + + def set_intertop_margin(self, intertop_margin): + if self.loss_type == "softmax": + return + + self.intertop_margin = intertop_margin + self.output.intertop_margin = intertop_margin + + def set_num_subcenters(self, num_subcenters): + if not self.loss_type == "subcenter-arc-softmax": + return + + self.num_subcenters = num_subcenters + self.output.num_subcenters = num_subcenters + def update_margin(self, epoch): if hasattr(self.output, "update_margin"): self.output.update_margin(epoch) @@ -217,6 +280,8 @@ def put_layers_in_eval_mode(self, layer_list): self.fc_blocks[l].eval() def forward(self, x, y=None): + if self.use_in_norm: + x = self.in_norm(x) for l in range(self.num_embed_layers): x = self.fc_blocks[l](x) @@ -228,16 +293,18 @@ def forward(self, x, y=None): return y - def forward_hid_feats(self, x, y=None, layers=None, return_output=False): + def forward_hid_feats(self, x, y=None, return_layers=None, return_logits=False): + assert return_layers is not None or return_logits + if return_layers is None: + return_layers = [] - assert layers is not None or return_output - if layers is None: - layers = [] + if self.use_in_norm: + x = self.in_norm(x) h = [] for l in range(self.num_embed_layers): x = self.fc_blocks[l](x) - if l in layers: + if l in return_layers: h.append(x) if self.loss_type == "softmax": @@ -245,20 +312,32 @@ def forward_hid_feats(self, x, y=None, layers=None, return_output=False): else: y = self.output(x, y) - if return_output: + if return_logits: return h, y - return h + return h, None def extract_embed(self, x, embed_layer=0): + if self.use_in_norm: + x = self.in_norm(x) for l in range(embed_layer): x = self.fc_blocks[l](x) - y = self.fc_blocks[embed_layer].forward_linear(x) + if self.loss_type == "softmax" or embed_layer < self.num_embed_layers: + y = self.fc_blocks[embed_layer].forward_linear(x) + else: + y = self.fc_blocks[l](x) return y - def get_config(self): + def compute_prototype_affinity(self): + if self.loss_type != "softmax": + return self.output.compute_prototype_affinity() + kernel = self.output.weight # (num_classes, feat_dim) + kernel = kernel / torch.linalg.norm(kernel, 2, dim=1, keepdim=True) + return torch.mm(kernel, kernel.transpose(0, 1)) + + def get_config(self): hid_act = AF.get_config(self.fc_blocks[0].activation) config = { @@ -271,11 +350,14 @@ def get_config(self): "cos_scale": self.cos_scale, "margin": self.margin, "margin_warmup_epochs": self.margin_warmup_epochs, + "intertop_k": self.intertop_k, + "intertop_margin": self.intertop_margin, "num_subcenters": self.num_subcenters, "norm_layer": self.norm_layer, "use_norm": self.use_norm, "norm_before": self.norm_before, "dropout_rate": self.dropout_rate, + "use_in_norm": self.use_in_norm, } base_config = 
super().get_config() @@ -283,7 +365,6 @@ def get_config(self): @staticmethod def filter_args(**kwargs): - if "wo_norm" in kwargs: kwargs["use_norm"] = not kwargs["wo_norm"] del kwargs["wo_norm"] @@ -292,22 +373,7 @@ def filter_args(**kwargs): kwargs["norm_before"] = not kwargs["norm_after"] del kwargs["norm_after"] - valid_args = ( - "num_classes", - "embed_dim", - "num_embed_layers", - "hid_act", - "loss_type", - "s", - "margin", - "margin_warmup_epochs", - "num_subcenters", - "use_norm", - "norm_before", - "dropout_rate", - "norm_layer", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args = filter_func_args(ClassifHead.__init__, kwargs) return args @staticmethod @@ -328,7 +394,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass @@ -339,7 +405,9 @@ def add_class_args(parser, prefix=None): help="loss type: softmax, arc-softmax, cos-softmax, subcenter-arc-softmax", ) - parser.add_argument("--s", default=64, type=float, help="scale for arcface") + parser.add_argument( + "--cos-scale", default=64, type=float, help="scale for arcface" + ) parser.add_argument( "--margin", default=0.3, type=float, help="margin for arcface, cosface,..." @@ -352,6 +420,16 @@ def add_class_args(parser, prefix=None): help="number of epochs until we set the final margin", ) + parser.add_argument( + "--intertop-k", default=5, type=int, help="K for InterTopK penalty" + ) + parser.add_argument( + "--intertop-margin", + default=0.0, + type=float, + help="margin for InterTopK penalty", + ) + parser.add_argument( "--num-subcenters", default=2, @@ -376,17 +454,24 @@ def add_class_args(parser, prefix=None): pass parser.add_argument( - "--wo-norm", - default=False, - action="store_true", + "--use-norm", + default=True, + action=ActionYesNo, help="use batch normalization", ) parser.add_argument( - "--norm-after", + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalization before activation", + ) + + parser.add_argument( + "--use-in-norm", default=False, - action="store_true", - help="batch normalizaton after activation", + action=ActionYesNo, + help="batch normalization at the classif head input", ) try: @@ -396,6 +481,5 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='classification head options') add_argparse_args = add_class_args diff --git a/hyperion/torch/narchs/conformer_decoder_v1.py b/hyperion/torch/narchs/conformer_decoder_v1.py new file mode 100644 index 00000000..ef55d6c3 --- /dev/null +++ b/hyperion/torch/narchs/conformer_decoder_v1.py @@ -0,0 +1,724 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba, Nanxin Chen) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...utils.misc import filter_func_args +from ..layer_blocks import ConformerDecoderBlockV1 as DBlock +from ..layer_blocks import TransformerConv1dSubsampler as Conv1dSubsampler +from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler +from ..layers import ActivationFactory as AF +from ..layers import ConvPosEncoder, NoPosEncoder +from ..layers import NormLayer1dFactory as NLF +from ..layers import PosEncoder, RelPosEncoder +from ..utils import make_attn_mask_causal,
scale_seq_lengths, seq_lengths_to_mask +from .net_arch import NetArch + + +class ConformerDecoderV1(NetArch): + """Conformer decoder mixing Transformer Decoder with Conformer Encoder Conv blocks + + This becomes a standard Transformer Decoder by setting conv_repeats=0, pos_enc_type='abs', ff_macaron=False. + + Attributes: + in_feats: input features dimension + d_model: decoder blocks feature dimension + num_heads: number of heads + num_blocks: number of self attn blocks + self_att_type: string in ['scaled-dot-prod-v1', 'local-scaled-dot-prod-v1', 'block-scaled-dot-prod-v1'] + att_context: maximum context range for local attention + cross_att_type: string in ['scaled-dot-prod-v1', 'local-scaled-dot-prod-v1', 'block-scaled-dot-prod-v1'] + conv_repeats: number of conv blocks in each conformer block + conv_kernel_sizes: kernel size for conv blocks + conv_strides: stride for depth-wise conv in the first conv block of each conformer block + ff_type: string in ['linear', 'conv1dx2', 'conv1d-linear'] + d_ff: dimension of middle layer in feed_forward block + ff_kernel_size: kernel size for convolutional versions of ff block + dropout_rate: dropout rate for ff and conv blocks + pos_dropout_rate: dropout rate for positional encoder + att_dropout_rate: dropout rate for attention block + in_layer_type: input layer block type in ['linear','conv2d-sub', 'embed', None] + pos_enc_type: type of positional encoder ['no', 'abs', 'rel', 'conv'] + + causal_pos_enc: if True, use causal positional encodings (when pos_enc_type='rel'), it assumes + that query q_i only attends to key k_j when j<=i + hid_act: hidden activations in ff and input blocks + conv_norm_layer: norm layer constructor or str for conv block, + if None it uses BatchNorm1d + se_r: Squeeze-Excitation compression ratio, + if None it doesn't use Squeeze-Excitation + ff_macaron: if True, it uses macaron-net style ff layers, otherwise transformer style.
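As the docstring just above notes, the decoder degenerates to a standard Transformer decoder under a specific configuration. A hedged sketch of that configuration (argument names from the constructor that follows; the call itself is commented out since it needs the full toolkit installed):

```python
# Argument names taken from ConformerDecoderV1.__init__; num_classes is
# an illustrative vocabulary size.
std_transformer_decoder_cfg = dict(
    num_classes=1000,
    conv_repeats=0,        # no conformer conv blocks
    pos_enc_type="abs",    # absolute positional encodings
    ff_macaron=False,      # single transformer-style ff block
)
# decoder = ConformerDecoderV1(**std_transformer_decoder_cfg)
```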
+ red_lnorms: it True, use redundant LNorm layers at the output of the conformer blocks as + in the paper + concat_after: if True, if concats attention input and output and apply linear transform, i.e., + y = x + linear(concat(x, att(x))) + if False, y = x + att(x) + padding_idx: padding idx for embed layer + in_time_dim: time dimension in the input Tensor + out_time_dim: dimension that we want to be time in the output tensor + """ + + def __init__( + self, + num_classes, + d_model=256, + num_heads=4, + num_blocks=6, + self_att_type="scaled-dot-prod-v1", + att_context=25, + cross_att_type="scaled-dot-prod-v1", + conv_repeats=0, + conv_kernel_sizes=31, + conv_strides=1, + ff_type="linear", + d_ff=2048, + ff_kernel_size=1, + dropout_rate=0.1, + pos_dropout_rate=0.1, + att_dropout_rate=0.0, + in_layer_type="embed", + in_stride=4, + pos_enc_type="abs", + causal_pos_enc=False, + pos_kernel_size=128, + pos_num_groups=16, + hid_act="swish", + conv_norm_layer=None, + se_r=None, + ff_macaron=True, + red_lnorms=True, + concat_after=False, + padding_idx=-1, + in_time_dim=1, + src_time_dim=1, + out_time_dim=1, + in_feats=None, + with_output=True, + ): + super().__init__() + self.num_classes = num_classes + self.with_output = with_output + if in_feats is None: + in_feats = num_classes + self.in_feats = in_feats + self.d_model = d_model + self.num_heads = num_heads + self.num_blocks = num_blocks + + self.self_att_type = self_att_type + self.cross_att_type = cross_att_type + self.att_context = att_context + + self.conv_repeats = self._standarize_cblocks_param( + conv_repeats, num_blocks, "conv_repeats" + ) + self.conv_kernel_sizes = self._standarize_cblocks_param( + conv_kernel_sizes, num_blocks, "conv_kernel_sizes" + ) + self.conv_strides = self._standarize_cblocks_param( + conv_strides, num_blocks, "conv_strides" + ) + + self.ff_type = ff_type + self.d_ff = d_ff + self.ff_kernel_size = ff_kernel_size + self.dropout_rate = dropout_rate + self.pos_enc_type = pos_enc_type + self.causal_pos_enc = causal_pos_enc + self.att_dropout_rate = att_dropout_rate + self.pos_dropout_rate = pos_dropout_rate + self.in_layer_type = in_layer_type + self.in_stride = in_stride + self.se_r = se_r + self.ff_macaron = ff_macaron + self.red_lnorms = red_lnorms + self.concat_after = concat_after + self.padding_idx = padding_idx + self.in_time_dim = in_time_dim + self.src_time_dim = src_time_dim + self.out_time_dim = out_time_dim + self.hid_act = hid_act + self.pos_kernel_size = pos_kernel_size + self.pos_num_groups = pos_num_groups + + self.conv_norm_layer = conv_norm_layer + norm_groups = None + if conv_norm_layer == "group-norm": + norm_groups = min(d_model // 2, 32) + self._conv_norm_layer = NLF.create(conv_norm_layer, norm_groups) + + self._make_in_layer() + + blocks = [] + for i in range(num_blocks): + blocks.append( + DBlock( + d_model, + self_att_type, + cross_att_type, + num_heads, + self.conv_repeats[i], + self.conv_kernel_sizes[i], + self.conv_strides[i], + ff_type, + d_ff, + ff_kernel_size, + hid_act=hid_act, + dropout_rate=dropout_rate, + att_context=att_context, + att_dropout_rate=att_dropout_rate, + pos_enc_type=pos_enc_type, + causal_pos_enc=causal_pos_enc, + conv_norm_layer=self._conv_norm_layer, + se_r=se_r, + ff_macaron=ff_macaron, + out_lnorm=self.red_lnorms, + concat_after=concat_after, + ) + ) + + self.blocks = nn.ModuleList(blocks) + if not self.red_lnorms: + self.norm_out = nn.LayerNorm(d_model) + + if with_output: + self.output_layer = nn.Linear(d_model, num_classes) + + @staticmethod + def 
_standarize_cblocks_param(p, num_blocks, p_name): + if isinstance(p, int): + p = [p] * num_blocks + elif isinstance(p, list): + if len(p) == 1: + p = p * num_blocks + + assert len(p) == num_blocks, "len(%s)(%d)!=%d" % ( + p_name, + len(p), + num_blocks, + ) + else: + raise TypeError("wrong type for param {}={}".format(p_name, p)) + + return p + + def _make_in_layer(self): + in_feats = self.in_feats + d_model = self.d_model + dropout_rate = self.dropout_rate + if self.pos_enc_type == "no": + pos_enc = NoPosEncoder() + elif self.pos_enc_type == "rel": + pos_enc = RelPosEncoder(d_model, self.pos_dropout_rate) + elif self.pos_enc_type == "abs": + pos_enc = PosEncoder(d_model, self.pos_dropout_rate) + elif self.pos_enc_type == "conv": + pos_enc = ConvPosEncoder( + d_model, self.pos_kernel_size, self.pos_num_groups, self.hid_act + ) + else: + raise Exception("wrong pos-enc-type={}".format(self.pos_enc_type)) + + hid_act = AF.create(self.hid_act) + + if self.in_layer_type == "linear": + self.in_layer = nn.Sequential( + nn.Linear(in_feats, d_model), + nn.LayerNorm(d_model), + nn.Dropout(dropout_rate), + hid_act, + pos_enc, + ) + elif self.in_layer_type == "conv2d-sub": + self.in_layer = Conv2dSubsampler( + in_feats, + d_model, + hid_act, + self.in_stride, + pos_enc, + time_dim=self.in_time_dim, + ) + elif self.in_layer_type == "conv1d-sub": + self.in_layer = Conv1dSubsampler( + in_feats, + d_model, + hid_act, + self.in_stride, + pos_enc, + time_dim=self.in_time_dim, + ) + elif self.in_layer_type == "embed": + self.in_layer = nn.Sequential( + nn.Embedding(in_feats, d_model, padding_idx=self.padding_idx), pos_enc + ) + elif isinstance(self.in_layer_type, nn.Module): + self.in_layer = nn.Sequential(self.in_layer_type, pos_enc) + elif self.in_layer_type is None: + self.in_layer = pos_enc + else: + raise ValueError(f"unknown in_layer_type: {self.in_layer_type}") + + def _make_masks( + self, + max_in_length, + x_lengths, + x_mask, + max_src_length, + x_src_lengths, + x_src_mask, + causal_mask, + ): + if x_mask is None: + if x_lengths is not None: + x_mask = seq_lengths_to_mask(x_lengths, max_in_length, time_dim=1) + if causal_mask: + x_mask = make_attn_mask_causal(x_mask) + + if x_src_mask is None and x_src_lengths is not None: + x_src_mask = seq_lengths_to_mask(x_src_lengths, max_src_length, time_dim=1) + + return x_mask, x_src_mask + + def _forward_input(self, x, x_mask): + if isinstance(self.in_layer, (Conv2dSubsampler, Conv1dSubsampler)): + x, x_mask = self.in_layer(x, x_mask) + else: + if self.in_time_dim != 1: + x = x.transpose(1, self.in_time_dim).contiguous() + x = self.in_layer(x) + + return x, x_mask + + def forward( + self, + x, + x_src, + x_lengths=None, + x_src_lengths=None, + x_mask=None, + x_src_mask=None, + causal_mask=True, + ): + """Forward pass function + + Args: + x: input tensor with size=(batch, time_out, num_feats) or (batch, time_out) + x_src: source tensor with size=(batch, time_in, num_feats) + x_lengths: lengths of the input sequences. + x_src_lengths: lengths of the source sequences + x_mask: mask to indicate valid time steps for x (batch, time_out). + It overwrites the mask of x_lengths. + x_src_mask: mask to indicate valid time steps for x_src (batch, time_in). + It overwrites the mask of x_src_lengths. 
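The masking helpers used by `_make_masks` combine a padding mask derived from sequence lengths with a causal (lower-triangular) mask for self-attention. An illustrative plain-PyTorch equivalent (the toolkit's `seq_lengths_to_mask`/`make_attn_mask_causal` are the real helpers; their exact signatures and mask layouts may differ):

```python
import torch

# Padding + causal masking for a decoder, illustrative shapes only.
lengths = torch.tensor([5, 3])                             # valid steps per sequence
T = 5
pad_mask = torch.arange(T)[None, :] < lengths[:, None]     # (batch, T), True = valid
causal = torch.tril(torch.ones(T, T, dtype=torch.bool))    # (T, T), key j <= query i
attn_mask = pad_mask[:, None, :] & causal[None, :, :]      # (batch, T, T)
```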
+ causal_mask: if True, the self-attention mask is made causal. + + Returns: + Tensor with output logits + Tensor with output lengths + """ + if self.src_time_dim != 1: + x_src = x_src.transpose(1, 2) + + max_in_length = x.size(self.in_time_dim) + max_src_length = x_src.size(1) + x_mask, x_src_mask = self._make_masks( + max_in_length, + x_lengths, + x_mask, + max_src_length, + x_src_lengths, + x_src_mask, + causal_mask, + ) + x, x_mask = self._forward_input(x, x_mask) + + if isinstance(x, tuple): + x, pos_emb = x + b_args = {"pos_emb": pos_emb} + else: + b_args = {} + + for i in range(len(self.blocks)): + x, x_mask = self.blocks[i]( + x, x_src, mask=x_mask, mask_src=x_src_mask, **b_args + ) + + if not self.red_lnorms: + x = self.norm_out(x) + + if self.with_output: + x = self.output_layer(x) + + if self.out_time_dim != 1: + x = x.transpose(1, self.out_time_dim) + + return x, x_lengths + + def forward_1step( + self, + x, + x_src, + x_lengths=None, + x_mask=None, + cache=None, + ): + """Forward pass function + + Args: + x: input tensor with size=(batch, time, num_feats) + x_src: source tensor with size=(batch, time_in, num_feats) + x_lengths: lengths of the input sequences. + x_mask: mask to indicate valid time steps for x (batch, time). + It overwrites the mask of x_lengths. + cache: list with the cached output of each block from previous steps. + + Returns: + Tensor with output logits for the last time step + List with the updated cache for each block + """ + max_in_length = x.size(self.in_time_dim) + if x_mask is None and x_lengths is not None: + x_mask = seq_lengths_to_mask(x_lengths, max_in_length, time_dim=1) + + if self.src_time_dim != 1: + x_src = x_src.transpose(1, 2) + + max_src_length = x_src.size(1) + x, x_mask = self._forward_input(x, x_mask) + + if isinstance(x, tuple): + x, pos_emb = x + b_args = {"pos_emb": pos_emb} + else: + b_args = {} + + if cache is None: + cache = [None] * len(self.blocks) + + next_cache = [] + for i in range(len(self.blocks)): + x, x_mask = self.blocks[i](x, x_src, mask=x_mask, cache=cache[i], **b_args) + next_cache.append(x) + + if not self.red_lnorms: + x = self.norm_out(x[:, -1]) + else: + x = x[:, -1] + + if self.with_output: + x = self.output_layer(x) + + return x, next_cache + + def get_config(self): + """Gets network config + Returns: + dictionary with config params + """ + config = { + "num_classes": self.num_classes, + "in_feats": self.in_feats, + "d_model": self.d_model, + "num_heads": self.num_heads, + "num_blocks": self.num_blocks, + "self_att_type": self.self_att_type, + "cross_att_type": self.cross_att_type, + "att_context": self.att_context, + "conv_repeats": self.conv_repeats, + "conv_kernel_sizes": self.conv_kernel_sizes, + "conv_strides": self.conv_strides, + "ff_type": self.ff_type, + "d_ff": self.d_ff, + "ff_kernel_size": self.ff_kernel_size, + "dropout_rate": self.dropout_rate, + "att_dropout_rate": self.att_dropout_rate, + "pos_dropout_rate": self.pos_dropout_rate, + "in_layer_type": self.in_layer_type, + "in_stride": self.in_stride, + "pos_enc_type": self.pos_enc_type, + "causal_pos_enc": self.causal_pos_enc, + "pos_kernel_size": self.pos_kernel_size, + "pos_num_groups": self.pos_num_groups, + "hid_act": self.hid_act, + "se_r": self.se_r, + "ff_macaron": self.ff_macaron, + "red_lnorms": self.red_lnorms, + "conv_norm_layer": self.conv_norm_layer, + "concat_after": self.concat_after, + "padding_idx": self.padding_idx, + "in_time_dim": self.in_time_dim, + "src_time_dim": self.src_time_dim, + "out_time_dim": self.out_time_dim, + "with_output": self.with_output, + } + + base_config = super().get_config() + return
dict(list(base_config.items()) + list(config.items())) + + def in_context(self): + return (self.att_context, self.att_context) + + def in_shape(self): + """Input shape for network + + Returns: + Tuple describing input shape + """ + if self.in_time_dim == 1: + return (None, None, self.in_feats) + else: + return (None, self.in_feats, None) + + def out_shape(self, in_shape=None): + """Infers the network output shape given the input shape + + Args: + in_shape: input shape tuple + + Returns: + Tuple with the output shape + """ + if in_shape is None: + out_t = None + batch_size = None + else: + assert len(in_shape) == 3 + batch_size = in_shape[0] + in_t = in_shape[self.in_time_dim] + if in_t is None: + out_t = None + else: + if isinstance(self.in_layer, Conv2dSubsampler): + # out_t = in_t//4 + out_t = ((in_t - 1) // 2 - 1) // 2 + else: + out_t = in_t + + if self.out_time_dim == 1: + return (batch_size, out_t, self.d_model) + else: + return (batch_size, self.d_model, out_t) + + @staticmethod + def filter_args(**kwargs): + """Filters arguments corresponding to ConformerDecoder + from args dictionary + + Args: + kwargs: args dictionary + + Returns: + args dictionary + """ + args = filter_func_args(ConformerDecoderV1.__init__, kwargs) + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + """Adds Conformer config parameters to argparser + + Args: + parser: argparse object + prefix: prefix string to add to the argument names + """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + if "in_feats" not in skip: + parser.add_argument( + "--in-feats", type=int, default=None, help=("input feature dimension") + ) + + parser.add_argument( + "--num-blocks", default=6, type=int, help=("number of transformer blocks") + ) + + parser.add_argument( + "--d-model", default=512, type=int, help=("decoder layer sizes") + ) + + parser.add_argument( + "--num-heads", + default=4, + type=int, + help=("number of heads in self-attention layers"), + ) + + parser.add_argument( + "--self-att-type", + default="scaled-dot-prod-v1", + choices=[ + "scaled-dot-prod-v1", + "local-scaled-dot-prod-v1", + "block-scaled-dot-prod-v1", + ], + help=("type of self-attention"), + ) + + parser.add_argument( + "--cross-att-type", + default="scaled-dot-prod-v1", + choices=[ + "scaled-dot-prod-v1", + "local-scaled-dot-prod-v1", + "block-scaled-dot-prod-v1", + ], + help=("type of cross-attention"), + ) + + parser.add_argument( + "--att-context", + default=25, + type=int, + help=("context size when using local attention"), + ) + + parser.add_argument( + "--conv-repeats", + default=[0], + type=int, + nargs="+", + help=("number of conv blocks in each conformer block"), + ) + + parser.add_argument( + "--conv-kernel-sizes", + default=[31], + nargs="+", + type=int, + help=("kernel sizes for the depth-wise convs of each conformer block"), + ) + + parser.add_argument( + "--conv-strides", + default=[1], + nargs="+", + type=int, + help=("strides for the depth-wise convs of each conformer block"), + ) + + parser.add_argument( + "--ff-type", + default="linear", + choices=["linear", "conv1dx2", "conv1dlinear"], + help=("type of feed forward layers in transformer block"), + ) + + parser.add_argument( + "--d-ff", + default=2048, + type=int, + help=("size of the middle layer in feed forward block"), + ) + + parser.add_argument( + "--ff-kernel-size", + default=3, + type=int, + help=("kernel size in convolutional feed forward block"), + ) + + parser.add_argument("--hid-act", default="swish", help="hidden activation") +
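The `add_class_args`/`filter_args` pair is designed to round-trip CLI flags into constructor kwargs via `filter_func_args`. A hedged usage sketch (assumes a recent jsonargparse where `Namespace.as_dict()` is available; `num_classes` is supplied by the caller since it is not a parser flag, and 1000 is an illustrative value):

```python
from jsonargparse import ArgumentParser

from hyperion.torch.narchs.conformer_decoder_v1 import ConformerDecoderV1

parser = ArgumentParser(prog="")
ConformerDecoderV1.add_class_args(parser)
args = parser.parse_args(["--num-blocks", "4", "--d-model", "256"])
# filter_args keeps only the kwargs accepted by __init__
dec_args = ConformerDecoderV1.filter_args(**args.as_dict())
decoder = ConformerDecoderV1(num_classes=1000, **dec_args)
```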
+ parser.add_argument( + "--pos-dropout-rate", + default=0.1, + type=float, + help="positional encoder dropout", + ) + parser.add_argument( + "--att-dropout-rate", default=0, type=float, help="self-att dropout" + ) + parser.add_argument( + "--dropout-rate", default=0.1, type=float, help="feed-forward layer dropout" + ) + + parser.add_argument( + "--in-layer-type", + default="linear", + choices=["embed", "linear", "conv2d-sub", "conv1d-sub"], + help=("type of input layer"), + ) + + parser.add_argument( + "--in-stride", + default=4, + type=int, + choices=[1, 2, 4], + help="stride of conformer input layer", + ) + + parser.add_argument( + "--pos-enc-type", + default="rel", + choices=["no", "rel", "abs", "conv"], + help=("type of positional encoder"), + ) + + parser.add_argument( + "--causal-pos-enc", + default=False, + action=ActionYesNo, + help="relative positional encodings are zero when attending to the future", + ) + parser.add_argument( + "--pos-kernel-size", + default=128, + type=int, + help="kernel size for conv positional encoder", + ) + parser.add_argument( + "--pos-num-groups", + default=16, + type=int, + help="number of conv groups for conv positional encoder", + ) + + parser.add_argument( + "--conv-norm-layer", + default=None, + choices=[ + "batch-norm", + "group-norm", + "instance-norm", + "instance-norm-affine", + "layer-norm", + ], + help="type of normalization layer for conv block in conformer", + ) + + parser.add_argument( + "--se-r", + default=None, + type=int, + help=("squeeze-excitation compression ratio"), + ) + + parser.add_argument( + "--ff-macaron", + default=True, + action=ActionYesNo, + help="use macaron style ff layers", + ) + + parser.add_argument( + "--red-lnorms", + default=True, + action=ActionYesNo, + help="use redundant Lnorm at conformer blocks' outputs", + ) + + parser.add_argument( + "--concat-after", + default=False, + action=ActionYesNo, + help="concatenate attention input and output instead of adding", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/conformer_encoder_v1.py b/hyperion/torch/narchs/conformer_encoder_v1.py index 69f9300c..72f50f82 100644 --- a/hyperion/torch/narchs/conformer_encoder_v1.py +++ b/hyperion/torch/narchs/conformer_encoder_v1.py @@ -3,16 +3,21 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +import logging import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from ..layers import ActivationFactory as AF -from ..layers import NormLayer1dFactory as NLF -from ..layers import PosEncoder, RelPosEncoder, NoPosEncoder +from ...utils.misc import filter_func_args from ..layer_blocks import ConformerEncoderBlockV1 as EBlock +from ..layer_blocks import TransformerConv1dSubsampler as Conv1dSubsampler from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler +from ..layers import ActivationFactory as AF +from ..layers import ConvPosEncoder, NoPosEncoder +from ..layers import NormLayer1dFactory as NLF +from ..layers import PosEncoder, RelPosEncoder +from ..utils import scale_seq_lengths, seq_lengths_to_mask from .net_arch import NetArch @@ -37,7 +42,7 @@ class ConformerEncoderV1(NetArch): d_model: encoder blocks feature dimension num_heads: number of heads num_blocks: number of self attn blocks - att_type: string in ['scaled-dot-prod-att-v1', 'local-scaled-dot-prod-att-v1'] + att_type: string in
['scaled-dot-prod-att-v1', 'local-scaled-dot-prod-att-v1', 'block-scaled-dot-prod-att-v1'] att_context: maximum context range for local attention conv_repeats: number of conv blocks in each conformer block conv_kernel_sizes: kernel size for conv blocks @@ -49,11 +54,10 @@ class ConformerEncoderV1(NetArch): pos_dropout_rate: dropout rate for positional encoder att_dropout_rate: dropout rate for attention block in_layer_type: input layer block type in ['linear','conv2d-sub', 'embed', None] - pos_enc_type: type of positional encoder ['no', 'abs', 'rel'] + pos_enc_type: type of positional encoder ['no', 'abs', 'rel', 'conv'] causal_pos_enc: if True, use causal positional encodings (when rel_pos_enc=True), it assumes that query q_i only attents to key k_j when j<=i - no_pos_enc: if True, it doesn't use positional encoder. hid_act: hidden activations in ff and input blocks conv_norm_layer: norm layer constructor or str for conv block, if None it uses BatchNorm1d @@ -68,8 +72,6 @@ class ConformerEncoderV1(NetArch): padding_idx: padding idx for embed layer in_time_dim: time dimension in the input Tensor out_time_dim: dimension that we want to be time in the output tensor - rel_pos_enc: if True, use relative postional encodings, absolute encodings otherwise. (deprecated) - red_lnorm: (deprecated) """ def __init__( @@ -90,21 +92,21 @@ def __init__( pos_dropout_rate=0.1, att_dropout_rate=0.0, in_layer_type="conv2d-sub", + in_stride=4, pos_enc_type="rel", causal_pos_enc=False, + pos_kernel_size=128, + pos_num_groups=16, hid_act="swish", conv_norm_layer=None, se_r=None, ff_macaron=True, - red_lnorms=False, + red_lnorms=True, concat_after=False, padding_idx=-1, - in_time_dim=-1, + in_time_dim=1, out_time_dim=1, - rel_pos_enc=True, - red_lnorm=False, ): - super().__init__() self.in_feats = in_feats self.d_model = d_model @@ -133,6 +135,7 @@ def __init__( self.att_dropout_rate = att_dropout_rate self.pos_dropout_rate = pos_dropout_rate self.in_layer_type = in_layer_type + self.in_stride = in_stride self.se_r = se_r self.ff_macaron = ff_macaron self.red_lnorms = red_lnorms @@ -141,6 +144,8 @@ def __init__( self.in_time_dim = in_time_dim self.out_time_dim = out_time_dim self.hid_act = hid_act + self.pos_kernel_size = pos_kernel_size + self.pos_num_groups = pos_num_groups self.conv_norm_layer = conv_norm_layer norm_groups = None @@ -200,7 +205,6 @@ def _standarize_cblocks_param(p, num_blocks, p_name): return p def _make_in_layer(self): - in_feats = self.in_feats d_model = self.d_model dropout_rate = self.dropout_rate @@ -210,12 +214,15 @@ def _make_in_layer(self): pos_enc = RelPosEncoder(d_model, self.pos_dropout_rate) elif self.pos_enc_type == "abs": pos_enc = PosEncoder(d_model, self.pos_dropout_rate) + elif self.pos_enc_type == "conv": + pos_enc = ConvPosEncoder( + d_model, self.pos_kernel_size, self.pos_num_groups, self.hid_act + ) else: - raise Exception("wrong pos-enc-type={}".format(self.pos_enc_type)) - - hid_act = AF.create(self.hid_act) + raise Exception(f"wrong pos-enc-type={self.pos_enc_type}") if self.in_layer_type == "linear": + hid_act = AF.create(self.hid_act) self.in_layer = nn.Sequential( nn.Linear(in_feats, d_model), nn.LayerNorm(d_model), @@ -225,37 +232,93 @@ def _make_in_layer(self): ) elif self.in_layer_type == "conv2d-sub": self.in_layer = Conv2dSubsampler( - in_feats, d_model, hid_act, pos_enc, time_dim=self.in_time_dim + in_feats, + d_model, + self.hid_act, + self.in_stride, + pos_enc, + time_dim=self.in_time_dim, + ) + elif self.in_layer_type == "conv1d-sub": + self.in_layer = 
Conv1dSubsampler( + in_feats, + d_model, + self.hid_act, + self.in_stride, + pos_enc, + time_dim=self.in_time_dim, ) elif self.in_layer_type == "embed": self.in_layer = nn.Sequential( nn.Embedding(in_feats, d_model, padding_idx=self.padding_idx), pos_enc ) elif isinstance(self.in_layer_type, nn.Module): - self.in_layer = nn.Sequential(in_layer_type, pos_enc) + self.in_layer = nn.Sequential(self.in_layer_type, pos_enc) elif self.in_layer_type is None: self.in_layer = pos_enc else: - raise ValueError("unknown in_layer_type: " + self.in_layer_type) + raise ValueError(f"unknown in_layer_type: {self.in_layer_type}") + + def _make_masks(self, max_in_length, x_lengths=None, x_mask=None): + if x_mask is None and x_lengths is not None: + x_mask = seq_lengths_to_mask(x_lengths, max_in_length, time_dim=1) + + return x_mask + + def _forward_input(self, x, x_mask): + if isinstance(self.in_layer, (Conv2dSubsampler, Conv1dSubsampler)): + x, x_mask = self.in_layer(x, x_mask) + else: + if self.in_time_dim != 1: + x = x.transpose(1, self.in_time_dim).contiguous() + x = self.in_layer(x) + + return x, x_mask - def forward(self, x, mask=None, target_shape=None): + def change_config( + self, override_dropouts, dropout_rate, pos_dropout_rate, att_dropout_rate + ): + if override_dropouts: + logging.info("changing conformer dropouts") + self.change_dropouts(dropout_rate, pos_dropout_rate, att_dropout_rate) + + def change_dropouts(self, dropout_rate, pos_dropout_rate, att_dropout_rate): + super().change_dropouts(dropout_rate) + from ..layers import PosEncoderBase + + for m in self.modules(): + if isinstance(m, PosEncoderBase): + if hasattr(m, "dropout_rate"): + m.dropout_rate = pos_dropout_rate + m.dropout.p = pos_dropout_rate + elif isinstance(m, EBlock): + m.change_attn_dropout(att_dropout_rate) + + self.dropout_rate = dropout_rate + self.pos_dropout_rate = pos_dropout_rate + self.att_dropout_rate = att_dropout_rate + + def forward( + self, x, x_lengths=None, x_mask=None, return_mask=False, target_shape=None + ): """Forward pass function Args: x: input tensor with size=(batch, time, num_feats) - mask: mask to indicate valid time steps for x (batch, time) + x_lengths: lengths of the input sequences. + x_mask: mask to indicate valid time steps for x (batch, time). + It overwrites the mask of x_lengths. 
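With the `conv2d-sub`/`conv1d-sub` input layers the time axis is subsampled, so the encoder's `forward` now rescales `x_lengths` with `scale_seq_lengths` (see the forward changes below). Assuming that helper does proportional floor scaling (its body is not shown in this hunk), the arithmetic for the stride-4 Conv2dSubsampler looks like:

```python
import torch

# Stride-4 Conv2dSubsampler output length (see out_shape), then
# proportional floor rescaling of the per-utterance lengths.
x_lengths = torch.tensor([200, 120])
max_in_length = 200
out_length = ((max_in_length - 1) // 2 - 1) // 2  # 49 frames after subsampling
x_lengths_out = torch.div(
    x_lengths * out_length, max_in_length, rounding_mode="floor"
)
print(x_lengths_out)  # tensor([49, 29])
```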
+ return_mask: if True, it also return the output mask + target_shape: unused Returns: Tensor with output features - Tensor with mask + Tensor with output lengths + Tensor with mask if return_mask is True """ - if isinstance(self.in_layer, Conv2dSubsampler): - x, mask = self.in_layer(x, mask) - else: - if self.in_time_dim != 1: - x = x.transpose(1, self.in_time_dim).contiguous() - x = self.in_layer(x) - + max_in_length = x.size(self.in_time_dim) + x_mask = self._make_masks(max_in_length, x_lengths, x_mask) + x, x_mask = self._forward_input(x, x_mask) if isinstance(x, tuple): x, pos_emb = x b_args = {"pos_emb": pos_emb} @@ -263,7 +326,7 @@ def forward(self, x, mask=None, target_shape=None): b_args = {} for i in range(len(self.blocks)): - x, mask = self.blocks[i](x, mask=mask, **b_args) + x, x_mask = self.blocks[i](x, mask=x_mask, **b_args) if not self.red_lnorms: x = self.norm_out(x) @@ -271,10 +334,13 @@ def forward(self, x, mask=None, target_shape=None): if self.out_time_dim != 1: x = x.transpose(1, self.out_time_dim) - if mask is None: - return x + if x_lengths is not None: + x_lengths = scale_seq_lengths(x_lengths, x.size(1), max_in_length) + + if return_mask: + return x, x_lengths, x_mask - return x, mask + return x, x_lengths def get_config(self): """Gets network config @@ -298,8 +364,11 @@ def get_config(self): "att_dropout_rate": self.att_dropout_rate, "pos_dropout_rate": self.pos_dropout_rate, "in_layer_type": self.in_layer_type, + "in_stride": self.in_stride, "pos_enc_type": self.pos_enc_type, "causal_pos_enc": self.causal_pos_enc, + "pos_kernel_size": self.pos_kernel_size, + "pos_num_groups": self.pos_num_groups, "hid_act": self.hid_act, "se_r": self.se_r, "ff_macaron": self.ff_macaron, @@ -360,7 +429,7 @@ def out_shape(self, in_shape=None): @staticmethod def filter_args(**kwargs): - """Filters arguments correspondin to TransformerXVector + """Filters arguments correspondin to Conformer Encoder from args dictionary Args: @@ -369,41 +438,11 @@ def filter_args(**kwargs): Returns: args dictionary """ - - if "no_ff_macaron" in kwargs: - kwargs["ff_macaron"] = not kwargs["no_ff_macaron"] - - valid_args = ( - "num_blocks", - "in_feats", - "d_model", - "num_heads", - "att_type", - "att_context", - "conv_repeats", - "conv_kernel_sizes", - "conv_strides", - "ff_type", - "d_ff", - "ff_kernel_size", - "dropout_rate", - "pos_dropout_rate", - "att_dropout_rate", - "in_layer_type", - "hid_act", - "pos_enc_type", - "causal_pos_enc", - "conv_norm_layer", - "se_r", - "ff_macaron", - "red_lnorms", - "concat_after", - ) - - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args = filter_func_args(ConformerEncoderV1.__init__, kwargs) + return args @staticmethod - def add_class_args(parser, prefix=None, in_feats=False): + def add_class_args(parser, prefix=None, skip=set()): """Adds Conformer config parameters to argparser Args: @@ -414,7 +453,7 @@ def add_class_args(parser, prefix=None, in_feats=False): outer_parser = parser parser = ArgumentParser(prog="") - if in_feats: + if "in_feats" not in skip: parser.add_argument( "--in-feats", type=int, default=80, help=("input feature dimension") ) @@ -437,7 +476,11 @@ def add_class_args(parser, prefix=None, in_feats=False): parser.add_argument( "--att-type", default="scaled-dot-prod-v1", - choices=["scaled-dot-prod-v1", "local-scaled-dot-prod-v1"], + choices=[ + "scaled-dot-prod-v1", + "local-scaled-dot-prod-v1", + "block-scaled-dot-prod-v1", + ], help=("type of self-attention"), ) @@ -493,10 +536,7 @@ def add_class_args(parser, prefix=None, 
in_feats=False): help=("kernel size in convolutional feed forward block"), ) - try: - parser.add_argument("--hid-act", default="swish", help="hidden activation") - except: - pass + parser.add_argument("--hid-act", default="swish", help="hidden activation") parser.add_argument( "--pos-dropout-rate", @@ -514,41 +554,56 @@ def add_class_args(parser, prefix=None, in_feats=False): parser.add_argument( "--in-layer-type", default="linear", - choices=["linear", "conv2d-sub"], + choices=["linear", "conv2d-sub", "conv1d-sub"], help=("type of input layer"), ) - # parser.add_argument('--abs-pos-enc', default=False, action='store_true', - # help='use absolute positional encoder') + parser.add_argument( + "--in-stride", + default=4, + type=int, + choices=[1, 2, 4], + help="stride of conformer input layer", + ) + parser.add_argument( "--pos-enc-type", default="rel", - choices=["no", "rel", "abs"], + choices=["no", "rel", "abs", "conv"], help=("type of positional encoder"), ) parser.add_argument( "--causal-pos-enc", default=False, - action="store_true", + action=ActionYesNo, help="relative positional encodings are zero when attending to the future", ) + parser.add_argument( + "--pos-kernel-size", + default=128, + type=int, + help="kernel size for conv positional encoder", + ) + parser.add_argument( + "--pos-num-groups", + default=16, + type=int, + help="number of conv groups for conv positional encoder", + ) - try: - parser.add_argument( - "--conv-norm-layer", - default=None, - choices=[ - "batch-norm", - "group-norm", - "instance-norm", - "instance-norm-affine", - "layer-norm", - ], - help="type of normalization layer for conv block in conformer", - ) - except: - pass + parser.add_argument( + "--conv-norm-layer", + default=None, + choices=[ + "batch-norm", + "group-norm", + "instance-norm", + "instance-norm-affine", + "layer-norm", + ], + help="type of normalization layer for conv block in conformer", + ) parser.add_argument( "--se-r", @@ -558,30 +613,85 @@ def add_class_args(parser, prefix=None, in_feats=False): ) parser.add_argument( - "--no-ff-macaron", - default=False, - action="store_true", + "--ff-macaron", + default=True, + action=ActionYesNo, help="use macaron style ff layers", ) parser.add_argument( "--red-lnorms", - default=False, - action="store_true", + default=True, + action=ActionYesNo, help="use redundant Lnorm at conformer blocks' outputs", ) parser.add_argument( "--concat-after", default=False, - action="store_true", + action=ActionYesNo, help="concatenate attention input and output instead of adding", ) - # parser.add_argument('--in-norm', default=False, action='store_true', - # help='batch normalization at the input') if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='conformer encoder options') - add_argparse_args = add_class_args + @staticmethod + def filter_finetune_args(**kwargs): + valid_args = ( + "override_dropouts", + "dropout_rate", + "pos_dropout_rate", + "att_dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None, skip=set([])): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + try: + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model."
+ ), + ) + except: + pass + + try: + parser.add_argument( + "--dropout-rate", default=0, type=float, help="dropout probability" + ) + except: + pass + + try: + parser.add_argument( + "--pos-dropout-rate", + default=0, + type=float, + help="positional encoder dropout probability", + ) + except: + pass + + try: + parser.add_argument( + "--att-dropout-rate", + default=0, + type=float, + help="attention dropout probability", + ) + except: + pass + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/dc1d_decoder.py b/hyperion/torch/narchs/dc1d_decoder.py index c35d7720..172a3d70 100644 --- a/hyperion/torch/narchs/dc1d_decoder.py +++ b/hyperion/torch/narchs/dc1d_decoder.py @@ -4,15 +4,17 @@ """ import math -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser import torch import torch.nn as nn +from ..layer_blocks import DC1dDecBlock from ..layers import ActivationFactory as AF +from ..layers import ICNR1d from ..layers import NormLayer1dFactory as NLF -from ..layer_blocks import DC1dDecBlock -from ..layers import SubPixelConv1d, ICNR1d +from ..layers import SubPixelConv1d from .net_arch import NetArch @@ -29,7 +31,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, use_norm=True, @@ -66,7 +68,7 @@ def __init__( self.norm_layer = norm_layer norm_groups = None if norm_layer == "group-norm": - norm_groups = min(np.min(self.conv_channels) // 2, 32) + norm_groups = min(min(self.conv_channels) // 2, 32) self._norm_layer = NLF.create(norm_layer, norm_groups) # stem block @@ -196,7 +198,7 @@ def _standarize_convblocks_param(p, num_blocks, p_name): return p def _compute_out_size(self, in_size): - out_size = in_size * in_stride + out_size = in_size * self.in_stride for stride in self.conv_strides: out_size *= stride @@ -279,13 +281,13 @@ def get_config(self): @staticmethod def filter_args(**kwargs): - if "wo_norm" in kwargs: - kwargs["use_norm"] = not kwargs["wo_norm"] - del kwargs["wo_norm"] + # if "wo_norm" in kwargs: + # kwargs["use_norm"] = not kwargs["wo_norm"] + # del kwargs["wo_norm"] - if "norm_after" in kwargs: - kwargs["norm_before"] = not kwargs["norm_after"] - del kwargs["norm_after"] + # if "norm_after" in kwargs: + # kwargs["norm_before"] = not kwargs["norm_after"] + # del kwargs["norm_after"] valid_args = ( "in_channels", @@ -387,7 +389,7 @@ def add_class_args(parser, prefix=None, head_channels=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass @@ -418,18 +420,31 @@ def add_class_args(parser, prefix=None, head_channels=False): except: pass + # parser.add_argument( + # "--wo-norm", + # default=False, + # action="store_true", + # help="without batch normalization", + # ) + + # parser.add_argument( + # "--norm-after", + # default=False, + # action="store_true", + # help="batch normalizaton after activation", + # ) parser.add_argument( - "--wo-norm", - default=False, - action="store_true", + "--use-norm", + default=True, + action=ActionYesNo, help="without batch normalization", ) parser.add_argument( - "--norm-after", - default=False, - action="store_true", - help="batch normalizaton after activation", + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalizaton before activation", ) if prefix is not 
None: diff --git a/hyperion/torch/narchs/dc1d_encoder.py b/hyperion/torch/narchs/dc1d_encoder.py index 091629f4..6cf7f4ca 100644 --- a/hyperion/torch/narchs/dc1d_encoder.py +++ b/hyperion/torch/narchs/dc1d_encoder.py @@ -2,15 +2,16 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser import math +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn +from ..layer_blocks.dc1d_blocks import DC1dEncBlock from ..layers import ActivationFactory as AF from ..layers import NormLayer1dFactory as NLF -from ..layer_blocks.dc1d_blocks import DC1dEncBlock from .net_arch import NetArch @@ -27,7 +28,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, use_norm=True, @@ -64,7 +65,7 @@ def __init__( self.norm_layer = norm_layer norm_groups = None if norm_layer == "group-norm": - norm_groups = min(np.min(self.conv_channels) // 2, 32) + norm_groups = min(min(self.conv_channels) // 2, 32) self._norm_layer = NLF.create(norm_layer, norm_groups) # stem block @@ -209,7 +210,7 @@ def out_shape(self, in_shape=None): else: T = self._compute_out_size(in_shape[2]) - return (in_shape[0], out_chanels, T) + return (in_shape[0], out_channels, T) def forward(self, x): @@ -252,13 +253,13 @@ def get_config(self): @staticmethod def filter_args(**kwargs): - if "wo_norm" in kwargs: - kwargs["use_norm"] = not kwargs["wo_norm"] - del kwargs["wo_norm"] + # if "wo_norm" in kwargs: + # kwargs["use_norm"] = not kwargs["wo_norm"] + # del kwargs["wo_norm"] - if "norm_after" in kwargs: - kwargs["norm_before"] = not kwargs["norm_after"] - del kwargs["norm_after"] + # if "norm_after" in kwargs: + # kwargs["norm_before"] = not kwargs["norm_after"] + # del kwargs["norm_after"] valid_args = ( "in_feats", @@ -361,7 +362,7 @@ def add_class_args(parser, prefix=None, head_channels=False, in_feats=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass @@ -392,18 +393,32 @@ def add_class_args(parser, prefix=None, head_channels=False, in_feats=False): except: pass + # parser.add_argument( + # "--wo-norm", + # default=False, + # action="store_true", + # help="without batch normalization", + # ) + + # parser.add_argument( + # "--norm-after", + # default=False, + # action="store_true", + # help="batch normalizaton after activation", + # ) + parser.add_argument( - "--wo-norm", - default=False, - action="store_true", + "--use-norm", + default=True, + action=ActionYesNo, help="without batch normalization", ) parser.add_argument( - "--norm-after", - default=False, - action="store_true", - help="batch normalizaton after activation", + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalizaton before activation", ) if prefix is not None: diff --git a/hyperion/torch/narchs/dc2d_decoder.py b/hyperion/torch/narchs/dc2d_decoder.py index 6ad7c4c9..68679e0b 100644 --- a/hyperion/torch/narchs/dc2d_decoder.py +++ b/hyperion/torch/narchs/dc2d_decoder.py @@ -3,16 +3,18 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser import math +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn +from ..layer_blocks import DC2dDecBlock from ..layers 
import ActivationFactory as AF +from ..layers import ICNR2d from ..layers import NormLayer2dFactory as NLF -from ..layer_blocks import DC2dDecBlock -from ..layers import SubPixelConv2d, ICNR2d +from ..layers import SubPixelConv2d from .net_arch import NetArch @@ -29,7 +31,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, use_norm=True, @@ -66,7 +68,7 @@ def __init__( self.norm_layer = norm_layer norm_groups = None if norm_layer == "group-norm": - norm_groups = min(np.min(self.conv_channels) // 2, 32) + norm_groups = min(min(self.conv_channels) // 2, 32) self._norm_layer = NLF.create(norm_layer, norm_groups) # stem block @@ -210,7 +212,7 @@ def _standarize_convblocks_param(p, num_blocks, p_name): return p def _compute_out_size(self, in_size): - out_size = in_size * in_stride + out_size = in_size * self.in_stride for stride in self.conv_strides: out_size *= stride @@ -243,7 +245,7 @@ def out_shape(self, in_shape=None): else: W = self._compute_out_size(in_shape[3]) - return (in_shape[0], out_chanels, H, W) + return (in_shape[0], out_channels, H, W) def _match_shape(self, x, target_shape): x_dim = x.dim() @@ -300,13 +302,13 @@ def get_config(self): @staticmethod def filter_args(**kwargs): - if "wo_norm" in kwargs: - kwargs["use_norm"] = not kwargs["wo_norm"] - del kwargs["wo_norm"] + # if "wo_norm" in kwargs: + # kwargs["use_norm"] = not kwargs["wo_norm"] + # del kwargs["wo_norm"] - if "norm_after" in kwargs: - kwargs["norm_before"] = not kwargs["norm_after"] - del kwargs["norm_after"] + # if "norm_after" in kwargs: + # kwargs["norm_before"] = not kwargs["norm_after"] + # del kwargs["norm_after"] valid_args = ( "in_channels", @@ -408,7 +410,7 @@ def add_class_args(parser, prefix=None, head_channels=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass @@ -439,18 +441,31 @@ def add_class_args(parser, prefix=None, head_channels=False): except: pass + # parser.add_argument( + # "--wo-norm", + # default=False, + # action="store_true", + # help="without batch normalization", + # ) + + # parser.add_argument( + # "--norm-after", + # default=False, + # action="store_true", + # help="batch normalizaton after activation", + # ) parser.add_argument( - "--wo-norm", - default=False, - action="store_true", + "--use-norm", + default=True, + action=ActionYesNo, help="without batch normalization", ) parser.add_argument( - "--norm-after", - default=False, - action="store_true", - help="batch normalizaton after activation", + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalizaton before activation", ) if prefix is not None: diff --git a/hyperion/torch/narchs/dc2d_encoder.py b/hyperion/torch/narchs/dc2d_encoder.py index c6857ff6..bc7e4b33 100644 --- a/hyperion/torch/narchs/dc2d_encoder.py +++ b/hyperion/torch/narchs/dc2d_encoder.py @@ -4,14 +4,15 @@ """ import math -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser import torch import torch.nn as nn +from ..layer_blocks import DC2dEncBlock from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF -from ..layer_blocks import DC2dEncBlock from .net_arch import NetArch @@ -28,7 +29,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, 
use_norm=True, @@ -65,7 +66,7 @@ def __init__( self.norm_layer = norm_layer norm_groups = None if norm_layer == "group-norm": - norm_groups = min(np.min(self.conv_channels) // 2, 32) + norm_groups = min(min(self.conv_channels) // 2, 32) self._norm_layer = NLF.create(norm_layer, norm_groups) # stem block @@ -215,7 +216,7 @@ def out_shape(self, in_shape=None): else: W = self._compute_out_size(in_shape[3]) - return (in_shape[0], out_chanels, H, W) + return (in_shape[0], out_channels, H, W) def forward(self, x): @@ -258,13 +259,13 @@ def get_config(self): @staticmethod def filter_args(**kwargs): - if "wo_norm" in kwargs: - kwargs["use_norm"] = not kwargs["wo_norm"] - del kwargs["wo_norm"] + # if "wo_norm" in kwargs: + # kwargs["use_norm"] = not kwargs["wo_norm"] + # del kwargs["wo_norm"] - if "norm_after" in kwargs: - kwargs["norm_before"] = not kwargs["norm_after"] - del kwargs["norm_after"] + # if "norm_after" in kwargs: + # kwargs["norm_before"] = not kwargs["norm_after"] + # del kwargs["norm_after"] valid_args = ( "in_channels", @@ -366,7 +367,7 @@ def add_class_args(parser, prefix=None, head_channels=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass @@ -397,18 +398,31 @@ def add_class_args(parser, prefix=None, head_channels=False): except: pass + # parser.add_argument( + # "--wo-norm", + # default=False, + # action="store_true", + # help="without batch normalization", + # ) + + # parser.add_argument( + # "--norm-after", + # default=False, + # action="store_true", + # help="batch normalizaton after activation", + # ) parser.add_argument( - "--wo-norm", - default=False, - action="store_true", + "--use-norm", + default=True, + action=ActionYesNo, help="without batch normalization", ) parser.add_argument( - "--norm-after", - default=False, - action="store_true", - help="batch normalizaton after activation", + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalizaton before activation", ) if prefix is not None: diff --git a/hyperion/torch/narchs/dino_head.py b/hyperion/torch/narchs/dino_head.py new file mode 100644 index 00000000..9f05aa7f --- /dev/null +++ b/hyperion/torch/narchs/dino_head.py @@ -0,0 +1,290 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from typing import Optional + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...utils.misc import filter_func_args +from ..layer_blocks import FCBlock +from ..layers import ActivationFactory as AF +from ..layers import CosLossOutput +from ..layers import NormLayer1dFactory as NLF +from .net_arch import NetArch + + +class DINOHead(NetArch): + """Classification Head for DINO x-vector style networks + + Attributes: + in_feats: input features + num_classes: number of output classes + hid_feats: dimension of hidding layer + bottleneck_feats: dimension of bottleneck layer before output + num_hid_layers: number of hidden layers + hid_act: str or dict hidden activation type in ['relu', 'relu6', 'swish', ... 
] + output_type: type of output layer that will be used with the x-vector in ['softmax', 'cos-softmax'], + corresponding to standard cross-entropy, cosine scoring + norm_layer: norm_layer object or str indicating type norm layer, if None it uses BatchNorm1d + use_norm: if True it uses layer/batch-normalization + norm_before: if True, layer-norm is before the activation function + use_in_norm: put batchnorm at the input + """ + + def __init__( + self, + in_feats, + num_classes, + hid_feats=2048, + bottleneck_feats=256, + num_hid_layers=3, + hid_act="gelu", + output_type="softmax", + norm_layer=None, + use_norm=False, + norm_before=True, + dropout_rate=0, + use_in_norm=False, + ): + super().__init__() + assert num_hid_layers >= 1, "num_layers (%d < 1)" % num_hid_layers + + self.num_hid_layers = num_hid_layers + self.in_feats = in_feats + self.hid_feats = hid_feats + self.bottleneck_feats = bottleneck_feats + self.num_classes = num_classes + self.hid_act = hid_act + self.norm_layer = norm_layer + self.use_in_norm = use_in_norm + + if use_norm: + norm_groups = None + if norm_layer == "group-norm": + norm_groups = min(hid_feats // 8, 32) + self._norm_layer = NLF.create(norm_layer, norm_groups) + else: + self._norm_layer = None + + self.use_norm = use_norm + self.norm_before = norm_before + + self.dropout_rate = dropout_rate + self.output_type = output_type + if use_in_norm: + assert not self.norm_before + self.in_norm = self._norm_layer(in_feats) + + if num_hid_layers == 1: + self.hid_layers = nn.Linear(in_feats, bottleneck_feats) + else: + use_bias = False if use_norm and norm_before else True + layers = [nn.Linear(in_feats, hid_feats, bias=use_bias)] + if use_norm and norm_before: + layers.append(self._norm_layer(hid_feats)) + layers.append(AF.create(hid_act)) + if use_norm and not norm_before: + layers.append(self._norm_layer(hid_feats)) + if self.dropout_rate > 0: + layers.append(nn.Dropout(self.dropout_rate)) + + for _ in range(num_hid_layers - 2): + layers.append(nn.Linear(hid_feats, hid_feats, bias=use_bias)) + if use_norm and norm_before: + layers.append(self._norm_layer(hid_feats)) + layers.append(AF.create(hid_act)) + if use_norm and not norm_before: + layers.append(self._norm_layer(hid_feats)) + if self.dropout_rate > 0: + layers.append(nn.Dropout(self.dropout_rate)) + + layers.append(nn.Linear(hid_feats, bottleneck_feats)) + self.hid_layers = nn.Sequential(*layers) + + self.apply(self._init_weights) + if output_type == "softmax": + output = nn.Linear(bottleneck_feats, num_classes, bias=False) + with torch.no_grad(): + self.output = nn.utils.weight_norm(output) + self.output.weight_g.data.fill_(1) + self.output.weight_g.requires_grad = False + elif output_type == "cos-softmax": + self.output = CosLossOutput( + bottleneck_feats, + num_classes, + cos_scale=1, + margin=0, + margin_warmup_epochs=0, + intertop_k=0, + intertop_margin=0, + ) + else: + raise ValueError(f"wrong loss_type={output_type}") + + # def before_cloning(self): + # if self.output_type == "cos-softmax": + # return None, None + + # torch.nn.utils.remove_weight_norm(self.output) + # return None, None + # cloned_output = self._clone_output() + # output = self.output + # self.output = None + # return output, cloned_output + + # def after_cloning(self, output: nn.Module): + # if self.output_type == "cos-softmax": + # return + + # self.output = nn.utils.weight_norm(self.output) + # self.output.weight_g.data.fill_(1) + # self.output.weight_g.requires_grad = False + + # def _clone_output(self): + # output = nn.utils.weight_norm( + #
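For the softmax branch above: freezing `weight_g` at 1 after `weight_norm` leaves only the direction of each class prototype trainable, so logits behave like unscaled cosine scores against L2-normalized inputs. A stand-alone sketch of the same construction (`torch.nn.utils.weight_norm` is deprecated in favor of parametrizations in newer PyTorch, but this mirrors the code above):

```python
import torch
import torch.nn as nn

# Weight-normalized output with the magnitude frozen at 1, as in the
# softmax branch of DINOHead.__init__.
out = nn.utils.weight_norm(nn.Linear(256, 1000, bias=False))
out.weight_g.data.fill_(1)
out.weight_g.requires_grad = False

x = nn.functional.normalize(torch.randn(8, 256), dim=-1, p=2)
logits = out(x)  # cosine-similarity-like class scores
```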
+    #         nn.Linear(self.bottleneck_feats, self.num_classes, bias=False)
+    #     )
+    #     output.weight_g.data.fill_(1)
+    #     output.weight_v.data = self.output_v.data.detach()
+    #     output.weight_g.requires_grad = False
+    #     return output
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            nn.init.trunc_normal_(m.weight, std=0.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+
+    def forward(self, x: torch.Tensor, y: Optional[torch.Tensor] = None):
+        if self.use_in_norm:
+            x = self.in_norm(x)
+        # assert not torch.any(
+        #     torch.isnan(x)
+        # ), f"x is nan {x.size()} {torch.sum(torch.isnan(x))}"
+        x = self.hid_layers(x)
+        # assert not torch.any(
+        #     torch.isnan(x)
+        # ), f"x_hid is nan {x.size()} {torch.sum(torch.isnan(x))}"
+        x = nn.functional.normalize(x, dim=-1, p=2)
+        # assert not torch.any(
+        #     torch.isnan(x)
+        # ), f"x_l2 is nan {x.size()} {torch.sum(torch.isnan(x))}"
+        x = self.output(x)
+        # assert not torch.any(
+        #     torch.isnan(x)
+        # ), f"out is nan {x.size()} {torch.sum(torch.isnan(x))}"
+        return x
+
+    def get_config(self):
+        config = {
+            "in_feats": self.in_feats,
+            "num_classes": self.num_classes,
+            "hid_feats": self.hid_feats,
+            "bottleneck_feats": self.bottleneck_feats,
+            "num_hid_layers": self.num_hid_layers,
+            "hid_act": self.hid_act,
+            "output_type": self.output_type,
+            "norm_layer": self.norm_layer,
+            "use_norm": self.use_norm,
+            "norm_before": self.norm_before,
+            "dropout_rate": self.dropout_rate,
+            "use_in_norm": self.use_in_norm,
+        }
+
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    @staticmethod
+    def filter_args(**kwargs):
+        # if "wo_norm" in kwargs:
+        #     kwargs["use_norm"] = not kwargs["wo_norm"]
+        #     del kwargs["wo_norm"]
+
+        # if "norm_after" in kwargs:
+        #     kwargs["norm_before"] = not kwargs["norm_after"]
+        #     del kwargs["norm_after"]
+
+        args = filter_func_args(DINOHead.__init__, kwargs)
+        return args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        parser.add_argument(
+            "--bottleneck-feats",
+            default=256,
+            type=int,
+            help="bottleneck dimension before output layer",
+        )
+
+        parser.add_argument(
+            "--num-hid-layers",
+            default=3,
+            type=int,
+            help="number of hidden layers in the classif head",
+        )
+
+        try:
+            parser.add_argument("--hid-act", default="gelu", help="hidden activation")
+        except:
+            pass
+
+        parser.add_argument(
+            "--output-type",
+            default="softmax",
+            choices=["softmax", "cos-softmax"],
+            help="output layer type: softmax, cos-softmax",
+        )
+
+        try:
+            parser.add_argument(
+                "--norm-layer",
+                default=None,
+                choices=[
+                    "batch-norm",
+                    "group-norm",
+                    "instance-norm",
+                    "instance-norm-affine",
+                    "layer-norm",
+                ],
+                help="type of normalization layer for all components of x-vector network",
+            )
+        except:
+            pass
+
+        parser.add_argument(
+            "--use-norm",
+            default=True,
+            action=ActionYesNo,
+            help="use batch normalization",
+        )
+
+        parser.add_argument(
+            "--norm-before",
+            default=True,
+            action=ActionYesNo,
+            help="batch normalization before activation",
+        )
+
+        parser.add_argument(
+            "--use-in-norm",
+            default=False,
+            action=ActionYesNo,
+            help="batch normalization at the classification head input",
+        )
+
+        try:
+            parser.add_argument("--dropout-rate", default=0, type=float, help="dropout")
+        except:
+            pass
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
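
The new DINOHead runs the hidden MLP, L2-normalizes the bottleneck output, and scores it against a weight-normalized linear layer whose `weight_g` is frozen at 1. A minimal usage sketch (sizes are illustrative assumptions, not from the patch):

```python
import torch

from hyperion.torch.narchs.dino_head import DINOHead

# assumed sizes: 256-dim x-vector embeddings, 65536 DINO prototypes
head = DINOHead(in_feats=256, num_classes=65536)
x = torch.randn(32, 256)
logits = head(x)  # hidden MLP -> L2-normalized bottleneck -> weight-norm linear
print(logits.shape)  # torch.Size([32, 65536])
```

diff --git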
a/hyperion/torch/narchs/efficient_net.py b/hyperion/torch/narchs/efficient_net.py index ab60b8e2..b9efdcef 100644 --- a/hyperion/torch/narchs/efficient_net.py +++ b/hyperion/torch/narchs/efficient_net.py @@ -4,15 +4,16 @@ """ import math -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser import torch import torch.nn as nn -from torch.nn import Linear, Dropout +from torch.nn import Dropout, Linear +from ..layer_blocks import MBConvBlock, MBConvInOutBlock from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF -from ..layer_blocks import MBConvBlock, MBConvInOutBlock from .net_arch import NetArch @@ -132,7 +133,7 @@ def __init__( # set depth/width scales from net name self.cfg_width_scale = width_scale self.cfg_depth_scale = depth_scale - if width_scale is None or dept_scale is None: + if width_scale is None or depth_scale is None: width_scale, depth_scale = self.efficientnet_params(effnet_type)[:2] self.width_scale = width_scale self.depth_scale = depth_scale @@ -395,6 +396,17 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) + def change_dropouts(self, dropout_rate, drop_connect_rate): + super().change_dropouts(dropout_rate) + from ..layers import DropConnect2d + + for module in self.modules(): + if isinstance(module, DropConnect2d): + module.p *= drop_connect_rate / self.drop_connect_rate + + self.drop_connect_rate = drop_connect_rate + self.dropout_rate = dropout_rate + @staticmethod def filter_args(**kwargs): @@ -424,7 +436,6 @@ def filter_args(**kwargs): ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - return args @staticmethod @@ -590,6 +601,53 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='efficientnet options') add_argparse_args = add_class_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + try: + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model." 
+ ), + ) + except: + pass + + parser.add_argument( + "--drop-connect-rate", + default=0.2, + type=float, + help="layer drop probability", + ) + + try: + parser.add_argument( + "--dropout-rate", default=0, type=float, help="dropout probability" + ) + except: + pass + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + + valid_args = ( + "out_units", + "override_dropouts", + "drop_connect_rate", + "dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args diff --git a/hyperion/torch/narchs/etdnn.py b/hyperion/torch/narchs/etdnn.py index ebc14534..a73439b7 100644 --- a/hyperion/torch/narchs/etdnn.py +++ b/hyperion/torch/narchs/etdnn.py @@ -9,9 +9,9 @@ import torch.nn as nn from torch.nn import Conv1d, Linear +from ..layer_blocks import ETDNNBlock from ..layers import ActivationFactory as AF from ..layers import NormLayer1dFactory as NLF -from ..layer_blocks import ETDNNBlock from .net_arch import NetArch diff --git a/hyperion/torch/narchs/fcnet.py b/hyperion/torch/narchs/fcnet.py index e0c8afd5..a47f304e 100644 --- a/hyperion/torch/narchs/fcnet.py +++ b/hyperion/torch/narchs/fcnet.py @@ -4,12 +4,12 @@ """ import torch.nn as nn -from torch.nn import Linear, BatchNorm1d, Dropout +from torch.nn import BatchNorm1d, Dropout, Linear +from ..layer_blocks import FCBlock from ..layers import ActivationFactory as AF from ..layers import NormLayer1dFactory as NLF from .net_arch import NetArch -from ..layer_blocks import FCBlock class FCNetV1(NetArch): @@ -125,7 +125,7 @@ def __init__( in_units, hid_units, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, dropout_rate=0, norm_layer=None, diff --git a/hyperion/torch/narchs/feat_fuser_mvn.py b/hyperion/torch/narchs/feat_fuser_mvn.py new file mode 100644 index 00000000..0656e279 --- /dev/null +++ b/hyperion/torch/narchs/feat_fuser_mvn.py @@ -0,0 +1,111 @@ +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from typing import Any, Dict, Optional + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ..layers import FeatFuserFactory as FFF +from ..layers import MeanVarianceNorm as MVN +from ..layers import SpecAugment +from .net_arch import NetArch + + +class FeatFuserMVN(NetArch): + """FeatureFuser for Wav2Vec style hidden features + ST-MVN + Optional SpecAugment + """ + + def __init__( + self, + feat_fuser: Dict[str, Any], + mvn: Optional[Dict[str, Any]] = None, + spec_augment: Optional[Dict[str, Any]] = None, + trans: bool = False, + aug_after_mvn: bool = False, + ): + super().__init__() + + feat_fuser = FFF.filter_args(**feat_fuser) + self.feat_fuser_cfg = feat_fuser + self.feat_fuser = FFF.create(**feat_fuser) + + self.mvn = None + self.mvn_cfg = None + if mvn is not None: + mvn = MVN.filter_args(**mvn) + self.mvn_cfg = mvn + if ( + ("norm_mean" in mvn) + and mvn["norm_mean"] + or ("norm_var" in mvn) + and mvn["norm_var"] + ): + self.mvn = MVN(**mvn) + + self.spec_augment = None + self.spec_augment_cfg = None + if spec_augment is not None: + spec_augment = SpecAugment.filter_args(**spec_augment) + self.spec_augment_cfg = spec_augment + self.spec_augment = SpecAugment(**spec_augment) + + self.trans = trans + self.aug_after_mvn = aug_after_mvn + + @property + def fuser_type(self): + return 
self.feat_fuser_cfg["fuser_type"]
+
+    def forward(self, feats, feats_lengths=None):
+        feats = self.feat_fuser(feats)
+        if self.spec_augment is not None and not self.aug_after_mvn:
+            feats = self.spec_augment(feats, feats_lengths)
+
+        if self.mvn is not None:
+            feats = self.mvn(feats, feats_lengths)
+
+        if self.spec_augment is not None and self.aug_after_mvn:
+            feats = self.spec_augment(feats, feats_lengths)
+
+        if self.trans:
+            feats = feats.transpose(1, 2).contiguous()
+
+        return feats, feats_lengths
+
+    def get_config(self):
+        config = {
+            "feat_fuser": self.feat_fuser_cfg,
+            "mvn": self.mvn_cfg,
+            "spec_augment": self.spec_augment_cfg,
+            "trans": self.trans,
+            "aug_after_mvn": self.aug_after_mvn,
+        }
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    @staticmethod
+    def filter_args(**kwargs):
+        valid_args = ("feat_fuser", "mvn", "spec_augment", "trans", "aug_after_mvn")
+        return dict((k, kwargs[k]) for k in valid_args if k in kwargs)
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        FFF.add_class_args(parser, prefix="feat_fuser")
+        MVN.add_class_args(parser, prefix="mvn")
+        SpecAugment.add_class_args(parser, prefix="spec_augment")
+        parser.add_argument(
+            "--aug-after-mvn",
+            default=False,
+            action=ActionYesNo,
+            help="do spec augment after st-mvn, instead of before",
+        )
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
diff --git a/hyperion/torch/narchs/proj_head.py b/hyperion/torch/narchs/proj_head.py
new file mode 100644
index 00000000..63a5e128
--- /dev/null
+++ b/hyperion/torch/narchs/proj_head.py
@@ -0,0 +1,140 @@
+"""
+ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import torch
+import torch.nn as nn
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+from torch.nn import Linear
+
+from ...utils.misc import filter_func_args
+from ..layer_blocks import FCBlock
+from ..layers import ActivationFactory as AF
+from ..layers import NormLayer1dFactory as NLF
+from .net_arch import NetArch
+
+
+class ProjHead(NetArch):
+    """Projection head for x-vector style networks, which projects the
+    input features into a lower dimensional embedding space.
+
+    Attributes:
+      in_feats: input features
+      out_feats: dimension of the output projection
+      norm_layer: norm_layer object or str indicating type norm layer, if None it uses BatchNorm1d
+      use_norm: if True, it uses layer/batch-normalization
+      norm_before: if True, normalization is applied before the projection, otherwise after
+    """
+
+    def __init__(
+        self,
+        in_feats,
+        out_feats=256,
+        norm_layer=None,
+        use_norm=True,
+        norm_before=True,
+    ):
+        super().__init__()
+
+        self.in_feats = in_feats
+        self.out_feats = out_feats
+        self.norm_layer = norm_layer
+        self.use_norm = use_norm
+        self.norm_before = norm_before
+        use_bias = True
+        if use_norm:
+            norm_groups = None
+            if norm_layer == "group-norm":
+                norm_groups = min(out_feats // 8, 32)
+            _norm_layer = NLF.create(norm_layer, norm_groups)
+            if norm_before:
+                self._norm_layer = _norm_layer(in_feats)
+            else:
+                self._norm_layer = _norm_layer(out_feats)
+                use_bias = False
+        else:
+            self._norm_layer = None
+
+        self.proj = nn.Linear(in_feats, out_feats, bias=use_bias)
+
+    def forward(self, x, y=None):
+        if self.use_norm and self.norm_before:
+            x = self._norm_layer(x)
+
+        x = self.proj(x)
+
+        if self.use_norm and not self.norm_before:
+            x = self._norm_layer(x)
+
+        return x
+
+    def get_config(self):
+        config = {
+            "in_feats": self.in_feats,
+            "out_feats": self.out_feats,
+            "norm_layer": self.norm_layer,
+            "use_norm": self.use_norm,
+            "norm_before": self.norm_before,
+        }
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    @staticmethod
+    def filter_args(**kwargs):
+        args = filter_func_args(ProjHead.__init__, kwargs)
+        return args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        parser.add_argument(
+            "--out-feats", default=256, type=int, help="projection dimension"
+        )
+
+        try:
+            parser.add_argument(
+                "--norm-layer",
+                default=None,
+                choices=[
+                    "batch-norm",
+                    "group-norm",
+                    "instance-norm",
+                    "instance-norm-affine",
+                    "layer-norm",
+                ],
+                help="type of normalization layer for all components of x-vector network",
+            )
+        except:
+            pass
+
+        parser.add_argument(
+            "--use-norm",
+            default=False,
+            action=ActionYesNo,
+            help="use batch normalization",
+        )
+
+        parser.add_argument(
+            "--norm-before",
+            default=True,
+            action=ActionYesNo,
+            help="batch normalization before the projection",
+        )
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
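
ProjHead is a thin normalize-then-project module, e.g. for mapping fused wav2vec2 features down to embedding size. A minimal usage sketch (dimensions are illustrative assumptions, not from the patch):

```python
import torch

from hyperion.torch.narchs.proj_head import ProjHead

# e.g., project pooled 768-dim wav2vec2 features to 256-dim embeddings
proj = ProjHead(in_feats=768, out_feats=256, norm_layer="layer-norm")
x = torch.randn(8, 768)
z = proj(x)  # norm (use_norm=True, norm_before=True defaults) -> linear projection
print(z.shape)  # torch.Size([8, 256])
```
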
diff --git a/hyperion/torch/narchs/resetdnn.py b/hyperion/torch/narchs/resetdnn.py
index 2c7f3e00..eb964fa5 100644
--- a/hyperion/torch/narchs/resetdnn.py
+++ b/hyperion/torch/narchs/resetdnn.py
@@ -7,11 +7,11 @@
 import torch
 import torch.nn as nn
-from torch.nn import Conv1d, Linear, BatchNorm1d
+from torch.nn import BatchNorm1d, Conv1d, Linear
+from ..layer_blocks import ETDNNBlock, ResETDNNBlock, TDNNBlock
 from ..layers import ActivationFactory as AF
 from ..layers import NormLayer1dFactory as NLF
-from ..layer_blocks import ResETDNNBlock, ETDNNBlock, TDNNBlock
 from .net_arch import NetArch
diff --git a/hyperion/torch/narchs/resnet.py b/hyperion/torch/narchs/resnet.py
index ca972713..7abe4e54 100644
--- a/hyperion/torch/narchs/resnet.py
+++ b/hyperion/torch/narchs/resnet.py
@@ -5,23 +5,23 @@
 import logging
 import numpy as np
-
 import torch
 import torch.nn as nn
-from torch.nn import Conv1d, Linear, BatchNorm1d
+from torch.nn import BatchNorm1d, Conv1d, Linear
-from ..layers import ActivationFactory as AF
-from ..layers import NormLayer2dFactory as NLF
 from ..layer_blocks import (
-    ResNetInputBlock,
+    Res2NetBasicBlock,
+    Res2NetBNBlock,
     ResNetBasicBlock,
     ResNetBNBlock,
+    ResNetEndpointBlock,
+    ResNetInputBlock,
     SEResNetBasicBlock,
     SEResNetBNBlock,
-    Res2NetBasicBlock,
-    Res2NetBNBlock,
 )
-from ..layer_blocks import ResNetEndpointBlock
+from ..layers import ActivationFactory as AF
+from ..layers import NormLayer2dFactory as NLF
+from ..utils import scale_seq_lengths, seq_lengths_to_mask
 from .net_arch import NetArch
@@ -29,9 +29,12 @@ class ResNet(NetArch):
     """ResNet2D base class
     Attributes:
-      block: resnet basic block type in ['basic', 'bn', 'sebasic', 'sebn'], meaning
+      block: resnet basic block type in
+             ['basic', 'bn', 'sebasic', 'sebn', 'res2basic',
+             'res2bn', 'seres2basic', 'seres2bn'], meaning
       basic resnet block, bottleneck resnet block, basic block with squeeze-excitation,
-      and bottleneck block with squeeze-excitation
+      bottleneck block with squeeze-excitation, Res2Net basic and bottleneck, and
+      squeeze-excitation Res2Net basic and bottleneck.
       num_layers: list with the number of layers in each of the 4 layer blocks that we find in
                   resnets, after each layer block feature maps are downsampled times 2 in each dimension
@@ -46,6 +49,8 @@ class ResNet(NetArch):
       out_act: output activation
       zero_init_residual: initializes batchnorm weights to zero so each residual block behaves as
                           identity at the beginning. We observed worse results when using this option in x-vectors
+      multilevel: if True, the output is the combination of the feature maps at different resolution levels.
+      endpoint_channels: number of output channels when multilevel is True.
       groups: number of groups in convolutions
       replace_stride_with_dilation: use dilated convolutions instead of downsampling, we never tested this.
       dropout_rate: dropout rate
@@ -57,7 +62,9 @@ class ResNet(NetArch):
               instead of time-freq dimension or HxW dimensions
       in_feats: input feature size (number of components in dimension of 2 of input tensor), this is
                 only required when time_se=True to calculate the size of the squeeze excitation matrices.
-
+      res2net_scale: Res2Net scale parameter
+      res2net_width_factor: Res2Net multiplier for the width of the bottleneck layers.
+ freq_pos_enc: use frequency wise positional encoder """ def __init__( @@ -68,7 +75,7 @@ def __init__( conv_channels=64, base_channels=64, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, @@ -83,17 +90,20 @@ def __init__( do_maxpool=True, in_norm=True, se_r=16, - time_se=False, + se_type="cw-se", in_feats=None, res2net_scale=4, res2net_width_factor=1, + resb_channels=None, + time_se=False, + freq_pos_enc=False, ): - super().__init__() logging.info("{}".format(locals())) self.block = block self.has_se = False self.is_res2net = False + if isinstance(block, str): if block == "basic": self._block = ResNetBasicBlock @@ -111,13 +121,15 @@ def __init__( elif block == "res2bn": self._block = Res2NetBNBlock self.is_res2net = True - elif block == "seres2bn" or block == "tseres2bn": + elif block in ("seres2bn", "tseres2bn"): self._block = Res2NetBNBlock self.has_se = True self.is_res2net = True else: self._block = block + assert not self.has_se and not freq_pos_enc or in_feats is not None + self.num_layers = num_layers self.in_channels = in_channels self.conv_channels = conv_channels @@ -134,12 +146,17 @@ def __init__( # self.width_per_group = width_per_group self.se_r = se_r self.time_se = time_se + if time_se: + se_type = "t-se" + self.se_type = se_type self.in_feats = in_feats self.res2net_scale = res2net_scale self.res2net_width_factor = res2net_width_factor + self.resb_channels = resb_channels self.multilevel = multilevel self.endpoint_channels = endpoint_channels + self.freq_pos_enc = freq_pos_enc self.norm_layer = norm_layer norm_groups = None @@ -180,25 +197,31 @@ def __init__( self._context = self.in_block.context self._downsample_factor = self.in_block.downsample_factor + if resb_channels is None: + resb_channels = [base_channels * (2**i) for i in range(4)] + self.cur_in_channels = conv_channels - self.layer1 = self._make_layer(self._block, base_channels, num_layers[0]) + self.layer1 = self._make_layer(self._block, resb_channels[0], num_layers[0]) self.layer2 = self._make_layer( self._block, - 2 * base_channels, + # 2 * base_channels, + resb_channels[1], num_layers[1], stride=2, dilate=replace_stride_with_dilation[0], ) self.layer3 = self._make_layer( self._block, - 4 * base_channels, + # 4 * base_channels, + resb_channels[2], num_layers[2], stride=2, dilate=replace_stride_with_dilation[1], ) self.layer4 = self._make_layer( self._block, - 8 * base_channels, + # 8 * base_channels, + resb_channels[3], num_layers[3], stride=2, dilate=replace_stride_with_dilation[2], @@ -271,8 +294,6 @@ def __init__( nn.init.constant_(m.bn2.weight, 0) def _make_layer(self, block, channels, num_blocks, stride=1, dilate=False): - norm_layer = self._norm_layer - downsample = None previous_dilation = self.dilation if dilate: self.dilation *= stride @@ -280,11 +301,20 @@ def _make_layer(self, block, channels, num_blocks, stride=1, dilate=False): kwargs = {} if self.has_se: - if self.time_se: - num_feats = int(self.in_feats / (self._downsample_factor * stride)) - kwargs = {"se_r": self.se_r, "time_se": True, "num_feats": num_feats} - else: + if self.se_type == "cw-se": kwargs = {"se_r": self.se_r} + else: + num_feats = int(self.in_feats / (self._downsample_factor * stride)) + kwargs = { + "se_r": self.se_r, + "se_type": self.se_type, + "num_feats": num_feats, + } + + if self.freq_pos_enc: + kwargs["freq_pos_enc"] = True + num_feats = int(self.in_feats / (self._downsample_factor * stride)) + kwargs["num_feats"] = 
num_feats if self.is_res2net: kwargs["scale"] = self.res2net_scale @@ -395,40 +425,61 @@ def out_shape(self, in_shape=None): return (in_shape[0], self.layer4[-1].out_channels, H, W) - def forward(self, x, use_amp=False): - if use_amp: - with torch.cuda.amp.autocast(): - return self._forward(x) + def _forward_layer_with_lens(layer, x, in_lengths, max_in_length): + x_lengths = scale_seq_lengths(in_lengths, x.size(-1), max_in_length) + x_mask = seq_lengths_to_mask(x_lengths, x.size(-1), time_dim=3) - return self._forward(x) + for sub_layer in layer: + x = sub_layer(x, x_mask) - def _forward(self, x): + return x + + def forward(self, x, x_lengths=None): """forward function Args: x: input tensor of size=(batch, Cin, Hin, Win) for image or size=(batch, C, freq, time) for audio - + x_lengths: when x are sequences with time in Win dimension, it + contains the lengths of the sequences. Returns: Tensor with output logits of size=(batch, out_units) if out_units>0, otherwise, it returns tensor of represeantions of size=(batch, Cout, Hout, Wout) """ + if x_lengths is not None: + # if all lengths are eq. to the max length, we set x_lengths to None + max_length = x.size(-1) + if torch.all(x_lengths == max_length): + x_lengths = None if self.in_norm: x = self.in_bn(x) feats = [] x = self.in_block(x) - x = self.layer1(x) - x = self.layer2(x) - if self.multilevel: - feats.append(x) - x = self.layer3(x) - if self.multilevel: - feats.append(x) - x = self.layer4(x) - if self.multilevel: - feats.append(x) + + if x_lengths is None: + x = self.layer1(x) + x = self.layer2(x) + if self.multilevel: + feats.append(x) + x = self.layer3(x) + if self.multilevel: + feats.append(x) + x = self.layer4(x) + if self.multilevel: + feats.append(x) + else: + x = self._forward_layer_with_lens(self.layer1, x, x_lengths, max_length) + x = self._forward_layer_with_lens(self.layer2, x, x_lengths, max_length) + if self.multilevel: + feats.append(x) + x = self._forward_layer_with_lens(self.layer3, x, x_lengths, max_length) + if self.multilevel: + feats.append(x) + x = self._forward_layer_with_lens(self.layer4, x, x_lengths, max_length) + if self.multilevel: + feats.append(x) if self.multilevel: out2 = self.endpoint2(feats[0]) @@ -547,9 +598,12 @@ def get_config(self): "out_act": out_act, "hid_act": hid_act, "se_r": self.se_r, + "se_type": self.se_type, "in_feats": self.in_feats, "res2net_scale": self.res2net_scale, "res2net_width_factor": self.res2net_width_factor, + "resb_channels": self.resb_channels, + "freq_pos_enc": self.freq_pos_enc, } base_config = super().get_config() @@ -608,6 +662,20 @@ def __init__(self, in_channels, **kwargs): super().__init__("bn", [3, 4, 23, 3], in_channels, **kwargs) +class IdRndResNet100(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["resb_channels"] = [128, 128, 256, 256] + super().__init__("basic", [6, 16, 24, 3], in_channels, **kwargs) + + +class IdRndResNet202(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["resb_channels"] = [128, 128, 256, 256] + super().__init__("basic", [6, 16, 75, 3], in_channels, **kwargs) + + class LResNet18(ResNet): def __init__(self, in_channels, **kwargs): kwargs["conv_channels"] = 16 @@ -636,6 +704,16 @@ def __init__(self, in_channels, **kwargs): super().__init__("bn", [3, 4, 6, 3], in_channels, **kwargs) +# multi-level feature ResNet +class LResNet34_345(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + 
kwargs["multilevel"] = True + kwargs["endpoint_channels"] = 64 + super().__init__("basic", [3, 4, 6, 3], in_channels, **kwargs) + + # Squezee-Excitation ResNets @@ -813,8 +891,231 @@ def __init__(self, in_channels, **kwargs): super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) +# Freq-wise Squezee-Excitation ResNets + + +class FwSEResNet18(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("sebasic", [2, 2, 2, 2], in_channels, **kwargs) + + +class FwSEResNet34(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("sebasic", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSEResNet50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSEResNet101(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 4, 23, 3], in_channels, **kwargs) + + +class FwSEResNet152(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 8, 36, 3], in_channels, **kwargs) + + +class FwSEResNext50_32x4d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 32 + kwargs["base_channels"] = 128 + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSEResNext101_32x8d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 32 + kwargs["base_channels"] = 256 + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 4, 23, 3], in_channels, **kwargs) + + +class FwSEWideResNet50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSEWideResNet101(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 4, 23, 3], in_channels, **kwargs) + + +class FwSELResNet18(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "fw-se" + super().__init__("sebasic", [2, 2, 2, 2], in_channels, **kwargs) + + +class FwSELResNet34(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "fw-se" + super().__init__("sebasic", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSELResNet50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSELResNext50_4x4d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 4 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSEIdRndResNet100(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["resb_channels"] = [128, 128, 256, 256] + kwargs["se_type"] = "fw-se" + super().__init__("sebasic", [6, 16, 24, 3], in_channels, **kwargs) + + +class FwSEIdRndResNet202(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["resb_channels"] = [128, 128, 256, 256] + kwargs["se_type"] = "fw-se" + super().__init__("sebasic", [6, 16, 75, 3], in_channels, 
**kwargs) + + +# Channel-Freq-wise Squezee-Excitation ResNets + + +class CFwSEResNet18(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("sebasic", [2, 2, 2, 2], in_channels, **kwargs) + + +class CFwSEResNet34(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("sebasic", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSEResNet50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSEResNet101(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 4, 23, 3], in_channels, **kwargs) + + +class CFwSEResNet152(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 8, 36, 3], in_channels, **kwargs) + + +class CFwSEResNext50_32x4d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 32 + kwargs["base_channels"] = 128 + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSEResNext101_32x8d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 32 + kwargs["base_channels"] = 256 + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 4, 23, 3], in_channels, **kwargs) + + +class CFwSEWideResNet50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSEWideResNet101(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 4, 23, 3], in_channels, **kwargs) + + +class CFwSELResNet18(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "cfw-se" + super().__init__("sebasic", [2, 2, 2, 2], in_channels, **kwargs) + + +class CFwSELResNet34(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "cfw-se" + super().__init__("sebasic", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSELResNet50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSELResNext50_4x4d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 4 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSEIdRndResNet100(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["resb_channels"] = [128, 128, 256, 256] + kwargs["se_type"] = "cfw-se" + super().__init__("sebasic", [6, 16, 24, 3], in_channels, **kwargs) + + +class CFwSEIdRndResNet202(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["resb_channels"] = [128, 128, 256, 256] + kwargs["se_type"] = "cfw-se" + super().__init__("sebasic", [6, 16, 75, 3], in_channels, **kwargs) + + #################### Res2Net variants ######################## + # Standard Res2Nets class Res2Net18(ResNet): def __init__(self, in_channels, **kwargs): @@ -1024,11 +1325,155 @@ def __init__(self, in_channels, **kwargs): 
super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) -# multi-level feature ResNet -class LResNet34_345(ResNet): +# frequency-wise Squezee-Excitation Res2Nets +class FwSERes2Net18(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("se2basic", [2, 2, 2, 2], in_channels, **kwargs) + + +class FwSERes2Net34(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("se2basic", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSERes2Net50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSERes2Net101(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 4, 23, 3], in_channels, **kwargs) + + +class FwSERes2Net152(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 8, 36, 3], in_channels, **kwargs) + + +class FwSERes2Next50_32x4d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 32 + kwargs["base_channels"] = 128 + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSERes2Next101_32x8d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 32 + kwargs["base_channels"] = 256 + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 4, 23, 3], in_channels, **kwargs) + + +class FwSEWideRes2Net50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSEWideRes2Net101(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 4, 23, 3], in_channels, **kwargs) + + +class FwSELRes2Net50(ResNet): def __init__(self, in_channels, **kwargs): kwargs["conv_channels"] = 16 kwargs["base_channels"] = 16 - kwargs["multilevel"] = True - kwargs["endpoint_channels"] = 64 - super().__init__("basic", [3, 4, 6, 3], in_channels, **kwargs) + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSELRes2Next50_4x4d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 4 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +# channel-frequency-wise Squezee-Excitation Res2Nets +class CFwSERes2Net18(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("se2basic", [2, 2, 2, 2], in_channels, **kwargs) + + +class CFwSERes2Net34(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("se2basic", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSERes2Net50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSERes2Net101(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 4, 23, 3], in_channels, **kwargs) + + +class CFwSERes2Net152(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 8, 36, 3], in_channels, **kwargs) + + +class 
CFwSERes2Next50_32x4d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 32 + kwargs["base_channels"] = 128 + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSERes2Next101_32x8d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 32 + kwargs["base_channels"] = 256 + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 4, 23, 3], in_channels, **kwargs) + + +class CFwSEWideRes2Net50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSEWideRes2Net101(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 4, 23, 3], in_channels, **kwargs) + + +class CFwSELRes2Net50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSELRes2Next50_4x4d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 4 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) diff --git a/hyperion/torch/narchs/resnet1d_decoder.py b/hyperion/torch/narchs/resnet1d_decoder.py index e3290c71..9332724f 100644 --- a/hyperion/torch/narchs/resnet1d_decoder.py +++ b/hyperion/torch/narchs/resnet1d_decoder.py @@ -2,17 +2,24 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser import math +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn +from ..layer_blocks import ( + DC1dDecBlock, + ResNet1dBasicDecBlock, + ResNet1dBNDecBlock, + SEResNet1dBasicDecBlock, + SEResNet1dBNDecBlock, +) from ..layers import ActivationFactory as AF +from ..layers import ICNR1d from ..layers import NormLayer1dFactory as NLF -from ..layer_blocks import ResNet1dBasicDecBlock, ResNet1dBNDecBlock, DC1dDecBlock -from ..layer_blocks import SEResNet1dBasicDecBlock, SEResNet1dBNDecBlock -from ..layers import SubPixelConv1d, ICNR1d +from ..layers import SubPixelConv1d from .net_arch import NetArch @@ -31,7 +38,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -85,7 +92,7 @@ def __init__( self.norm_layer = norm_layer norm_groups = None if norm_layer == "group-norm": - norm_groups = min(np.min(resb_channels) // 2, 32) + norm_groups = min(min(resb_channels) // 2, 32) norm_groups = max(norm_groups, resb_groups) self._norm_layer = NLF.create(norm_layer, norm_groups) @@ -236,7 +243,7 @@ def _standarize_resblocks_param(p, num_blocks, p_name): return p def _compute_out_size(self, in_size): - out_size = in_size * in_stride + out_size = in_size * self.in_stride for stride in self.conv_strides: out_size *= stride @@ -264,7 +271,7 @@ def out_shape(self, in_shape=None): else: T = self._compute_out_size(in_shape[2]) - return (in_shape[0], out_chanels, T) + return (in_shape[0], out_channels, T) def _match_shape(self, x, target_shape): t = x.size(-1) @@ -323,13 +330,13 @@ def get_config(self): @staticmethod def filter_args(**kwargs): - if "wo_norm" in kwargs: - kwargs["use_norm"] 
= not kwargs["wo_norm"] - del kwargs["wo_norm"] + # if "wo_norm" in kwargs: + # kwargs["use_norm"] = not kwargs["wo_norm"] + # del kwargs["wo_norm"] - if "norm_after" in kwargs: - kwargs["norm_before"] = not kwargs["norm_after"] - del kwargs["norm_after"] + # if "norm_after" in kwargs: + # kwargs["norm_before"] = not kwargs["norm_after"] + # del kwargs["norm_after"] valid_args = ( "in_channels", @@ -349,7 +356,7 @@ def filter_args(**kwargs): "head_act", "dropout_rate", "use_norm", - "norm-layer", + "norm_layer", "norm_before", ) @@ -447,7 +454,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass @@ -478,18 +485,31 @@ def add_class_args(parser, prefix=None): except: pass + # parser.add_argument( + # "--wo-norm", + # default=False, + # action="store_true", + # help="without batch normalization", + # ) + + # parser.add_argument( + # "--norm-after", + # default=False, + # action="store_true", + # help="batch normalizaton after activation", + # ) parser.add_argument( - "--wo-norm", - default=False, - action="store_true", + "--use-norm", + default=True, + action=ActionYesNo, help="without batch normalization", ) parser.add_argument( - "--norm-after", - default=False, - action="store_true", - help="batch normalizaton after activation", + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalizaton before activation", ) parser.add_argument( diff --git a/hyperion/torch/narchs/resnet1d_encoder.py b/hyperion/torch/narchs/resnet1d_encoder.py index 78ceeac6..97b244f3 100644 --- a/hyperion/torch/narchs/resnet1d_encoder.py +++ b/hyperion/torch/narchs/resnet1d_encoder.py @@ -3,26 +3,28 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +import logging import math import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser import torch import torch.nn as nn -from ..layers import ActivationFactory as AF -from ..layers import NormLayer1dFactory as NLF from ..layer_blocks import ( + DC1dEncBlock, + Res2Net1dBasicBlock, + Res2Net1dBNBlock, ResNet1dBasicBlock, ResNet1dBNBlock, - DC1dEncBlock, ResNet1dEndpoint, SEResNet1dBasicBlock, SEResNet1dBNBlock, - Res2Net1dBasicBlock, - Res2Net1dBNBlock, ) +from ..layers import ActivationFactory as AF +from ..layers import NormLayer1dFactory as NLF +from ..utils import seq_lengths_to_mask from .net_arch import NetArch @@ -41,7 +43,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, drop_connect_rate=0, @@ -371,13 +373,36 @@ def _match_lens(endpoints): return endpoints - def forward(self, x): + @staticmethod + def _update_mask(x, x_lengths, x_mask=None): + if x_lengths is None: + return None - x = self.in_block(x) + if x_mask is not None and x.size(-1) == x_mask.size(-1): + return x_mask + + return seq_lengths_to_mask(x_lengths, x.size(-1), time_dim=2) + + def forward(self, x, x_lengths=None): + """forward function + + Args: + x: input tensor of size=(batch, C, time) + x_lengths: it contains the lengths of the sequences. 
+        Returns:
+          Tensor with output logits of size=(batch, out_units) if out_units>0,
+          otherwise, it returns tensor of representations of size=(batch, Cout, out_time)
+
+        """
+
+        x_mask = self._update_mask(x, x_lengths)
+        x = self.in_block(x, x_mask=x_mask)
         endpoints = []
+
         for i, superblock in enumerate(self.blocks):
             for j, block in enumerate(superblock):
-                x = block(x)
+                x_mask = self._update_mask(x, x_lengths, x_mask)
+                x = block(x, x_mask=x_mask)
                 if self.multilayer and self.is_endpoint[i]:
                     endpoint_i = x
@@ -401,11 +426,12 @@ def forward(self, x, x_lengths=None):
             x = torch.mean(torch.stack(endpoints), 0)
         if self.head_channels > 0:
+            x_mask = self._update_mask(x, x_lengths, x_mask)
             x = self.head_block(x)
         return x
-    def forward_hid_feats(self, x, layers=None, return_output=False):
+    def forward_hid_feats(self, x, x_lengths=None, layers=None, return_output=False):
         assert layers is not None or return_output
         if layers is None:
@@ -452,7 +478,7 @@ def forward_hid_feats(self, x, x_lengths=None, layers=None, return_output=False):
         if self.head_channels > 0:
             x = self.head_block(x)
-        return x
+        return h, x
     def get_config(self):
@@ -493,15 +519,31 @@ def get_config(self):
         base_config = super().get_config()
         return dict(list(base_config.items()) + list(config.items()))
+    def change_config(self, override_dropouts, dropout_rate, drop_connect_rate):
+        if override_dropouts:
+            logging.info("changing resnet1d dropouts")
+            self.change_dropouts(dropout_rate, drop_connect_rate)
+
+    def change_dropouts(self, dropout_rate, drop_connect_rate):
+        super().change_dropouts(dropout_rate)
+        from ..layers import DropConnect1d
+
+        for module in self.modules():
+            if isinstance(module, DropConnect1d):
+                module.p *= drop_connect_rate / self.drop_connect_rate
+
+        self.drop_connect_rate = drop_connect_rate
+        self.dropout_rate = dropout_rate
+
     @staticmethod
     def filter_args(**kwargs):
-        if "wo_norm" in kwargs:
-            kwargs["use_norm"] = not kwargs["wo_norm"]
-            del kwargs["wo_norm"]
+        # if "wo_norm" in kwargs:
+        #     kwargs["use_norm"] = not kwargs["wo_norm"]
+        #     del kwargs["wo_norm"]
-        if "norm_after" in kwargs:
-            kwargs["norm_before"] = not kwargs["norm_after"]
-            del kwargs["norm_after"]
+        # if "norm_after" in kwargs:
+        #     kwargs["norm_before"] = not kwargs["norm_after"]
+        #     del kwargs["norm_after"]
         valid_args = (
             "in_feats",
@@ -639,7 +681,7 @@ def add_class_args(parser, prefix=None, skip=set(["in_feats"])):
         )
         try:
-            parser.add_argument("--hid-act", default="relu6", help="hidden activation")
+            parser.add_argument("--hid-act", default="relu", help="hidden activation")
         except:
             pass
@@ -680,18 +722,31 @@ def add_class_args(parser, prefix=None, skip=set(["in_feats"])):
         except:
             pass
+        # parser.add_argument(
+        #     "--wo-norm",
+        #     default=False,
+        #     action="store_true",
+        #     help="without batch normalization",
+        # )
+
+        # parser.add_argument(
+        #     "--norm-after",
+        #     default=False,
+        #     action="store_true",
+        #     help="batch normalizaton after activation",
+        # )
        parser.add_argument(
-            "--wo-norm",
-            default=False,
-            action="store_true",
-            help="without batch normalization",
+            "--use-norm",
+            default=True,
+            action=ActionYesNo,
+            help="use batch normalization",
        )
        parser.add_argument(
-            "--norm-after",
-            default=False,
-            action="store_true",
-            help="batch normalizaton after activation",
+            "--norm-before",
+            default=True,
+            action=ActionYesNo,
+            help="batch normalization before activation",
        )
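
The length-aware forward above recomputes a boolean time mask per block via `seq_lengths_to_mask`, so padded frames do not leak into the convolutions. A minimal sketch of the helper's use (tensor sizes and the broadcastable mask shape are assumptions, not from the patch):

```python
import torch

from hyperion.torch.utils import seq_lengths_to_mask

x = torch.randn(4, 80, 200)                    # (batch, C, time) features
x_lengths = torch.tensor([200, 180, 150, 90])  # valid frames per sequence
# same call the encoder makes: mask over the last (time) dimension
x_mask = seq_lengths_to_mask(x_lengths, x.size(-1), time_dim=2)
x = x * x_mask  # assuming a (batch, 1, time) mask, zero out padded frames
```

        parser.add_argument(
@@ -712,10 +767,7 @@ def add_class_args(parser, prefix=None, skip=set(["in_feats"])):
        parser.add_argument(
-            "--res2net-scale",
-            default=1,
-            type=int,
-            help=("res2net scaling parameter "),
+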
"--res2net-scale", default=1, type=int, help=("res2net scaling parameter "), ) parser.add_argument( @@ -766,6 +818,55 @@ def add_class_args(parser, prefix=None, skip=set(["in_feats"])): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='ResNet1d encoder options') add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + + valid_args = ( + "override_dropouts", + "drop_connect_rate", + "dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None, skip=set([])): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + try: + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model." + ), + ) + except: + pass + + try: + parser.add_argument( + "--dropout-rate", default=0, type=float, help="dropout probability" + ) + except: + pass + + try: + parser.add_argument( + "--drop-connect-rate", + default=0, + type=float, + help="layer drop probability", + ) + except: + pass + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/resnet2d_decoder.py b/hyperion/torch/narchs/resnet2d_decoder.py index f5becf76..0afa1acc 100644 --- a/hyperion/torch/narchs/resnet2d_decoder.py +++ b/hyperion/torch/narchs/resnet2d_decoder.py @@ -4,16 +4,23 @@ """ import math -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser import torch import torch.nn as nn +from ..layer_blocks import ( + DC2dDecBlock, + ResNet2dBasicDecBlock, + ResNet2dBNDecBlock, + SEResNet2dBasicDecBlock, + SEResNet2dBNDecBlock, +) from ..layers import ActivationFactory as AF +from ..layers import ICNR2d from ..layers import NormLayer2dFactory as NLF -from ..layer_blocks import ResNet2dBasicDecBlock, ResNet2dBNDecBlock, DC2dDecBlock -from ..layer_blocks import SEResNet2dBasicDecBlock, SEResNet2dBNDecBlock -from ..layers import SubPixelConv2d, ICNR2d +from ..layers import SubPixelConv2d from .net_arch import NetArch @@ -32,7 +39,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -86,7 +93,7 @@ def __init__( self.norm_layer = norm_layer norm_groups = None if norm_layer == "group-norm": - norm_groups = min(np.min(resb_channels) // 2, 32) + norm_groups = min(min(resb_channels) // 2, 32) norm_groups = max(norm_groups, resb_groups) self._norm_layer = NLF.create(norm_layer, norm_groups) @@ -237,7 +244,7 @@ def _standarize_resblocks_param(p, num_blocks, p_name): return p def _compute_out_size(self, in_size): - out_size = in_size * in_stride + out_size = in_size * self.in_stride for stride in self.conv_strides: out_size *= stride @@ -270,7 +277,7 @@ def out_shape(self, in_shape=None): else: W = self._compute_out_size(in_shape[3]) - return (in_shape[0], out_chanels, H, W) + return (in_shape[0], out_channels, H, W) def _match_shape(self, x, target_shape): x_dim = x.dim() @@ -330,13 +337,13 @@ def get_config(self): @staticmethod def filter_args(**kwargs): - if "wo_norm" in kwargs: - kwargs["use_norm"] = not kwargs["wo_norm"] - del kwargs["wo_norm"] + # if "wo_norm" in kwargs: + # kwargs["use_norm"] = not kwargs["wo_norm"] + # del 
kwargs["wo_norm"] - if "norm_after" in kwargs: - kwargs["norm_before"] = not kwargs["norm_after"] - del kwargs["norm_after"] + # if "norm_after" in kwargs: + # kwargs["norm_before"] = not kwargs["norm_after"] + # del kwargs["norm_after"] valid_args = ( "in_channels", @@ -454,7 +461,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass @@ -485,18 +492,31 @@ def add_class_args(parser, prefix=None): except: pass + # parser.add_argument( + # "--wo-norm", + # default=False, + # action="store_true", + # help="without batch normalization", + # ) + + # parser.add_argument( + # "--norm-after", + # default=False, + # action="store_true", + # help="batch normalizaton after activation", + # ) parser.add_argument( - "--wo-norm", - default=False, - action="store_true", + "--use-norm", + default=True, + action=ActionYesNo, help="without batch normalization", ) parser.add_argument( - "--norm-after", - default=False, - action="store_true", - help="batch normalizaton after activation", + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalizaton before activation", ) parser.add_argument( diff --git a/hyperion/torch/narchs/resnet2d_encoder.py b/hyperion/torch/narchs/resnet2d_encoder.py index 22fc7fdd..a7fd047e 100644 --- a/hyperion/torch/narchs/resnet2d_encoder.py +++ b/hyperion/torch/narchs/resnet2d_encoder.py @@ -3,21 +3,59 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import math -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser import torch import torch.nn as nn +from ..layer_blocks import ( + DC2dEncBlock, + Res2Net2dBasicBlock, + Res2Net2dBNBlock, + ResNet2dBasicBlock, + ResNet2dBNBlock, + SEResNet2dBasicBlock, + SEResNet2dBNBlock, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF -from ..layer_blocks import ResNet2dBasicBlock, ResNet2dBNBlock, DC2dEncBlock -from ..layer_blocks import SEResNet2dBasicBlock, SEResNet2dBNBlock -from ..layer_blocks import Res2Net2dBasicBlock, Res2Net2dBNBlock +from ..utils import seq_lengths_to_mask from .net_arch import NetArch class ResNet2dEncoder(NetArch): + """ResNet 2d Encoder. 
+ This is similar to ResNet class but it offers more configuration possibilities + + Attributes: + in_channels=1, + in_conv_channels=64, + in_kernel_size=3, + in_stride=1, + resb_type="basic", + resb_repeats=[2, 2, 2, 2], + resb_channels=[64, 128, 256, 512], + resb_kernel_sizes=3, + resb_strides=2, + resb_dilations=1, + resb_groups=1, + head_channels=0, + hid_act="relu", + head_act=None, + dropout_rate=0, + se_r=16, + time_se=False, + in_feats=None, + res2net_width_factor=1, + res2net_scale=4, + use_norm=True, + norm_layer=None, + norm_before=True, + """ + def __init__( self, in_channels=1, @@ -32,7 +70,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -104,7 +142,7 @@ def __init__( self.norm_layer = norm_layer norm_groups = None if norm_layer == "group-norm": - norm_groups = min(np.min(resb_channels) // 2, 32) + norm_groups = min(min(resb_channels) // 2, 32) norm_groups = max(norm_groups, resb_groups) self._norm_layer = NLF.create(norm_layer, norm_groups) @@ -266,7 +304,17 @@ def out_shape(self, in_shape=None): else: W = self._compute_out_size(in_shape[3]) - return (in_shape[0], out_chanels, H, W) + return (in_shape[0], out_channels, H, W) + + @staticmethod + def _update_mask(x, x_lengths, x_mask=None): + if x_lengths is None: + return None + + if x_mask is not None and x.size(-1) == x_mask.size(-1): + return x_mask + + return seq_lengths_to_mask(x_lengths, x.size(-1), time_dim=3) def forward(self, x): @@ -312,16 +360,32 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) + def change_config(self, override_dropouts, dropout_rate, drop_connect_rate): + if override_dropouts: + logging.info("chaning resnet2d dropouts") + self.change_dropouts(dropout_rate, drop_connect_rate) + + def change_dropouts(self, dropout_rate, drop_connect_rate): + super().change_dropouts(dropout_rate) + from ..layers import DropConnect2d + + for module in self.modules(): + if isinstance(module, DropConnect2d): + module.p *= drop_connect_rate / self.drop_connect_rate + + self.drop_connect_rate = drop_connect_rate + self.dropout_rate = dropout_rate + @staticmethod def filter_args(**kwargs): - if "wo_norm" in kwargs: - kwargs["use_norm"] = not kwargs["wo_norm"] - del kwargs["wo_norm"] + # if "wo_norm" in kwargs: + # kwargs["use_norm"] = not kwargs["wo_norm"] + # del kwargs["wo_norm"] - if "norm_after" in kwargs: - kwargs["norm_before"] = not kwargs["norm_after"] - del kwargs["norm_after"] + # if "norm_after" in kwargs: + # kwargs["norm_before"] = not kwargs["norm_after"] + # del kwargs["norm_after"] valid_args = ( "in_channels", @@ -452,7 +516,7 @@ def add_class_args(parser, prefix=None, skip=set()): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass @@ -483,18 +547,32 @@ def add_class_args(parser, prefix=None, skip=set()): except: pass + # parser.add_argument( + # "--wo-norm", + # default=False, + # action="store_true", + # help="without batch normalization", + # ) + + # parser.add_argument( + # "--norm-after", + # default=False, + # action="store_true", + # help="batch normalizaton after activation", + # ) + parser.add_argument( - "--wo-norm", - default=False, - action="store_true", + "--use-norm", + default=True, + action=ActionYesNo, help="without batch normalization", ) parser.add_argument( - "--norm-after", - default=False, - 
action="store_true", - help="batch normalizaton after activation", + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalizaton before activation", ) parser.add_argument( diff --git a/hyperion/torch/narchs/resnet_factory.py b/hyperion/torch/narchs/resnet_factory.py index bd58cd2b..ba9d21a5 100644 --- a/hyperion/torch/narchs/resnet_factory.py +++ b/hyperion/torch/narchs/resnet_factory.py @@ -3,7 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from .resnet import * @@ -21,6 +21,7 @@ "lresnet34": LResNet34, "lresnet50": LResNet50, "lresnext50_4x4d": LResNext50_4x4d, + "lresnet34_345": LResNet34_345, "seresnet18": SEResNet18, "seresnet34": SEResNet34, "seresnet50": SEResNet50, @@ -47,6 +48,32 @@ "tselresnet34": TSELResNet34, "tselresnet50": TSELResNet50, "tselresnext50_4x4d": TSELResNext50_4x4d, + "fwseresnet18": FwSEResNet18, + "fwseresnet34": FwSEResNet34, + "fwseresnet50": FwSEResNet50, + "fwseresnet101": FwSEResNet101, + "fwseresnet152": FwSEResNet152, + "fwseresnext50_32x4d": FwSEResNext50_32x4d, + "fwseresnext101_32x8d": FwSEResNext101_32x8d, + "fwsewideresnet50": FwSEWideResNet50, + "fwsewideresnet101": FwSEWideResNet101, + "fwselresnet18": FwSELResNet18, + "fwselresnet34": FwSELResNet34, + "fwselresnet50": FwSELResNet50, + "fwselresnext50_4x4d": FwSELResNext50_4x4d, + "cfwseresnet18": CFwSEResNet18, + "cfwseresnet34": CFwSEResNet34, + "cfwseresnet50": CFwSEResNet50, + "cfwseresnet101": CFwSEResNet101, + "cfwseresnet152": CFwSEResNet152, + "cfwseresnext50_32x4d": CFwSEResNext50_32x4d, + "cfwseresnext101_32x8d": CFwSEResNext101_32x8d, + "cfwsewideresnet50": CFwSEWideResNet50, + "cfwsewideresnet101": CFwSEWideResNet101, + "cfwselresnet18": CFwSELResNet18, + "cfwselresnet34": CFwSELResNet34, + "cfwselresnet50": CFwSELResNet50, + "cfwselresnext50_4x4d": CFwSELResNext50_4x4d, "res2net18": Res2Net18, "res2net34": Res2Net34, "res2net50": Res2Net50, @@ -80,7 +107,34 @@ "tsewideres2net101": TSEWideRes2Net101, "tselres2net50": TSELRes2Net50, "tselres2next50_4x4d": TSELRes2Next50_4x4d, - "lresnet34_345": LResNet34_345, + "fwseres2net18": FwSERes2Net18, + "fwseres2net34": FwSERes2Net34, + "fwseres2net50": FwSERes2Net50, + "fwseres2net101": FwSERes2Net101, + "fwseres2net152": FwSERes2Net152, + "fwseres2next50_32x4d": FwSERes2Next50_32x4d, + "fwseres2next101_32x8d": FwSERes2Next101_32x8d, + "fwsewideres2net50": FwSEWideRes2Net50, + "fwsewideres2net101": FwSEWideRes2Net101, + "fwselres2net50": FwSELRes2Net50, + "fwselres2next50_4x4d": FwSELRes2Next50_4x4d, + "cfwseres2net18": CFwSERes2Net18, + "cfwseres2net34": CFwSERes2Net34, + "cfwseres2net50": CFwSERes2Net50, + "cfwseres2net101": CFwSERes2Net101, + "cfwseres2net152": CFwSERes2Net152, + "cfwseres2next50_32x4d": CFwSERes2Next50_32x4d, + "cfwseres2next101_32x8d": CFwSERes2Next101_32x8d, + "cfwsewideres2net50": CFwSEWideRes2Net50, + "cfwsewideres2net101": CFwSEWideRes2Net101, + "cfwselres2net50": CFwSELRes2Net50, + "cfwselres2next50_4x4d": CFwSELRes2Next50_4x4d, + "idrndresnet100": IdRndResNet100, + "idrndresnet202": IdRndResNet202, + "fwseidrndresnet100": FwSEIdRndResNet100, + "fwseidrndresnet202": FwSEIdRndResNet202, + "cfwseidrndresnet100": CFwSEIdRndResNet100, + "cfwseidrndresnet202": CFwSEIdRndResNet202, } @@ -92,7 +146,7 @@ def create( conv_channels=64, base_channels=64, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, 
out_act=None, in_kernel_size=7, in_stride=2, @@ -108,8 +162,8 @@ def create( in_feats=None, res2net_scale=4, res2net_width_factor=1, + freq_pos_enc=False, ): - try: resnet_class = resnet_dict[resnet_type] except: @@ -136,15 +190,12 @@ def create( in_feats=in_feats, res2net_scale=res2net_scale, res2net_width_factor=res2net_width_factor, + freq_pos_enc=freq_pos_enc, ) return resnet def filter_args(**kwargs): - if "norm_after" in kwargs: - kwargs["norm_before"] = not kwargs["norm_after"] - del kwargs["norm_after"] - if "no_maxpool" in kwargs: kwargs["do_maxpool"] = not kwargs["no_maxpool"] del kwargs["no_maxpool"] @@ -170,6 +221,7 @@ def filter_args(**kwargs): "se_r", "res2net_scale", "res2net_width_factor", + "freq_pos_enc", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) @@ -247,27 +299,24 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--in-norm", default=False, - action="store_true", + action=ActionYesNo, help="batch normalization at the input", ) parser.add_argument( "--no-maxpool", default=False, - action="store_true", + action=ActionYesNo, help="don't do max pooling after first convolution", ) parser.add_argument( "--zero-init-residual", default=False, - action="store_true", + action=ActionYesNo, help="Zero-initialize the last BN in each residual branch", ) - # parser.add_argument('--replace-stride-with-dilation', default=None, nargs='+', type=bool, - # help='replaces strides with dilations to increase context without downsampling') - parser.add_argument( "--se-r", default=16, @@ -287,17 +336,18 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass try: parser.add_argument( - "--norm-after", - default=False, - action="store_true", - help="batch normalizaton after activation", + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalizaton before activation", ) + except: pass @@ -306,8 +356,50 @@ def add_class_args(parser, prefix=None): except: pass + parser.add_argument( + "--freq-pos-enc", + default=False, + action=ActionYesNo, + help="use frequency wise positional encoder", + ) + if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='ResNet options') add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + valid_args = ( + "override_dropouts", + "dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + try: + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model." 
+                ),
+            )
+        except:
+            pass
+
+        try:
+            parser.add_argument("--dropout-rate", default=0, type=float, help="dropout")
+        except:
+            pass
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
diff --git a/hyperion/torch/narchs/rnn_encoder.py b/hyperion/torch/narchs/rnn_encoder.py
new file mode 100644
index 00000000..7df33274
--- /dev/null
+++ b/hyperion/torch/narchs/rnn_encoder.py
@@ -0,0 +1,282 @@
+"""
+ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import logging
+import math
+from typing import Dict, Optional, Tuple, Union
+
+import numpy as np
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+
+import torch
+import torch.nn as nn
+from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
+
+from ...utils.misc import filter_func_args
+from ..layer_blocks import TransformerConv2dSubsampler as Subsampler
+from ..layers import ActivationFactory as AF
+#from ..layers import NormLayer1dFactory as NLF
+from ..utils import seq_lengths_to_mask
+from .net_arch import NetArch
+
+
+class RNNEncoder(NetArch):
+    """RNN Encoder network.
+
+    Attributes:
+      in_feats: input features
+      hid_feats: hidden features in RNN layers
+      out_feats: output features, if 0 we remove last projection layer
+      num_layers: number of RNN layers
+      proj_feats: projection features in LSTM layers
+      rnn_type: type of RNN in [lstm, gru]
+      bidirectional: whether RNN layers are bidirectional
+      dropout_rate: dropout rate
+      subsample_input: whether to subsample the input features time dimension x4
+      subsampling_act: activation function of the subsampling block
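+
+    Example (an illustrative sketch; shapes and sizes are arbitrary, and the
+    lengths must be in decreasing order because the forward pass packs the
+    sequences with enforce_sorted=True):
+        >>> encoder = RNNEncoder(in_feats=80, hid_feats=512, out_feats=256,
+        ...                      num_layers=4, rnn_type="lstm",
+        ...                      subsample_input=True)
+        >>> x = torch.randn(4, 1000, 80)  # (batch, time, feats)
+        >>> x_lengths = torch.tensor([1000, 800, 600, 400])
+        >>> y, y_lengths = encoder(x, x_lengths)  # out_feats=256 features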
+    """
+
+    def __init__(self,
+                 in_feats: int,
+                 hid_feats: int,
+                 out_feats: int,
+                 num_layers: int,
+                 proj_feats: int = 0,
+                 rnn_type: str = "lstm",
+                 bidirectional: bool = False,
+                 dropout_rate: float = 0.0,
+                 subsample_input: bool = False,
+                 subsampling_act: str = "relu"):
+        super().__init__()
+        if rnn_type != "lstm":
+            proj_feats = 0
+
+        self.in_feats = in_feats
+        self.hid_feats = hid_feats
+        self.out_feats = out_feats
+        self.num_layers = num_layers
+        self.proj_feats = proj_feats
+        self.rnn_type = rnn_type
+        self.bidirectional = bidirectional
+        self.subsample_input = subsample_input
+        self.subsampling_act = subsampling_act
+
+        rnn_feats = hid_feats if proj_feats == 0 else proj_feats
+        if subsample_input:
+            subsampling_act = AF.create(subsampling_act)
+            self.subsampler = Subsampler(in_feats,
+                                         hid_feats,
+                                         hid_act=subsampling_act)
+            lstm_in_dim = hid_feats
+        else:
+            self.subsampler = None
+            lstm_in_dim = in_feats
+
+        if rnn_type == "lstm":
+            self.rnn = nn.LSTM(
+                input_size=lstm_in_dim,
+                hidden_size=hid_feats,
+                num_layers=num_layers,
+                bias=True,
+                proj_size=proj_feats,
+                batch_first=True,
+                dropout=dropout_rate,
+                bidirectional=bidirectional,
+            )
+        else:
+            self.rnn = nn.GRU(
+                input_size=lstm_in_dim,
+                hidden_size=hid_feats,
+                num_layers=num_layers,
+                bias=True,
+                batch_first=True,
+                dropout=dropout_rate,
+                bidirectional=bidirectional,
+            )
+
+        if out_feats > 0:
+            self.output = nn.Sequential(
+                nn.Dropout(p=dropout_rate),
+                nn.Linear(rnn_feats, out_feats),
+            )
+
+    def forward(self, x: torch.Tensor,
+                x_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.subsample_input:
+            t1 = x.size(1)
+            x = self.subsampler(x)
+            t2 = x.size(2)
+            x_lengths = torch.div(t2 * x_lengths, t1, rounding_mode="floor")
+
+        x = pack_padded_sequence(input=x,
+                                 lengths=x_lengths.cpu(),
+                                 batch_first=True,
+                                 enforce_sorted=True)
+        x, _ = self.rnn(x)
+        x, x_lengths = pad_packed_sequence(x, batch_first=True)
+        if self.out_feats > 0:
+            x = self.output(x)
+
+        return x, x_lengths
+
+    def in_context(self):
+        return (self._context, self._context)
+
+    def in_shape(self):
+        return (None, None, self.in_feats)
+
+    def out_shape(self, in_shape=None):
+        out_feats = self.out_feats if self.out_feats > 0 else (
+            self.proj_feats if self.proj_feats > 0 else self.hid_feats)
+
+        if in_shape is None:
+            return (None, None, out_feats)
+
+        assert len(in_shape) == 3
+        return (*in_shape[:2], out_feats)
+
+    def get_config(self):
+        config = filter_func_args(RNNEncoder.__init__, self.__dict__)
+        base_config = super().get_config()
+        base_config.update(config)
+        return base_config
+        #return dict(list(base_config.items()) + list(config.items()))
+
+    def change_config(self, override_dropouts, dropout_rate):
+        if override_dropouts:
+            logging.info("changing RNNEncoder dropouts")
+            self.change_dropouts(dropout_rate)
+
+    @staticmethod
+    def filter_args(**kwargs):
+        args = filter_func_args(RNNEncoder.__init__, kwargs)
+        return args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None, skip=set()):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        if "in_feats" not in skip:
+            parser.add_argument("--in-feats",
+                                type=int,
+                                required=True,
+                                help=("input feature dimension"))
+
+        parser.add_argument(
+            "--hid-feats",
+            default=1024,
+            type=int,
+            help=("num of hidden dimensions of RNN layers"),
+        )
+
+        parser.add_argument(
+            "--out-feats",
+            default=512,
+            type=int,
+            help=("number of output dimensions of the encoder, "
+                  "if 0 output projection is removed"),
+        )
+
+        parser.add_argument(
+            "--proj-feats",
+            default=0,
+            type=int,
+            help=("projection features of LSTM layers"),
+        )
+
+        parser.add_argument(
+            "--num-layers",
+            default=5,
+            type=int,
+            help=("number of RNN layers"),
+        )
+
+        parser.add_argument(
+            "--in-kernel-size",
+            default=3,
+            type=int,
+            help=("kernel size of input convolution"),
+        )
+
+        parser.add_argument(
+            "--rnn-type",
+            default="lstm",
+            choices=[
+                "lstm",
+                "gru",
+            ],
+            help=("RNN type in [lstm, gru]"),
+        )
+
+        parser.add_argument(
+            "--bidirectional",
+            default=False,
+            action=ActionYesNo,
+            help="whether to use bidirectional RNN",
+        )
+
+        parser.add_argument(
+            "--subsample-input",
+            default=False,
+            action=ActionYesNo,
+            help="whether to subsample input features x4",
+        )
+        parser.add_argument("--subsampling-act",
+                            default="relu",
+                            help="activation for subsampler block")
+
+        if "dropout_rate" not in skip:
+            parser.add_argument("--dropout-rate",
+                                default=0,
+                                type=float,
+                                help="dropout probability")
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix,
+                                      action=ActionParser(parser=parser))
+
+    @staticmethod
+    def filter_finetune_args(**kwargs):
+
+        valid_args = (
+            "override_dropouts",
+            "dropout_rate",
+        )
+        args = dict((k, kwargs[k]) for k in valid_args if k in kwargs)
+        return args
+
+    @staticmethod
+    def add_finetune_args(parser, prefix=None, skip=set([])):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        try:
+            parser.add_argument(
+                "--override-dropouts",
+                default=False,
+                action=ActionYesNo,
+                help=(
+                    "whether to use the dropout probabilities passed in the "
+                    "arguments instead of the defaults in the pretrained model."
+ ), + ) + except: + pass + + try: + parser.add_argument("--dropout-rate", + default=0, + type=float, + help="dropout probability") + except: + pass + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py new file mode 100644 index 00000000..77c1234a --- /dev/null +++ b/hyperion/torch/narchs/rnn_transducer_decoder.py @@ -0,0 +1,826 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn as nn +import torchaudio +import torchaudio.functional +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +try: + import k2 +except ModuleNotFoundError: + from ..utils import dummy_k2 as k2 + +from ...utils.misc import filter_func_args +from ...utils.text import add_sos +from ..layer_blocks import TransducerConvPredictor as ConvPredictor +from ..layer_blocks import TransducerJoiner as Joiner +from ..layer_blocks import TransducerRNNPredictor as RNNPredictor +from .net_arch import NetArch + + +@dataclass +class Hypothesis: + ys: List[int] # predicted sequences + log_prob: float # log prob of ys + + # Optional LSTM predictor state. + pred_state: Optional[Tuple[torch.Tensor, ...]] = None + + +class RNNTransducerDecoder(NetArch): + """RNN-T Decoder composed of Predictor and Joiner networks + Implementation based on + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/transducer/transducer.py + + Attributes: + in_feats: input features dimension (encoder output) + vocab_size: Number of tokens of the modeling unit including blank. + predictor: Dictionary with the predictor options. + joiner: Dictionary with the joiner options. + blank_id: id of the null symbol. + rnnt_loss: type of rnn-t loss between torchaudio, k2 or k2_pruned. + rnnt_type: rnn-t variation between regular, modified or constrained. + delay_penalty: penalize symbol delay, which is used to make symbol + emit earlier. + reduction: type of reduction for rnn-t loss between sum or mean + prune_range: how many symbols to keep for each frame in k2 rnn-t + pruned loss. + lm_scale: language model scale in rnn-t smoothed loss. + am_scale: acoustic model scale in rnn-t smoothed loss. + simple_loss_scale: weight of rnn-t simple loss when using k2 pruned loss. + pruned_warmup_steps: number of steps to warm up the k2 rnn-t pruned loss + from 0.1 to 1. 
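+
+    Example (an illustrative sketch; the predictor and joiner dicts shown are
+    assumptions, see add_pred_args and add_joiner_args for the supported keys):
+        >>> decoder = RNNTransducerDecoder(
+        ...     in_feats=512,
+        ...     vocab_size=500,
+        ...     predictor={"pred_type": "rnn", "embed_dim": 256, "num_layers": 2,
+        ...                "hid_feats": 512, "out_feats": 512},
+        ...     joiner={"joiner_type": "basic", "hid_feats": 512},
+        ... )
+        >>> # x: (N, T, 512) encoder output, y: k2.RaggedTensor of token ids
+        >>> loss, loss_simple, loss_pruned = decoder(x, x_lengths, y)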
+ """ + + def __init__( + self, + in_feats: int, + vocab_size: int, + predictor: Dict, + joiner: Dict, + blank_id: int = 0, + rnnt_loss: str = "k2_pruned", + rnnt_type: str = "regular", + delay_penalty: float = 0.0, + reduction: str = "sum", + prune_range: int = 5, + lm_scale: float = 0.25, + am_scale: float = 0.0, + simple_loss_scale: float = 0.5, + pruned_warmup_steps: int = 2000, + ): + super().__init__() + self.in_feats = in_feats + self.vocab_size = vocab_size + self.predictor_args = predictor + self.joiner_args = joiner + self.blank_id = blank_id + self.rnnt_loss = rnnt_loss + self.rnnt_type = rnnt_type + self.delay_penalty = delay_penalty + self.reduction = reduction + self.prune_range = prune_range + self.lm_scale = lm_scale + self.am_scale = am_scale + self.simple_loss_scale = simple_loss_scale + self.pruned_warmup_steps = pruned_warmup_steps + + self._make_predictor() + self._make_joiner() + + if self.rnnt_loss == "k2_pruned": + self.simple_am_proj = nn.Linear(in_feats, vocab_size) + self.simple_lm_proj = nn.Linear(self.predictor.out_feats, vocab_size) + self.register_buffer("cur_step", torch.as_tensor(0, dtype=torch.int)) + + def _make_predictor(self): + pred_type = self.predictor_args["pred_type"] + self.predictor_args["in_feats"] = self.in_feats + self.predictor_args["vocab_size"] = self.vocab_size + self.predictor_args["blank_id"] = self.blank_id + if pred_type == "rnn": + pred_args = filter_func_args(RNNPredictor.__init__, self.predictor_args) + self.predictor = RNNPredictor(**pred_args) + elif pred_type == "conv": + pred_args = filter_func_args(ConvPredictor.__init__, self.predictor_args) + self.predictor = ConvPredictor(**pred_args) + self.predictor_args["out_feats"] = self.predictor.embed_dim + else: + raise ValueError(f"Unknown predictor type {pred_type}") + + def _make_joiner(self): + joiner_type = self.joiner_args["joiner_type"] + + if joiner_type == "basic": + pred_feats = self.predictor_args["out_feats"] + hid_feats = self.joiner_args["hid_feats"] + self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, self.vocab_size) + else: + raise ValueError(f"Unknown joiner type {joiner_type}") + + def get_config(self): + config = { + "in_feats": self.in_feats, + "vocab_size": self.vocab_size, + "predictor": self.predictor_args, + "joiner": self.joiner_args, + "blank_id": self.blank_id, + "rnnt_loss": self.rnnt_loss, + "rnnt_type": self.rnnt_type, + "delay_penalty": self.delay_penalty, + "reduction": self.reduction, + "prune_range": self.prune_range, + "lm_scale": self.lm_scale, + "am_scale": self.am_scale, + "simple_loss_scale": self.simple_loss_scale, + "pruned_warmup_steps": self.pruned_warmup_steps, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def _rnnt_loss_torchaudio( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + pred_out: torch.Tensor, + ): + logits = self.joiner(x, pred_out) + # rnnt_loss requires 0 padded targets + # Note: y does not start with SOS + y_padded = y.pad(mode="constant", padding_value=0) + x_lengths = x_lengths.to(torch.int32) + loss = torchaudio.functional.rnnt_loss( + logits=logits, + targets=y_padded.to(torch.int32), + logit_lengths=x_lengths, + target_lengths=y_lengths, + blank=self.blank_id, + reduction=self.reduction, + ) + return loss + + def _rnnt_loss_k2( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + pred_out: torch.Tensor, + ): + y_padded = y.pad(mode="constant", 
padding_value=0) + y_padded = y_padded.to(torch.int64) + boundary = torch.zeros((x.size(0), 4), dtype=torch.int64, device=x.device) + boundary[:, 2] = y_lengths + boundary[:, 3] = x_lengths + + logits = self.joiner(x, pred_out) + + with torch.cuda.amp.autocast(enabled=False): + loss = k2.rnnt_loss( + logits=logits.float(), + symbols=y_padded, + termination_symbol=self.blank_id, + boundary=boundary, + rnnt_type=self.rnnt_type, + delay_penalty=self.delay_penalty, + reduction=self.reduction, + ) + return loss + + def _rnnt_loss_k2_pruned( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + pred_out: torch.Tensor, + ): + y_padded = y.pad(mode="constant", padding_value=0) + y_padded = y_padded.to(torch.int64) + boundary = torch.zeros((x.size(0), 4), dtype=torch.int64, device=x.device) + boundary[:, 2] = y_lengths + boundary[:, 3] = x_lengths + + am_simple = self.simple_am_proj(x) + lm_simple = self.simple_lm_proj(pred_out) + with torch.cuda.amp.autocast(enabled=False): + loss_simple, (px_grad, py_grad) = k2.rnnt_loss_smoothed( + lm=lm_simple.float(), + am=am_simple.float(), + symbols=y_padded, + termination_symbol=self.blank_id, + lm_only_scale=self.lm_scale, + am_only_scale=self.am_scale, + boundary=boundary, + rnnt_type=self.rnnt_type, + delay_penalty=self.delay_penalty, + reduction=self.reduction, + return_grad=True, + ) + + # ranges : [B, T, prune_range] + ranges = k2.get_rnnt_prune_ranges( + px_grad=px_grad, + py_grad=py_grad, + boundary=boundary, + s_range=self.prune_range, + ) + + # am_pruned : [B, T, prune_range, encoder_dim] + # lm_pruned : [B, T, prune_range, decoder_dim] + am_pruned, lm_pruned = k2.do_rnnt_pruning( + am=self.joiner.enc_proj(x), + lm=self.joiner.pred_proj(pred_out), + ranges=ranges, + ) + + # logits : [B, T, prune_range, vocab_size] + + # project_input=False since we applied the decoder's input projections + # prior to do_rnnt_pruning (this is an optimization for speed). 
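+        # ranges[b, t] holds the prune_range symbol positions kept for frame t,
+        # so the pruned loss below is evaluated only on those (t, u) pairs
+        # instead of the full T x U lattice, which keeps memory usage low.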
+        logits = self.joiner(am_pruned, lm_pruned, project_input=False)
+
+        with torch.cuda.amp.autocast(enabled=False):
+            loss_pruned = k2.rnnt_loss_pruned(
+                logits=logits.float(),
+                symbols=y_padded,
+                ranges=ranges,
+                termination_symbol=self.blank_id,
+                boundary=boundary,
+                rnnt_type=self.rnnt_type,
+                delay_penalty=self.delay_penalty,
+                reduction=self.reduction,
+            )
+
+        if self.cur_step > self.pruned_warmup_steps:
+            simple_loss_scale = self.simple_loss_scale
+            pruned_loss_scale = 1.0
+        else:
+            r = self.cur_step / self.pruned_warmup_steps
+            simple_loss_scale = 1.0 - r * (1.0 - self.simple_loss_scale)
+            pruned_loss_scale = 0.1 + 0.9 * r
+            self.cur_step += 1
+            # print(simple_loss_scale, pruned_loss_scale)
+
+        loss = simple_loss_scale * loss_simple + pruned_loss_scale * loss_pruned
+
+        return loss, loss_simple, loss_pruned
+
+    def forward(
+        self, x: torch.Tensor, x_lengths: torch.Tensor, y: k2.RaggedTensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        # get y_lengths
+        row_splits = y.shape.row_splits(1)
+        y_lengths = row_splits[1:] - row_splits[:-1]
+        # shift y adding <sos> token
+        sos_y = add_sos(y, sos_id=self.blank_id)
+        sos_y_padded = sos_y.pad(mode="constant", padding_value=self.blank_id)
+        sos_y_padded = sos_y_padded.to(torch.int64)
+        # apply predictor and joiner
+        pred_out, _ = self.predictor(sos_y_padded)
+        loss_simple = loss_pruned = None
+        if self.rnnt_loss == "k2_pruned":
+            loss, loss_simple, loss_pruned = self._rnnt_loss_k2_pruned(
+                x, x_lengths, y, y_lengths, pred_out
+            )
+        elif self.rnnt_loss == "k2":
+            loss = self._rnnt_loss_k2(x, x_lengths, y, y_lengths, pred_out)
+        elif self.rnnt_loss == "torchaudio":
+            loss_simple = loss_pruned = None
+            loss = self._rnnt_loss_torchaudio(x, x_lengths, y, y_lengths, pred_out)
+
+        return loss, loss_simple, loss_pruned
+
+    def decode(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor = None,
+        method="time_sync_beam_search",
+        beam_width: int = 5,
+        max_sym_per_frame: int = 3,
+        max_sym_per_utt: int = 1000,
+    ) -> List[int]:
+        if method == "time_sync_beam_search":
+            return self.decode_time_sync_beam_search(
+                x, x_lengths, beam_width=beam_width
+            )
+        elif method == "align_length_sync_beam_search":
+            return self.decode_align_length_sync_beam_search(
+                x, x_lengths, beam_width=beam_width, max_sym_per_utt=max_sym_per_utt
+            )
+        elif method == "greedy":
+            return self.decode_greedy(
+                x,
+                x_lengths,
+                max_sym_per_frame=max_sym_per_frame,
+                max_sym_per_utt=max_sym_per_utt,
+            )
+
+    def decode_greedy(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor = None,
+        max_sym_per_frame: int = 3,
+        max_sym_per_utt: int = 1000,
+    ) -> List[int]:
+        """
+        Args:
+          x: encoder embeddings with shape = (N, T, C)
+        Returns:
+          Decoded tokens
+        """
+        assert x.ndim == 3
+
+        # support only batch_size == 1 for now
+        assert x.size(0) == 1, x.size(0)
+        blank_id = self.blank_id
+        device = x.device
+
+        sos = torch.tensor([blank_id], device=device, dtype=torch.int64).reshape(1, 1)
+        pred_out, state = self.predictor(sos)
+        T = x.size(1)
+        t = 0
+        hyp = []
+
+        sym_per_frame = 0
+        sym_per_utt = 0
+
+        while t < T and sym_per_utt < max_sym_per_utt:
+            x_t = x[:, t : t + 1, :]
+            logits = self.joiner(x_t, pred_out)  # (1, 1, 1, vocab_size)
+
+            log_prob = logits.log_softmax(dim=-1)  # (1, 1, 1, vocab_size)
+            # TODO: Use logits.argmax()
+            y = log_prob.argmax()
+            if y != blank_id:
+                hyp.append(y.item())
+                y = y.reshape(1, 1)
+                pred_out, state = self.predictor(y, state)
+
+                sym_per_utt += 1
+                sym_per_frame += 1
+
+            if y == blank_id or sym_per_frame >
max_sym_per_frame: + sym_per_frame = 0 + t += 1 + + return hyp + + def decode_time_sync_beam_search( + self, x: torch.Tensor, x_lengths: torch.Tensor = None, beam_width: int = 5 + ) -> List[int]: + assert x.ndim == 3 + assert x.size(0) == 1, x.size(0) + + blank_id = self.blank_id + device = x.device + + sos = torch.tensor([blank_id], device=device).reshape(1, 1) + pred_out, state = self.predictor(sos) + T = x.size(1) + t = 0 + B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] + max_u = 20000 # terminate after this number of steps + u = 0 + + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = {} + + while t < T and u < max_u: + x_t = x[:, t : t + 1, :] + A = B + B = [] + + while u < max_u: + y_star = max(A, key=lambda hyp: hyp.log_prob) + A.remove(y_star) + + # Note: y_star.ys is unhashable, i.e., cannot be used + # as a key into a dict + cached_key = "_".join(map(str, y_star.ys)) + + if cached_key not in cache: + pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) + + pred_out, pred_state = self.predictor( + pred_in, + y_star.pred_state, + ) + cache[cached_key] = (pred_out, pred_state) + else: + pred_out, pred_state = cache[cached_key] + + logits = self.joiner(x_t, pred_out) + log_prob = logits.log_softmax(dim=-1) + # log_prob is (1, 1, 1, vocab_size) + log_prob = log_prob.squeeze() + # Now log_prob is (vocab_size,) + + # If we choose blank here, add the new hypothesis to B. + # Otherwise, add the new hypothesis to A + + # First, choose blank + skip_log_prob = log_prob[blank_id] + new_y_star_log_prob = y_star.log_prob + skip_log_prob.item() + # print("tuAB0", t, u, len(y_star.ys), y_star.log_prob, + # skip_log_prob.item(), new_y_star_log_prob) + # ys[:] returns a copy of ys + new_y_star = Hypothesis( + ys=y_star.ys[:], + log_prob=new_y_star_log_prob, + # Caution: Use y_star.decoder_state here + pred_state=y_star.pred_state, + ) + B.append(new_y_star) + + topk_log_prob = log_prob.topk(beam_width, dim=-1) + + # Second, choose other labels + # for i, v in enumerate(log_prob.tolist()): + for v, i in zip(*topk_log_prob): + v = v.item() + i = i.item() + if i == blank_id: + continue + new_ys = y_star.ys + [i] + new_log_prob = y_star.log_prob + v + new_hyp = Hypothesis( + ys=new_ys, + log_prob=new_log_prob, + pred_state=pred_state, + ) + A.append(new_hyp) + + u += 1 + # check whether B contains more than "beam" elements more probable + # than the most probable in A + A_most_probable = max(A, key=lambda hyp: hyp.log_prob) + # print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + B = sorted( + [hyp for hyp in B if hyp.log_prob > A_most_probable.log_prob], + key=lambda hyp: hyp.log_prob, + reverse=True, + ) + # print("tuAB2", + # t, + # u, + # len(A), + # A_most_probable.log_prob, + # len(B), + # flush=True) + if len(B) >= beam_width: + B = B[:beam_width] + break + t += 1 + + best_hyp = max(B, key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + ys = best_hyp.ys[1:] # [1:] to remove the blank + return ys + + def decode_align_length_sync_beam_search( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + beam_width: int = 5, + max_sym_per_utt: int = 1000, + ) -> List[int]: + assert x.ndim == 3 + assert x.size(0) == 1, x.size(0) + + blank_id = self.blank_id + device = x.device + + sos = torch.tensor([blank_id], device=device).reshape(1, 1) + pred_out, state = self.predictor(sos) + T = x.size(1) + # t = 0 + B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] + # max_u = 20000 # terminate after this number of steps + # u = 
0 + + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = {} + F = [] + # for t < T and u < max_u: + for i in range(T + max_sym_per_utt): + A = [] + for y_star in B: + # while u < max_u: + u = len(y_star.ys) - 1 + t = i - u + if t >= T: + continue + + # y_star = max(A, key=lambda hyp: hyp.log_prob) + # A.remove(y_star) + x_t = x[:, t : t + 1, :] + # Note: y_star.ys is unhashable, i.e., cannot be used + # as a key into a dict + cached_key = "_".join(map(str, y_star.ys)) + + if cached_key not in cache: + pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) + + pred_out, pred_state = self.predictor( + pred_in, + y_star.pred_state, + ) + cache[cached_key] = (pred_out, pred_state) + else: + pred_out, pred_state = cache[cached_key] + + logits = self.joiner(x_t, pred_out) + log_prob = logits.log_softmax(dim=-1) # (1, 1, 1, vocab_size) + log_prob = log_prob.squeeze() # (vocab_size,) + + # First, choose blank + skip_log_prob = log_prob[blank_id] + new_y_star_log_prob = y_star.log_prob + skip_log_prob.item() + # print("tuAB0", t, u, len(y_star.ys), y_star.log_prob, + # skip_log_prob.item(), new_y_star_log_prob) + # ys[:] returns a copy of ys + new_y_star = Hypothesis( + ys=y_star.ys[:], + log_prob=new_y_star_log_prob, + # Caution: Use y_star.pred_state here + pred_state=y_star.pred_state, + ) + A.append(new_y_star) + if t == T - 1: + F.append(y_star) + + topk_log_prob = log_prob.topk(beam_width, dim=-1) + + # Second, choose other labels + # for i, v in enumerate(log_prob.tolist()): + for v, i in zip(*topk_log_prob): + v = v.item() + i = i.item() + if i == blank_id: + continue + new_ys = y_star.ys + [i] + new_log_prob = y_star.log_prob + v + new_hyp = Hypothesis( + ys=new_ys, + log_prob=new_log_prob, + pred_state=pred_state, + ) + A.append(new_hyp) + + # check whether B contains more than "beam_width" elements more probable + # than the most probable in A + # A_most_probable = max(A, key=lambda hyp: hyp.log_prob) + # print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + B0 = sorted( + [hyp for hyp in A], + key=lambda hyp: hyp.log_prob, + reverse=True, + ) + B = [] + B_ys = set() + for hyp in B0: + hyp_ys = tuple(hyp.ys) # to make ys hashable + if hyp_ys not in B_ys: + B.append(hyp) + B_ys.add(hyp_ys) + # print("tuAB2", + # t, + # u, + # len(A), + # A_most_probable.log_prob, + # len(B), + # flush=True) + if len(B) >= beam_width: + B = B[:beam_width] + break + + best_hyp = max(F, key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + ys = best_hyp.ys[1:] # [1:] to remove the blank + return ys + + def change_config( + self, + override_dropouts=False, + embed_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + ): + logging.info("changing decoder config") + self.predictor.change_config( + override_dropouts, embed_dropout_rate, rnn_dropout_rate + ) + + @staticmethod + def filter_args(**kwargs): + args = filter_func_args(RNNTransducerDecoder.__init__, kwargs) + return args + + @staticmethod + def filter_finetune_args(**kwargs): + args = filter_func_args(RNNTransducerDecoder.change_config, kwargs) + return args + + @staticmethod + def add_pred_args(parser): + pred_parser = ArgumentParser(prog="") + pred_parser.add_argument( + "--pred-type", + default="rnn", + choices=["rnn", "conv"], + help="""type of predictor between RNN and Convolutional [rnn, conv]""", + ) + pred_parser.add_argument( + "--embed-dim", default=1024, type=int, help=("token embedding dimension") + ) + pred_parser.add_argument( + "--embed-dropout-rate", + default=0.0, + 
type=float,
+            help=("dropout prob for predictor input embeddings"),
+        )
+        pred_parser.add_argument(
+            "--rnn-dropout-rate",
+            default=0.0,
+            type=float,
+            help="""dropout prob for decoder RNN """,
+        )
+        pred_parser.add_argument(
+            "--rnn-type",
+            default="lstm",
+            choices=["lstm", "gru"],
+            help="""type of recurrent network for the predictor in [lstm, gru]""",
+        )
+
+        pred_parser.add_argument(
+            "--num-layers",
+            default=2,
+            type=int,
+            help="""number of layers of the predictor """,
+        )
+
+        pred_parser.add_argument(
+            "--hid-feats",
+            default=512,
+            type=int,
+            help="""hidden features of the predictor""",
+        )
+        pred_parser.add_argument(
+            "--out-feats",
+            default=512,
+            type=int,
+            help="""output features of the predictor""",
+        )
+        pred_parser.add_argument(
+            "--context-size",
+            default=2,
+            type=int,
+            help="""context length of the convolutional
+            predictor, 1->bigram, 2-> trigram,...""",
+        )
+
+        parser.add_argument("--predictor", action=ActionParser(parser=pred_parser))
+
+    @staticmethod
+    def add_joiner_args(parser):
+        joiner_parser = ArgumentParser(prog="")
+        joiner_parser.add_argument(
+            "--joiner-type",
+            default="basic",
+            choices=["basic"],
+            help="""type of joiner network, there is only basic joiner for now""",
+        )
+        joiner_parser.add_argument(
+            "--hid-feats",
+            default=512,
+            type=int,
+            help="""hidden features of the joiner""",
+        )
+        parser.add_argument("--joiner", action=ActionParser(parser=joiner_parser))
+
+    @staticmethod
+    def add_class_args(
+        parser, prefix=None, skip=set(["in_feats", "blank_id", "vocab_size"])
+    ):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        if "in_feats" not in skip:
+            parser.add_argument(
+                "--in-feats", type=int, required=True, help=("input feature dimension")
+            )
+        if "blank_id" not in skip:
+            parser.add_argument(
+                "--blank-id",
+                type=int,
+                default=0,
+                help=("blank id from tokenizer model"),
+            )
+        if "vocab_size" not in skip:
+            parser.add_argument(
+                "--vocab-size",
+                type=int,
+                required=True,
+                help=("output prediction dimension"),
+            )
+
+        RNNTransducerDecoder.add_pred_args(parser)
+        RNNTransducerDecoder.add_joiner_args(parser)
+        parser.add_argument(
+            "--rnnt-loss",
+            default="k2_pruned",
+            choices=["torchaudio", "k2", "k2_pruned"],
+            help="""type of rnn-t loss between torchaudio, k2 or k2_pruned.""",
+        )
+        parser.add_argument(
+            "--rnnt-type",
+            default="regular",
+            choices=["regular", "modified", "constrained"],
+            help="""type of rnn-t loss between regular, modified or constrained.""",
+        )
+        parser.add_argument(
+            "--delay-penalty",
+            default=0.0,
+            type=float,
+            help="""penalize symbol delay, which is used to make symbol emit earlier
+            for streaming models.""",
+        )
+        parser.add_argument(
+            "--reduction",
+            default="sum",
+            choices=["sum", "mean"],
+            help="""type of reduction for rnn-t loss between sum or mean""",
+        )
+        parser.add_argument(
+            "--prune-range",
+            default=5,
+            type=int,
+            help="""how many symbols to keep for each frame in k2 rnn-t
+            pruned loss.""",
+        )
+        parser.add_argument(
+            "--lm-scale",
+            default=0.25,
+            type=float,
+            help="""language model scale in rnn-t smoothed loss""",
+        )
+        parser.add_argument(
+            "--am-scale",
+            default=0.0,
+            type=float,
+            help="""acoustic model scale in rnn-t smoothed loss""",
+        )
+        parser.add_argument(
+            "--simple-loss-scale",
+            default=0.5,
+            type=float,
+            help="""weight of rnn-t simple loss when using k2 pruned loss""",
+        )
+        parser.add_argument(
+            "--pruned-warmup-steps",
+            default=2000,
+            type=int,
+            help="""number of steps to warm up the k2 rnn-t pruned
loss + from 0.1 to 1""", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def add_finetune_args(parser, prefix=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model." + ), + ) + parser.add_argument( + "--embed-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder input embeddings"), + ) + parser.add_argument( + "--rnn-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder RNN "), + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/spinenet.py b/hyperion/torch/narchs/spinenet.py index 4185d9c4..4349dbe1 100644 --- a/hyperion/torch/narchs/spinenet.py +++ b/hyperion/torch/narchs/spinenet.py @@ -3,17 +3,27 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import logging + +import numpy as np + import torch import torch.nn as nn -from torch.nn import Conv1d, Linear, BatchNorm1d - +from torch.nn import BatchNorm1d, Conv1d, Linear + +from ..layer_blocks import ( + BlockSpec, + Res2NetBasicBlock, + Res2NetBNBlock, + ResNetBasicBlock, + ResNetBNBlock, + ResNetInputBlock, + SpineConv, + SpineEndpoints, + SpineResample, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF -from ..layer_blocks import ResNetInputBlock, ResNetBasicBlock, ResNetBNBlock -from ..layer_blocks import Res2NetBNBlock, Res2NetBasicBlock -from ..layer_blocks import BlockSpec, SpineResample, SpineEndpoints, SpineConv from .net_arch import NetArch SPINENET_BLOCK_SPECS = [ @@ -109,7 +119,7 @@ def __init__( do_endpoint_conv=True, concat_ax=3, upsampling_type="nearest", - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, diff --git a/hyperion/torch/narchs/spinenet_factory.py b/hyperion/torch/narchs/spinenet_factory.py index 02e36244..871b37e9 100644 --- a/hyperion/torch/narchs/spinenet_factory.py +++ b/hyperion/torch/narchs/spinenet_factory.py @@ -2,7 +2,7 @@ Copyright 2020 Magdalena Rybicka Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from .spinenet import * @@ -44,7 +44,7 @@ def create( conv_channels=64, base_channels=64, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, @@ -243,7 +243,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass @@ -266,3 +266,40 @@ def add_class_args(parser, prefix=None): outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + + valid_args = ( + "override_dropouts", + "dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + 
outer_parser = parser + parser = ArgumentParser(prog="") + + try: + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model." + ), + ) + except: + pass + + try: + parser.add_argument("--dropout-rate", default=0, type=float, help="dropout") + except: + pass + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/tdnn.py b/hyperion/torch/narchs/tdnn.py index 6cdcbf85..55e47e6a 100644 --- a/hyperion/torch/narchs/tdnn.py +++ b/hyperion/torch/narchs/tdnn.py @@ -9,9 +9,9 @@ import torch.nn as nn from torch.nn import Linear +from ..layer_blocks import TDNNBlock from ..layers import ActivationFactory as AF from ..layers import NormLayer1dFactory as NLF -from ..layer_blocks import TDNNBlock from .net_arch import NetArch diff --git a/hyperion/torch/narchs/tdnn_factory.py b/hyperion/torch/narchs/tdnn_factory.py index 584e9243..77f69b9c 100644 --- a/hyperion/torch/narchs/tdnn_factory.py +++ b/hyperion/torch/narchs/tdnn_factory.py @@ -3,11 +3,11 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from .tdnn import TDNNV1 from .etdnn import ETDNNV1 from .resetdnn import ResETDNNV1 +from .tdnn import TDNNV1 class TDNNFactory(object): @@ -21,7 +21,7 @@ def create( kernel_size=3, dilation=1, dilation_factor=1, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_units=0, out_act=None, dropout_rate=0, @@ -194,7 +194,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass @@ -264,3 +264,40 @@ def add_class_args(parser, prefix=None): # help='TDNN options') add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + + valid_args = ( + "override_dropouts", + "dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + try: + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model." 
+ ), + ) + except: + pass + + try: + parser.add_argument("--dropout-rate", default=0, type=float, help="dropout") + except: + pass + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/torch_na_loader.py b/hyperion/torch/narchs/torch_na_loader.py index 542742fa..58152fc7 100644 --- a/hyperion/torch/narchs/torch_na_loader.py +++ b/hyperion/torch/narchs/torch_na_loader.py @@ -5,39 +5,31 @@ import torch -from .fcnet import FCNetV1 - -from .tdnn import TDNNV1 -from .etdnn import ETDNNV1 -from .resetdnn import ResETDNNV1 - -from .resnet import * - -from .transformer_encoder_v1 import TransformerEncoderV1 +from .audio_feats_mvn import AudioFeatsMVN +from .classif_head import ClassifHead from .conformer_encoder_v1 import ConformerEncoderV1 - -from .dc1d_encoder import DC1dEncoder from .dc1d_decoder import DC1dDecoder -from .dc2d_encoder import DC2dEncoder +from .dc1d_encoder import DC1dEncoder from .dc2d_decoder import DC2dDecoder - -from .resnet1d_encoder import ResNet1dEncoder +from .dc2d_encoder import DC2dEncoder +from .efficient_net import EfficientNet +from .etdnn import ETDNNV1 +from .fcnet import FCNetV1 +from .resetdnn import ResETDNNV1 +from .resnet import * from .resnet1d_decoder import ResNet1dDecoder -from .resnet2d_encoder import ResNet2dEncoder +from .resnet1d_encoder import ResNet1dEncoder from .resnet2d_decoder import ResNet2dDecoder - -from .efficient_net import EfficientNet - -from .classif_head import ClassifHead - -from .audio_feats_mvn import AudioFeatsMVN +from .resnet2d_encoder import ResNet2dEncoder +from .tdnn import TDNNV1 +from .transformer_encoder_v1 import TransformerEncoderV1 class TorchNALoader(object): @staticmethod def load(file_path, extra_objs={}): - model_data = torch.load(model_path) + model_data = torch.load(file_path) cfg = model_data["model_cfg"] class_name = cfg["class_name"] del cfg["class_name"] diff --git a/hyperion/torch/narchs/transformer_encoder_v1.py b/hyperion/torch/narchs/transformer_encoder_v1.py index 8d479f24..f8b50491 100644 --- a/hyperion/torch/narchs/transformer_encoder_v1.py +++ b/hyperion/torch/narchs/transformer_encoder_v1.py @@ -3,15 +3,15 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn +from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler +from ..layer_blocks import TransformerEncoderBlockV1 as EBlock from ..layers import ActivationFactory as AF from ..layers import PosEncoder, RelPosEncoder -from ..layer_blocks import TransformerEncoderBlockV1 as EBlock -from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler from .net_arch import NetArch @@ -64,7 +64,7 @@ def __init__( in_layer_type="conv2d-sub", rel_pos_enc=False, causal_pos_enc=False, - hid_act="relu6", + hid_act="relu", norm_before=True, concat_after=False, padding_idx=-1, @@ -125,9 +125,6 @@ def __init__( if self.norm_before: self.norm = nn.LayerNorm(d_model) - # def _make_in_layer(self, in_layer_type, in_feats, d_model, - # dropout_rate, pos_dropout_rate, - # padding_idx, time_dim): def _make_in_layer(self): in_feats = self.in_feats @@ -157,7 +154,7 @@ def _make_in_layer(self): nn.Embedding(in_feats, d_model, padding_idx=self.padding_idx), pos_enc ) elif isinstance(self.in_layer_type, nn.Module): - self.in_layer = nn.Sequential(in_layer_type, pos_enc) + self.in_layer = 
nn.Sequential(self.in_layer_type, pos_enc) elif self.in_layer_type is None: self.in_layer = pos_enc else: @@ -240,6 +237,31 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) + def change_dropouts(self, pos_dropout_rate, att_dropout_rate, ff_dropout_rate): + + assert pos_dropout_rate == 0 or self.pos_dropout_rate > 0 + assert att_dropout_rate == 0 or self.att_dropout_rate > 0 + assert ff_dropout_rate == 0 or self.ff_dropout_rate > 0 + + for module in self.modules(): + if isinstance(module, PosEncoder): + for layer in module.modules(): + if isinstance(layer, nn.Dropout): + layer.p = pos_dropout_rate + + elif isinstance(module, EBlock): + for layer in module.modules(): + if isinstance(layer, nn.Dropout): + layer.p = ff_dropout_rate + + for layer in module.self_attn.modules(): + if isinstance(layer, nn.Dropout): + layer.p = att_dropout_rate + + self.pos_dropout_rate = pos_dropout_rate + self.att_dropout_rate = att_dropout_rate + self.ff_dropout_rate = ff_dropout_rate + def in_context(self): return (self.att_context, self.att_context) @@ -386,7 +408,7 @@ def add_class_args(parser, prefix=None, in_feats=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/xvector_classif.py b/hyperion/torch/narchs/xvector_classif.py deleted file mode 100644 index e87c3db1..00000000 --- a/hyperion/torch/narchs/xvector_classif.py +++ /dev/null @@ -1,145 +0,0 @@ -""" - Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np - -import torch.nn as nn -from torch.nn import Linear, BatchNorm1d, Dropout - -from ..layers import ActivationFactory as AF -from .net_arch import NetArch - - -class XVectorClassifV1(NetArch): - def __init__( - self, - input_units, - num_classes, - embed_dim=512, - num_hid_layers=2, - hid_act="relu", - outputs="logits", - use_batchnorm=True, - dropout_rate=0, - ): - - super(XVectorClassifV1, self).__init__() - assert num_hid_layers >= 1, "num_hid_layers (%d < 1)" % num_hid_layers - - self.num_hid_layers = num_hid_layers - self.input_units = input_units - self.embed_dim = embed_dim - self.num_classes = num_classes - self.use_batchnorm = use_batchnorm - self.dropout_rate = dropout_rate - self.outputs = outputs - - if isinstance(hid_units, list): - assert num_hid_layers == len(embed_dim) - else: - embed_dim = [embed_dim for i in range(num_hid_layers)] - - units = [input_units] + embed_dim - - # fully connected layers - fc_layers = [] - for i in range(1, num_hid_layers + 1): - fc_layers.append(Linear(units[i - 1], units[i])) - - self.fc_layers = nn.ModuleList(fc_layers) - - # hidden activations - self.hid_acts = None - if hid_act is not None: - hid_acts = [] - for i in range(num_hid_layers): - hid_act = AF.create(hid_act) - hid_acts.append(hid_act) - self.hid_acts = nn.ModuleList(hid_acts) - - # batch normalization - self.batchnorm_layers = None - if use_batchnorm: - batchnorm_layers = [] - for i in range(num_hid_layers): - batchnorm_layers.append(BatchNorm1d(units[i])) - self.batchnorm_layers = nn.ModuleList(batchnorm_layers) - - # dropout - self.dropout_layers = None - if dropout_rate > 0: - dropout_layers = [] - for i in range(num_hid_layers): - dropout_layers.append(Dropout(dropout_rate)) - self.dropout_layers = nn.ModuleList(dropout_layers) - - # output layers - 
self.logits_layer = Linear(units[-1], num_classes) - - def forward(self, x): - - for l in range(self.num_hid_layers): - if self.use_batchnorm: - x = self.batchnorm_layers[l](x) - - x = self.fc_layers[l](x) - if self.hid_acts is not None: - x = self.hid_acts[l](x) - - if self.dropout_rate > 0: - x = self.dropout_layers[l](x) - - y = self.logits_layer(x) - - return y - - def extract_embed(self, x, embed_layers=0): - - if isinstance(embed_layers, int): - embed_layers = [embed_layers] - - last_embed_layer = np.max(embed_layers) - embed_layers = set(embed_layers) - - embed_list = [] - for l in range(self.num_hid_layers): - if self.use_batchnorm: - x = self.batchnorm_layers[l](x) - - x = self.fc_layers[l](x) - if l in embed_layers: - embed_list.append(x) - - if l == last_embed_layer: - break - - if self.hid_acts is not None: - x = self.hid_acts[l](x) - - if self.dropout_rate > 0: - x = self.dropout_layers[l](x) - - y = torch.cat((embed_list), dim=-1) - return y - - def get_config(self): - - if self.hid_acts is None: - hid_act = None - else: - hid_act = AF.get_config(self.hid_acts[0]) - - config = { - "num_hid_layers": self.num_hid_layers, - "num_classes": self.num_classes, - "embed_dim": self.embed_dim, - "input_units": self.input_units, - "use_batchnorm": self.use_batchnorm, - "dropout_rate": self.dropout_rate, - "hid_act": hid_act, - } - - base_config = super(XVectorClassifV1, self).get_config() - return dict(list(base_config.items()) + list(config.items())) diff --git a/hyperion/torch/optim/__init__.py b/hyperion/torch/optim/__init__.py index cba89796..33364d63 100644 --- a/hyperion/torch/optim/__init__.py +++ b/hyperion/torch/optim/__init__.py @@ -3,6 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from .ema import ExpMovingAvg +from .factory import OptimizerFactory from .fgsm import FGSM from .radam import RAdam -from .factory import OptimizerFactory diff --git a/hyperion/torch/optim/ema.py b/hyperion/torch/optim/ema.py new file mode 100644 index 00000000..f120bf21 --- /dev/null +++ b/hyperion/torch/optim/ema.py @@ -0,0 +1,74 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import math + +import torch +from jsonargparse import ActionParser, ArgumentParser + + +class ExpMovingAvg: + def __init__( + self, params, init_momentum=0.996, momentum=0.996, warmup_steps=0, global_step=0 + ): + if not isinstance(params, list): + params = [params] + self.params = [list(p) for p in params] + self.init_momentum = init_momentum + self._momentum = momentum + self.warmup_steps = warmup_steps + self.global_step = global_step + + def state_dict(self): + """Returns the state of the optimizer as a :class:`dict` needed to restart the training.""" + return {"global_step": self.global_step} + + def load_state_dict(self, state_dict): + """Loads the optimizer state. + + Arguments: + state_dict (dict): scheduler state. Should be an object returned + from a call to :meth:`state_dict`. 
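+
+        Note: only ``global_step`` is stored, since the momentum schedule is a
+        deterministic function of the step and the constructor arguments.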
+ """ + self.__dict__.update(state_dict) + + @property + def momentum(self): + if self.global_step >= self.warmup_steps: + return self._momentum + else: + alpha = (1 + math.cos(self.global_step / self.warmup_steps * math.pi)) / 2 + return self.init_momentum * alpha + self._momentum * (1 - alpha) + + @torch.no_grad() + def step(self, new_params): + if not isinstance(new_params, list): + new_params = [new_params] + + assert len(self.params) == len(new_params) + momentum = self.momentum + for param_group, new_param_group in zip(self.params, new_params): + for p, p_new in zip(param_group, new_param_group): + p.data.mul_(momentum).add_((1 - momentum) * p_new.data) + + self.global_step += 1 + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--init-momentum", default=0.996, type=float, help="initial momentum" + ) + parser.add_argument( + "--momentum", default=0.996, type=float, help="final momentum" + ) + parser.add_argument( + "--warmup-steps", default=0, type=int, help="momentum warmup steps" + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/optim/factory.py b/hyperion/torch/optim/factory.py index 4fa7b186..b01d3b62 100644 --- a/hyperion/torch/optim/factory.py +++ b/hyperion/torch/optim/factory.py @@ -2,17 +2,19 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser import logging -from ...utils.misc import filter_args - import torch import torch.optim as optim +from jsonargparse import ActionParser, ArgumentParser + +from ...utils.misc import filter_args, filter_func_args from .radam import RAdam class OptimizerFactory(object): + """Factory class to create different types of optimizers.""" + @staticmethod def create( params, @@ -36,7 +38,6 @@ def create( max_iter=20, oss=False, ): - kwargs = locals() base_opt = None if opt_type == "sgd": @@ -149,29 +150,30 @@ def create( @staticmethod def filter_args(**kwargs): - valid_args = ( - "opt_type", - "lr", - "momentum", - "beta1", - "beta2", - "rho", - "eps", - "weight_decay", - "amsgrad", - "nesterov", - "lambd", - "asgd_alpha", - "t0", - "rmsprop_alpha", - "centered", - "lr_decay", - "init_acc_val", - "max_iter", - "oss", - ) - - return filter_args(valid_args, kwargs) + return filter_func_args(OptimizerFactory.create, kwargs) + # valid_args = ( + # "opt_type", + # "lr", + # "momentum", + # "beta1", + # "beta2", + # "rho", + # "eps", + # "weight_decay", + # "amsgrad", + # "nesterov", + # "lambd", + # "asgd_alpha", + # "t0", + # "rmsprop_alpha", + # "centered", + # "lr_decay", + # "init_acc_val", + # "max_iter", + # "oss", + # ) + + # return filter_args(valid_args, kwargs) @staticmethod def add_class_args(parser, prefix=None): @@ -320,6 +322,5 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='optimizer options') add_argparse_args = add_class_args diff --git a/hyperion/torch/optim/radam.py b/hyperion/torch/optim/radam.py index 459646c1..1aa98517 100644 --- a/hyperion/torch/optim/radam.py +++ b/hyperion/torch/optim/radam.py @@ -1,9 +1,9 @@ """ Code taken from https://github.com/LiyuanLucasLiu/RAdam/blob/master/radam/radam.py """ -# import math + import torch from torch.optim.optimizer import Optimizer, required @@ 
-61,13 +61,11 @@ def __setstate__(self, state):
         super().__setstate__(state)
 
     def step(self, closure=None):
-
         loss = None
         if closure is not None:
             loss = closure()
 
         for group in self.param_groups:
-
             for p in group["params"]:
                 if p.grad is None:
                     continue
diff --git a/hyperion/torch/seq_embed/__init__.py b/hyperion/torch/seq_embed/__init__.py
index 24ee9555..8ecc2cf8 100644
--- a/hyperion/torch/seq_embed/__init__.py
+++ b/hyperion/torch/seq_embed/__init__.py
@@ -6,9 +6,9 @@
 
 # xvectors had been moved to models
 # we import them here for backwards compatibility
-from ..models.xvector import XVector
-from ..models.tdnn_xvector import TDNNXVector
-from ..models.resnet_xvector import ResNetXVector
 from ..models.efficient_net_xvector import EfficientNetXVector
-from ..models.transformer_xvector_v1 import TransformerXVectorV1
+from ..models.resnet_xvector import ResNetXVector
 from ..models.spinenet_xvector import SpineNetXVector
+from ..models.tdnn_xvector import TDNNXVector
+from ..models.transformer_xvector_v1 import TransformerXVectorV1
+from ..models.xvector import XVector
diff --git a/hyperion/torch/tokenizers/__init__.py b/hyperion/torch/tokenizers/__init__.py
new file mode 100644
index 00000000..42afcaf1
--- /dev/null
+++ b/hyperion/torch/tokenizers/__init__.py
@@ -0,0 +1,7 @@
+"""
+ Copyright 2024 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+from .hyp_tokenizer import HypTokenizer
+from .sp_tokenizer import SPTokenizer
diff --git a/hyperion/torch/tokenizers/hyp_tokenizer.py b/hyperion/torch/tokenizers/hyp_tokenizer.py
new file mode 100644
index 00000000..0d6e9efb
--- /dev/null
+++ b/hyperion/torch/tokenizers/hyp_tokenizer.py
@@ -0,0 +1,44 @@
+"""
+ Copyright 2024 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+from pathlib import Path
+
+import yaml
+
+from ...utils.misc import PathLike
+
+
+class HypTokenizer:
+    """Base class for tokenizers in Hyperion"""
+
+    registry = {}
+
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+        HypTokenizer.registry[cls.__name__] = cls
+
+    def normalize(self, text):
+        return text
+
+    def encode(self, x):
+        pass
+
+    def decode(self, x):
+        pass
+
+    @staticmethod
+    def auto_load(file_path: PathLike):
+        file_path = Path(file_path)
+        with open(file_path, "r") as f:
+            cfg = yaml.safe_load(f)
+
+        class_name = cfg["class_name"]
+        del cfg["class_name"]
+        if class_name in HypTokenizer.registry:
+            class_obj = HypTokenizer.registry[class_name]
+        else:
+            raise Exception("unknown object with class_name=%s" % (class_name))
+
+        return class_obj.load(file_path)
diff --git a/hyperion/torch/tokenizers/sp_tokenizer.py b/hyperion/torch/tokenizers/sp_tokenizer.py
new file mode 100644
index 00000000..c3fa35f9
--- /dev/null
+++ b/hyperion/torch/tokenizers/sp_tokenizer.py
@@ -0,0 +1,93 @@
+"""
+ Copyright 2024 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+from pathlib import Path
+from typing import Dict
+
+import sentencepiece as spm
+import yaml
+
+from ...utils.misc import PathLike
+from .hyp_tokenizer import HypTokenizer
+
+
+class SPTokenizer(HypTokenizer):
+    """Sentence Piece Tokenizer.
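+
+    Example (an illustrative sketch; "tokenizer.model" is a hypothetical
+    trained sentencepiece model file):
+        >>> import sentencepiece as spm
+        >>> sp_model = spm.SentencePieceProcessor()
+        >>> sp_model.load("tokenizer.model")
+        >>> tokenizer = SPTokenizer(sp_model, uppercase_text=True)
+        >>> tokens = tokenizer.encode(tokenizer.normalize("hello world"))
+        >>> text = tokenizer.decode(tokens)
+    """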
+
+    def __init__(
+        self, sp_model: spm.SentencePieceProcessor, uppercase_text: bool = True
+    ):
+        super().__init__()
+        self.sp_model = sp_model
+        self.uppercase_text = uppercase_text
+        self.blank_id = self.sp_model.piece_to_id("<blk>")
+        self.vocab_size = self.sp_model.get_piece_size()
+        self._token2id = None
+
+    @property
+    def token2id(self):
+        if self._token2id is not None:
+            return self._token2id
+
+        token2id: Dict[str, int] = {
+            self.sp_model.id_to_piece(i): i for i in range(self.sp_model.vocab_size())
+        }
+        self._token2id = token2id
+        return token2id
+
+    def normalize(self, text):
+        if self.uppercase_text:
+            text = text.upper()
+        return text
+
+    def encode(self, text):
+        return self.sp_model.encode(text, out_type=int)
+
+    def decode(self, tokens):
+        return self.sp_model.decode(tokens)
+
+    def save(self, file_path: PathLike, sp_model_prefix: str = "tokenizer"):
+        file_path = Path(file_path)
+        if file_path.suffix != ".yaml":
+            output_dir = file_path
+            file_path = output_dir / (sp_model_prefix + ".yaml")
+        else:
+            output_dir = file_path.parent
+
+        output_dir.mkdir(parents=True, exist_ok=True)
+        sp_model_file = sp_model_prefix + ".model"
+        sp_tokens_file = sp_model_prefix + ".tokens"
+        cfg = {
+            "class_name": self.__class__.__name__,
+            "sp_model": sp_model_file,
+            "sp_tokens": sp_tokens_file,
+            "uppercase_text": self.uppercase_text,
+        }
+        with open(file_path, "w") as f:
+            yaml.dump(cfg, f)
+
+        with open(output_dir / sp_tokens_file, "w", encoding="utf-8") as f:
+            for sym, i in self.token2id.items():
+                f.write(f"{sym} {i}\n")
+
+    @classmethod
+    def load(cls, file_path: PathLike):
+        file_path = Path(file_path)
+        if file_path.suffix == ".model":
+            sp_model = spm.SentencePieceProcessor()
+            sp_model.load(str(file_path))
+            return cls(sp_model)
+
+        with open(file_path, "r") as f:
+            cfg = yaml.safe_load(f)
+
+        sp_model_file = Path(cfg["sp_model"])
+        if not sp_model_file.is_file():
+            sp_model_file = file_path.parent / sp_model_file
+        assert sp_model_file.is_file(), f"{sp_model_file} not found"
+
+        sp_model = spm.SentencePieceProcessor()
+        sp_model.load(str(sp_model_file))
+        return cls(sp_model)
diff --git a/hyperion/torch/torch_defs.py b/hyperion/torch/torch_defs.py
index a567de50..b08beaeb 100644
--- a/hyperion/torch/torch_defs.py
+++ b/hyperion/torch/torch_defs.py
@@ -5,7 +5,6 @@
 
 import torch
 
-
 str2torch_dtype = {
     "float32": torch.float32,
     "float64": torch.float64,
diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py
index 66c4d028..242402bc 100644
--- a/hyperion/torch/torch_model.py
+++ b/hyperion/torch/torch_model.py
@@ -2,32 +2,116 @@
     Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
     Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-
+import logging
+from collections import OrderedDict as ODict
 from copy import deepcopy
+from pathlib import Path
+from typing import Callable, Dict, Optional, Union
 
 import torch
 import torch.nn as nn
 
+from ..utils.misc import PathLike
+
 
 class TorchModel(nn.Module):
+    """Base class for all Pytorch Models and NNet architectures"""
+
+    registry = {}
+
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+        TorchModel.registry[cls.__name__] = cls
+
+    def __init__(self, bias_weight_decay=None):
+        super().__init__()
+        self._train_mode = "full"
+        self.bias_weight_decay = bias_weight_decay
+
     def get_config(self):
         config = {"class_name": self.__class__.__name__}
-
         return config
 
     def copy(self):
         return deepcopy(self)
 
-    def save(self, file_path):
-        file_dir = os.path.dirname(file_path)
-        if not (os.path.isdir(file_dir)):
-            os.makedirs(file_dir, exist_ok=True)
+    def clone(self):
+        return deepcopy(self)
 
-        config = self.get_config()
-        torch.save(
-            {"model_cfg": self.get_config(), "model_state_dict": self.state_dict()}
+    def trainable_parameters(self,
diff --git a/hyperion/torch/torch_defs.py b/hyperion/torch/torch_defs.py index a567de50..b08beaeb 100644 --- a/hyperion/torch/torch_defs.py +++ b/hyperion/torch/torch_defs.py @@ -5,7 +5,6 @@ import torch - str2torch_dtype = { "float32": torch.float32, "float64": torch.float64, diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py index 66c4d028..242402bc 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -2,32 +2,116 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - +import logging +from collections import OrderedDict as ODict from copy import deepcopy +from pathlib import Path +from typing import Callable, Dict, Optional, Union import torch import torch.nn as nn +from ..utils.misc import PathLike + class TorchModel(nn.Module): + """Base class for all Pytorch Models and NNet architectures""" + + registry = {} + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + TorchModel.registry[cls.__name__] = cls + + def __init__(self, bias_weight_decay=None): + super().__init__() + self._train_mode = "full" + self.bias_weight_decay = bias_weight_decay + def get_config(self): config = {"class_name": self.__class__.__name__} - return config def copy(self): return deepcopy(self) - def save(self, file_path): - file_dir = os.path.dirname(file_path) - if not (os.path.isdir(file_dir)): - os.makedirs(file_dir, exist_ok=True) + def clone(self): + return deepcopy(self) - config = self.get_config() - torch.save( - {"model_cfg": self.get_config(), "model_state_dict": self.state_dict()} + def trainable_parameters(self, recurse: bool = True): + for param in self.parameters(recurse=recurse): + if param.requires_grad: + yield param + + def non_trainable_parameters(self, recurse: bool = True): + for param in self.parameters(recurse=recurse): + if not param.requires_grad: + yield param + + def trainable_named_parameters(self, recurse: bool = True): + for name, param in self.named_parameters(recurse=recurse): + if param.requires_grad: + yield name, param + + def non_trainable_named_parameters(self, recurse: bool = True): + for name, param in self.named_parameters(recurse=recurse): + if not param.requires_grad: + yield name, param + + def parameter_summary(self, verbose: bool = False): + trainable_params = sum(p.numel() for p in self.trainable_parameters()) + non_trainable_params = sum(p.numel() for p in self.non_trainable_parameters()) + buffer_params = sum(p.numel() for p in self.buffers()) + non_trainable_total = non_trainable_params + buffer_params + total_params = trainable_params + non_trainable_total + if verbose: + logging.info( + "total-params=%d, trainable-params=%d, non-trainable-params+buffers=%d, non-trainable-params=%d, buffer-params=%d", + total_params, + trainable_params, + non_trainable_total, + non_trainable_params, + buffer_params, + ) + return ( + total_params, + trainable_params, + non_trainable_total, + non_trainable_params, + buffer_params, ) + def print_parameter_list(self): + for n, p in self.trainable_named_parameters(): + logging.info("trainable: %s", n) + + for n, p in self.non_trainable_named_parameters(): + logging.info("non_trainable: %s", n) + + for n, p in self.named_buffers(): + logging.info("buffers: %s", n) + + def has_param_groups(self): + return self.bias_weight_decay is not None + + def trainable_param_groups(self): + if self.bias_weight_decay is None: + return [{"params": self.trainable_parameters()}] + + regularized = [] + not_regularized = [] + for name, param in self.trainable_named_parameters(): + # we do not regularize biases nor Norm parameters + if name.endswith(".bias") or len(param.shape) == 1: + not_regularized.append(param) + else: + regularized.append(param) + + return [ + {"params": regularized}, + {"params": not_regularized, "weight_decay": self.bias_weight_decay}, + ] + def freeze(self): for param in self.parameters(): param.requires_grad = False @@ -36,10 +120,68 @@ def unfreeze(self): for param in self.parameters(): param.requires_grad = True + def change_dropouts(self, dropout_rate): + """Changes all dropout rates of the model.""" + for module in self.modules(): + if isinstance(module, nn.modules.dropout._DropoutNd): + module.p = dropout_rate + if isinstance(module, nn.RNNBase): + module.dropout = dropout_rate + + if hasattr(self, "dropout_rate"): + assert dropout_rate == 0 or self.dropout_rate > 0 + self.dropout_rate = dropout_rate + + @property + def train_mode(self): + return self._train_mode + + @train_mode.setter + def train_mode(self, mode): + self.set_train_mode(mode) + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + + self._train_mode = mode + + def _train(self, train_mode: str): + if train_mode == "full": + super().train(True) + elif train_mode == "frozen": + super().train(False) + else: + raise ValueError(f"invalid train_mode={train_mode}") + + def train(self, mode: bool = True): + if not mode: + super().train(False) + return + + self._train(self.train_mode) + + @staticmethod + def valid_train_modes(): + return ["full", "frozen"] + +
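Since `trainable_param_groups` and `parameter_summary` are new here, a brief sketch of how they are meant to plug into an optimizer; the tiny subclass is illustrative only and assumes hyperion is installed:

```python
import torch
import torch.nn as nn

from hyperion.torch.torch_model import TorchModel


class TinyModel(TorchModel):  # illustrative-only subclass
    def __init__(self):
        # bias_weight_decay routes biases and 1-d (Norm) parameters
        # into their own optimizer group with this weight decay.
        super().__init__(bias_weight_decay=0.0)
        self.proj = nn.Linear(10, 4)

    def forward(self, x):
        return self.proj(x)


model = TinyModel()
# Two groups: weights use the optimizer default decay (1e-2),
# biases/Norm params use the bias_weight_decay set above (0.0).
optimizer = torch.optim.AdamW(
    model.trainable_param_groups(), lr=1e-3, weight_decay=1e-2
)
model.parameter_summary(verbose=True)  # logs trainable/frozen/buffer counts
```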
def save(self, file_path): + file_path = Path(file_path) + file_path.parent.mkdir(parents=True, exist_ok=True) + torch.save( + {"model_cfg": self.get_config(), "model_state_dict": self.state_dict()}, + file_path, + ) + @staticmethod def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None): model_data = None - if cfg is None: + if cfg is None or state_dict is None: assert file_path is not None model_data = torch.load(file_path) if cfg is None: @@ -80,3 +222,170 @@ def device(self): ) return next(iter(devices)) + + @staticmethod + def _remove_module_prefix(state_dict): + import re + + p = re.compile("^(module\.)+") + if p.match(list(state_dict.keys())[0]) is not None: + state_dict = ODict((p.sub("", k), v) for k, v in state_dict.items()) + + return state_dict + + @staticmethod + def _fix_xvector_cfg(cfg): + # We renamed AM-softmax scale parameter s to cos_scale + if "s" in cfg: + cfg["cos_scale"] = cfg.pop("s") + + return cfg + + @staticmethod + def _fix_hf_wav2xvector(cfg, state_dict): + key = "feat_fusion_method" + if key in cfg: + fuser_type = cfg.pop(key) + feat_fuser = { + "feat_fuser": {"fuser_type": fuser_type}, + "mvn": None, + "spec_augment": None, + } + cfg["feat_fuser"] = feat_fuser + state_dict["feat_fuser.feat_fuser.feat_fuser"] = state_dict.pop( + "feat_fuser" + ) + + return cfg, state_dict + + @staticmethod + def _fix_model_compatibility(class_obj, cfg, state_dict): + """Function that fixes compatibility issues with deprecated models + + Args: + class_obj: class type of the model. + cfg: configuration dictionary that inits the model. + + Returns: + Fixed configuration dictionary and state dictionary. + """ + # for compatibility with older x-vector models + XVector = TorchModel.registry["XVector"] + if issubclass(class_obj, XVector): + cfg = TorchModel._fix_xvector_cfg(cfg) + + # switch old feature fuser to new feature fuser in w2v x-vectors + HFWav2XVector = TorchModel.registry["HFWav2XVector"] + if issubclass(class_obj, HFWav2XVector): + cfg, state_dict = TorchModel._fix_hf_wav2xvector(cfg, state_dict) + + return cfg, state_dict + + @staticmethod + def _is_hf_path(file_path: Path): + # hf path can have only 2 dir levels + return len(file_path.parents) == 2 + + @staticmethod + def _get_from_hf( + file_path: Path, cache_dir: PathLike = None, local_dir: PathLike = None + ): + from huggingface_hub import hf_hub_download + + return hf_hub_download( + repo_id=str(file_path.parent), + filename=file_path.name, + cache_dir=cache_dir, + local_dir=local_dir, + ) + + @staticmethod + def _try_to_get_from_hf( + file_path: Path, cache_dir: PathLike = None, local_dir: PathLike = None + ): + if str(file_path)[:3] == "hf:": + # hf: prefix indicates to download from hub + file_path = Path(str(file_path)[3:]) + assert TorchModel._is_hf_path( + file_path + ), f"{file_path} is not a valid HF path" + file_path = TorchModel._get_from_hf( + file_path, cache_dir=cache_dir, local_dir=local_dir + ) + return Path(file_path) + elif not file_path.is_file(): + # if no prefix but file not in local dir try to get it from hub + if not TorchModel._is_hf_path(file_path): + return file_path + + try: + file_path = TorchModel._get_from_hf(file_path) + return Path(file_path) + except: + return file_path + + else: + # file is local + return file_path
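These helpers let `auto_load` (next hunk) pull checkpoints from the Hugging Face Hub as well as from disk. A sketch with hypothetical paths:

```python
from hyperion.torch.torch_model import TorchModel

# "hf:" forces a Hub download: the helper strips the prefix and fetches
# file_path.name from the repo given by file_path.parent via hf_hub_download.
model = TorchModel.auto_load("hf:my-speaker-models/model.pth")

# Plain paths are loaded locally when the file exists; otherwise the Hub is
# tried as a fallback. Dispatch uses the "class_name" stored in model_cfg.
model = TorchModel.auto_load("exp/xvector_nnet/model_ep0070.pth")
```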
+ + @staticmethod + def auto_load( + file_path: PathLike, + model_name: Optional[str] = None, + extra_objs: dict = {}, + map_location: Optional[ + Union[ + Callable[[torch.Tensor, str], torch.Tensor], + torch.device, + str, + Dict[str, str], + ] + ] = None, + cache_dir: PathLike = None, + local_dir: PathLike = None, + ): + file_path = Path(file_path) + file_path = TorchModel._try_to_get_from_hf( + file_path, cache_dir=cache_dir, local_dir=local_dir + ) + + assert file_path.is_file(), f"TorchModel file: {file_path} not found" + + if map_location is None: + map_location = torch.device("cpu") + + model_data = torch.load(file_path, map_location=map_location) + cfg = model_data["model_cfg"] + class_name = cfg["class_name"] + del cfg["class_name"] + if class_name in TorchModel.registry: + class_obj = TorchModel.registry[class_name] + elif class_name in extra_objs: + class_obj = extra_objs[class_name] + else: + raise Exception("unknown object with class_name=%s" % (class_name)) + + if model_name is None: + model_name = "model" + state_dict = model_data[f"{model_name}_state_dict"] + + if "n_averaged" in state_dict: + del state_dict["n_averaged"] + + state_dict = TorchModel._remove_module_prefix(state_dict) + cfg, state_dict = TorchModel._fix_model_compatibility( + class_obj, cfg, state_dict + ) + + return class_obj.load(cfg=cfg, state_dict=state_dict) + # num_tries = 3 + # for tries in range(num_tries): + # try: + # return class_obj.load(cfg=cfg, state_dict=state_dict) + # except RuntimeError as err: + # # remove module prefix when is trained with dataparallel + # if tries == num_tries - 1: + # # if it failed the 3 trials raise exception + # raise err + # # remove module prefix when is trained with dataparallel + # state_dict = ODict((p.sub("", k), v) for k, v in state_dict.items()) diff --git a/hyperion/torch/torch_model_loader.py b/hyperion/torch/torch_model_loader.py index 142656d1..2273bee8 100644 --- a/hyperion/torch/torch_model_loader.py +++ b/hyperion/torch/torch_model_loader.py @@ -3,13 +3,13 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from collections import OrderedDict as ODict import re +from collections import OrderedDict as ODict import torch -from .narchs import * from .models import * +from .narchs import * class TorchModelLoader(object): @@ -55,7 +55,7 @@ def load(file_path, extra_objs={}, map_location=None): if "n_averaged" in state_dict: del state_dict["n_averaged"] - cfg = self._fix_compatibilty(class_obj, cfg) + cfg = TorchModelLoader._fix_compatibility(class_obj, cfg) p = re.compile("^module\.") num_tries = 3 diff --git a/hyperion/torch/tpm/__init__.py b/hyperion/torch/tpm/__init__.py new file mode 100644 index 00000000..e3a17e4f --- /dev/null +++ b/hyperion/torch/tpm/__init__.py @@ -0,0 +1,6 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from .hf import HFHubert, HFWav2Vec2, HFWavLM diff --git a/hyperion/torch/tpm/hf/__init__.py b/hyperion/torch/tpm/hf/__init__.py new file mode 100644 index 00000000..d0f91785 --- /dev/null +++ b/hyperion/torch/tpm/hf/__init__.py @@ -0,0 +1,8 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from .hf_hubert import HFHubert +from .hf_wav2vec2 import HFWav2Vec2 +from .hf_wavlm import HFWavLM
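The `tpm.hf` wrappers below build on the plain HuggingFace models; for orientation, this is roughly what the wrapped `HubertModel` does on its own (the checkpoint name is illustrative, any Hubert checkpoint works):

```python
import torch
from transformers import HubertModel

# The underlying HF model maps raw 16 kHz waveforms to frame-level
# hidden states; HFHubert wraps this with hyperion's model interface.
hf_model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
wav = torch.randn(2, 32000)  # (batch, samples), ~2 s at 16 kHz
with torch.no_grad():
    out = hf_model(wav)
print(out.last_hidden_state.shape)  # torch.Size([2, 99, 768])
```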
diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py new file mode 100644 index 00000000..638bf561 --- /dev/null +++ b/hyperion/torch/tpm/hf/hf_hubert.py @@ -0,0 +1,749 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +from typing import Callable, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import HubertConfig, HubertModel + +from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs +from .hf_wav2vec_base import HFWav2VecBase + + +class HFHubert(HFWav2VecBase): + r"""This is a wrapper over the HuggingFace Hubert model. + See documentation: https://huggingface.co/docs/transformers/main/en/model_doc/hubert + + This wrapper makes the HuggingFace model have the same interface + as other hyperion models. It also adds extra functionality. + + The config. parameters are the same as in the HuggingFace HubertConfig class. + + Attributes: + pretrained_model_path (`str`, defaults to None): file path or HuggingFace Hub path to + pre-trained model. + normalize_input (`bool`, defaults to True): whether or not to zero-mean unit-variance + normalize the input. + use_input_attention_mask (`bool`, defaults to False): whether we should input an + attention mask to the wav2vec model. + vocab_size (`int`, defaults to 32): vocabulary size of the + model. Defines the different tokens that can be represented by the + *inputs_ids* passed to the forward method. + hidden_size (`int`, defaults to 768): dimensionality of the encoder layers and + the pooler layer. + num_hidden_layers (`int`, defaults to 12): number of hidden layers in the + Transformer encoder. + num_attention_heads (`int`, defaults to 12): number of attention heads for + each attention layer in the Transformer encoder. + intermediate_size (`int`, defaults to 3072): dimensionality of the + feed-forward layer in the Transformer encoder. + hidden_act (`str` or `function`, defaults to `"gelu"`): the non-linear + activation function (function or string) in the encoder and pooler. + If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout (`float`, defaults to 0.1): the dropout probability for all + fully connected layers in the embeddings, encoder, and pooler. + activation_dropout (`float`, defaults to 0.1): the dropout probability for all + intermediate layer in feedforward transformer layers. + attention_dropout (`float`, defaults to 0.1): the dropout ratio for the + attention probabilities. + layerdrop (`float`, defaults to 0.1): prob. of dropping a layer. + initializer_range (`float`, defaults to 0.02): the standard deviation of the + truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, defaults to 1e-12): the epsilon used by the layer + normalization layers. + feat_extract_norm (`str`, defaults to `"group"`): + the norm to be applied to 1D convolutional layers in feature encoder. + One of `"group"` for group normalization of only the first 1D convolutional + layer or `"layer"` for layer normalization of all 1D convolutional layers. + feat_proj_dropout (`float`, defaults to 0.0): the dropout probability for output + of the feature encoder. + feat_extract_activation (`str, `optional`, defaults to `"gelu"`): the non-linear + activation function (function or string) in the 1D convolutional layers of the feature + extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + conv_dim (`Tuple[int]`, defaults to `(512, 512, 512, 512, 512, 512, 512)`): + a tuple of integers defining the number of input and output channels of each 1D convolutional layer in the + feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
+ conv_stride (`Tuple[int]`, defaults to `(5, 2, 2, 2, 2, 2, 2)`): + a tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length + of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. + conv_kernel (`Tuple[int]`, defaults to `(10, 3, 3, 3, 3, 3, 3)`): + a tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The + length of *conv_kernel* defines the number of convolutional layers and has to match the length of + *conv_dim*. + conv_bias (`bool`, defaults to `False`): whether the 1D convolutional layers have a bias. + num_conv_pos_embeddings (`int`, defaults to 128): + number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional + embeddings layer. + num_conv_pos_embedding_groups (`int`, defaults to 16): + number of groups of 1D convolutional positional embeddings layer. + do_stable_layer_norm (`bool`, defaults to `False`): + whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is + True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is + False` corresponds to applying layer norm after the attention layer. + apply_spec_augment (`bool`, defaults to `True`): + whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see + [SpecAugment: A Simple Data Augmentation Method for Automatic Speech + Recognition](https://arxiv.org/abs/1904.08779). + mask_time_prob (`float`, defaults to 0.05): + percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If + reasoning from the probability of each feature vector to be chosen as the start of the vector span to be + masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the + actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`. + mask_time_length (`int`, defaults to 10): + length of vector span along the time axis. + mask_time_min_masks (`int`, defaults to 2): + the minimum number of masks of length `mask_time_length` generated along the time axis, each time step, + irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length < + mask_time_min_masks'' + mask_feature_prob (`float`, defaults to 0.0): + percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over + the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector + span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap + may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is + True`. + mask_feature_length (`int`, defaults to 10): + length of vector span along the feature axis. + mask_feature_min_masks (`int`, defaults to 0): + The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time + step, irrespectively of `mask_feature_prob`.
Only relevant if + ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' + cache_dir (str or os.PathLike): path to a directory in which a downloaded pretrained + model configuration should be cached if the standard cache should not be used. + force_download (`bool`, defaults to `False`): whether or not to force the (re-)download of + the model weights and configuration files and override the + cached versions if they exist. + resume_download (`bool`, defaults to `False`): whether or not to delete incompletely + received files. Will attempt to resume the download if such a file exists. + revision (`str`, defaults to `"main"`): the specific model version to use. + It can be a branch name, a tag name, or a commit id. + ignore_pretrained (`bool` defaults to False): if True, it ignores the pretrained_model_path + and inits the model from the configuration. This is set to True for models that have already + been finetuned. + override_dropouts (`bool` defaults to False): if True, it ignores the dropout probs. in the pretrained model + and uses the ones passed as arguments. + override_spec_augment (`bool` defaults to False): if True, it ignores the spec. augment. + configuration in the pretrained model and uses the ones passed in the arguments. + left_encoder_context (`int`): past context frames used by the transformer encoder when the signal is evaluated + chunk by chunk, if it is too long to fit in GPU. + right_encoder_context: (`int`): future context frames used by the transformer encoder. + sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. + encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference.
+ """ + + def __init__( + self, + pretrained_model_path: Optional[Union[str, os.PathLike]] = None, + normalize_input: bool = True, + use_input_attention_mask: bool = False, + vocab_size: int = 32, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: Union[str, Callable] = "gelu", + hidden_dropout: float = 0.1, + activation_dropout: float = 0.1, + attention_dropout: float = 0.1, + layerdrop: float = 0.1, + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-12, + feat_extract_norm: str = "group", + feat_proj_dropout: float = 0.0, + feat_extract_activation: Union[str, Callable] = "gelu", + conv_dim: Tuple[int] = (512, 512, 512, 512, 512, 512, 512), + conv_stride: Tuple[int] = (5, 2, 2, 2, 2, 2, 2), + conv_kernel: Tuple[int] = (10, 3, 3, 3, 3, 3, 3), + conv_bias: bool = False, + num_conv_pos_embeddings: int = 128, + num_conv_pos_embedding_groups: int = 16, + do_stable_layer_norm: bool = False, + apply_spec_augment: bool = True, + mask_time_prob: float = 0.05, + mask_time_length: int = 10, + mask_time_min_masks: int = 2, + mask_feature_prob: float = 0.0, + mask_feature_length: int = 10, + mask_feature_min_masks: int = 0, + cache_dir: Union[str, os.PathLike] = "./.cache/hyperion_hf", + force_download: bool = False, + resume_download: bool = False, + revision: str = "main", + drop_layers_gt: Optional[int] = None, + ignore_pretrained: bool = False, + override_dropouts: bool = False, + override_spec_augment: bool = False, + left_encoder_context: int = 16, + right_encoder_context: int = 16, + sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, + ): + super().__init__( + pretrained_model_path=pretrained_model_path, + normalize_input=normalize_input, + use_input_attention_mask=use_input_attention_mask, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + revision=revision, + drop_layers_gt=drop_layers_gt, + ignore_pretrained=ignore_pretrained, + override_dropouts=override_dropouts, + override_spec_augment=override_spec_augment, + left_encoder_context=left_encoder_context, + right_encoder_context=right_encoder_context, + sample_frequency=sample_frequency, + feat_extract_lr=feat_extract_lr, + encoder_lr=encoder_lr, + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, + ) + + if pretrained_model_path is not None and not ignore_pretrained: + rank = ddp_get_rank() + if rank == 0: + logging.info(f"Downloading HF model from {pretrained_model_path}") + # rank 0 downloads the model from HF web + self.hf_model = HubertModel.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + revision=revision, + ) + # all ranks wait until the model is downloaded + ddp_wait_for_all_procs() + if rank > 0: + # the rest of ranks should read the configuration from the cache. 
+ self.hf_model = HubertModel.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=False, + resume_download=False, + revision=revision, + ) + ddp_wait_for_all_procs() + self.hf_model.config.layerdrop = 0.0 + self.change_config( + override_dropouts=self.override_dropouts, + override_spec_augment=self.override_spec_augment, + hidden_dropout=hidden_dropout, + activation_dropout=activation_dropout, + attention_dropout=attention_dropout, + feat_proj_dropout=feat_proj_dropout, + mask_time_prob=mask_time_prob, + mask_time_length=mask_time_length, + mask_time_min_masks=mask_time_min_masks, + mask_feature_prob=mask_feature_prob, + mask_feature_length=mask_feature_length, + mask_feature_min_masks=mask_feature_min_masks, + ) + else: + hf_config = HubertConfig( + vocab_size=vocab_size, + hidden_size=hidden_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + hidden_dropout=hidden_dropout, + activation_dropout=activation_dropout, + attention_dropout=attention_dropout, + feat_proj_dropout=feat_proj_dropout, + layerdrop=0.0, # layerdrop, + initializer_range=initializer_range, + layer_norm_eps=layer_norm_eps, + feat_extract_norm=feat_extract_norm, + feat_extract_activation=feat_extract_activation, + conv_dim=conv_dim, + conv_stride=conv_stride, + conv_kernel=conv_kernel, + conv_bias=conv_bias, + num_conv_pos_embeddings=num_conv_pos_embeddings, + num_conv_pos_embedding_groups=num_conv_pos_embedding_groups, + do_stable_layer_norm=do_stable_layer_norm, + apply_spec_augment=apply_spec_augment, + mask_time_prob=mask_time_prob, + mask_time_length=mask_time_length, + mask_time_min_masks=mask_time_min_masks, + mask_feature_prob=mask_feature_prob, + mask_feature_length=mask_feature_length, + mask_feature_min_masks=mask_feature_min_masks, + ) + self.hf_model = HubertModel(hf_config) + + if drop_layers_gt is not None: + self.drop_upper_layers(drop_layers_gt) + + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + + self.ignore_pretrained = True + + @property + def num_encoder_layers(self): + return self.hf_config.num_hidden_layers + + @property + def hidden_size(self): + return self.hf_config.hidden_size + + def change_dropouts( + self, + hidden_dropout: float = 0.1, + activation_dropout: float = 0.1, + attention_dropout: float = 0.1, + feat_proj_dropout: float = 0.1, + **kwargs, + ): + import transformers.models.hubert.modeling_hubert as t + + self.hf_model.config.hidden_dropout = hidden_dropout + self.hf_model.config.activation_dropout = activation_dropout + self.hf_model.config.attention_dropout = attention_dropout + self.hf_model.config.feat_proj_dropout = feat_proj_dropout + + self.hf_model.feature_projection.dropout.p = feat_proj_dropout + for module in self.hf_model.encoder.modules(): + if isinstance(module, nn.Dropout): + module.p = hidden_dropout + + for module in self.hf_model.encoder.modules(): + if isinstance(module, t.HubertAttention): + module.dropout = activation_dropout + if isinstance(module, t.HubertFeatureProjection): + module.intermediate_dropout.p = activation_dropout + + def drop_upper_layers(self, max_layers: int): + if max_layers >= self.hf_config.num_hidden_layers: + return + + layers = self.hf_model.encoder.layers + self.hf_model.encoder.layers = nn.ModuleList( + [l for i, l in enumerate(layers) if i < max_layers] + ) + self.hf_config.num_hidden_layers = max_layers + + def 
get_config(self): + """Returns the configuration arguments for the object in a dictionary.""" + config = self.hf_model.config.to_dict() + config = self.filter_args(**config) + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def filter_args(**kwargs): + args_base = HFWav2VecBase.filter_args(**kwargs) + valid_args = ( + "vocab_size", + "hidden_size", + "num_hidden_layers", + "num_attention_heads", + "intermediate_size", + "hidden_act", + "hidden_dropout", + "activation_dropout", + "attention_dropout", + "feat_proj_dropout", + "layerdrop", + "initializer_range", + "layer_norm_eps", + "feat_extract_norm", + "feat_extract_activation", + "conv_dim", + "conv_stride", + "conv_kernel", + "conv_bias", + "num_conv_pos_embeddings", + "num_conv_pos_embedding_groups", + "do_stable_layer_norm", + "apply_spec_augment", + "mask_time_prob", + "mask_time_length", + "mask_time_min_masks", + "mask_feature_prob", + "mask_feature_length", + "mask_feature_min_masks", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args.update(args_base) + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2VecBase.add_class_args(parser) + + parser.add_argument( + "--vocab-size", + default=32, + type=int, + help=( + "vocabulary size of the " + "model. Defines the different tokens that can be represented by the " + "*inputs_ids* passed to the forward method." + ), + ) + parser.add_argument( + "--hidden-size", + default=768, + type=int, + help=("dimensionality of the encoder layers and the pooler layer."), + ) + parser.add_argument( + "--num-hidden-layers", + default=12, + type=int, + help=("number of hidden layers in the Transformer encoder"), + ) + parser.add_argument( + "--num-attention-heads", + default=12, + type=int, + help=( + "number of attention heads for " + "each attention layer in the Transformer encoder" + ), + ) + parser.add_argument( + "--intermediate-size", + default=3072, + type=int, + help=( + "dimensionality of the " "feed-forward layer in the Transformer encoder" + ), + ) + parser.add_argument( + "--hidden-act", + default="gelu", + choices=["gelu", "relu", "selu", "gelu_new"], + help=( + "the non-linear " + "activation function (function or string) in the encoder and pooler" + ), + ) + parser.add_argument( + "--hidden-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "fully connected layers in the embeddings, encoder, and pooler" + ), + ) + parser.add_argument( + "--activation-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "intermediate layer in feedforward transformer layers" + ), + ) + parser.add_argument( + "--attention-dropout", + default=0.1, + type=float, + help=("the dropout ratio for the attention probabilities"), + ) + parser.add_argument( + "--layerdrop", + default=0.1, + type=float, + help=("prob. 
of dropping a layer"), + ) + parser.add_argument( + "--initializer-range", + default=0.02, + type=float, + help=( + "the standard deviation of the " + "truncated_normal_initializer for initializing all weight matrices" + ), + ) + parser.add_argument( + "--layer-norm-eps", + default=1e-12, + type=float, + help=( + "the standard deviation of the " + "truncated_normal_initializer for initializing all weight matrices" + ), + ) + parser.add_argument( + "--feat-extract-norm", + default="group", + choices=["group", "layer"], + help=( + "the norm to be applied to 1D convolutional layers in feature encoder. " + "One of `group` for group normalization of only the first 1D convolutional " + "layer or `layer` for layer normalization of all 1D convolutional layers" + ), + ) + parser.add_argument( + "--feat-proj-dropout", + default=0.1, + type=float, + help=("the dropout probability for output of the feature encoder"), + ) + parser.add_argument( + "--feat-extract-activation", + default="gelu", + choices=["gelu", "relu", "selu", "gelu_new"], + help=( + "the non-linear activation function (function or string) in the 1D " + "convolutional layers of the feature extractor" + ), + ) + parser.add_argument( + "--conv-dim", + default=[512, 512, 512, 512, 512, 512, 512], + nargs="+", + type=int, + help=( + "a tuple of integers defining the number of input and output channels of each 1D convolutional layer in the " + "feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers" + ), + ) + parser.add_argument( + "--conv-stride", + default=[5, 2, 2, 2, 2, 2, 2], + nargs="+", + type=int, + help=( + "a tuple of integers defining the stride of each 1D convolutional layer in the feature encoder" + ), + ) + parser.add_argument( + "--conv-kernel", + default=[10, 3, 3, 3, 3, 3, 3], + nargs="+", + type=int, + help=( + "a tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder" + ), + ) + parser.add_argument( + "--conv-bias", + default=False, + action=ActionYesNo, + help=("whether the 1D convolutional layers have a bias"), + ) + parser.add_argument( + "--num-conv-pos-embeddings", + default=128, + type=int, + help=( + "number of convolutional positional embeddings. 
Defines the kernel size of 1D convolutional positional " + "embeddings layer" + ), + ) + parser.add_argument( + "--num-conv-pos-embedding-groups", + default=16, + type=int, + help=("number of groups of 1D convolutional positional embeddings layer"), + ) + parser.add_argument( + "--do-stable-layer-norm", + default=False, + action=ActionYesNo, + help=( + "whether to apply *stable* layer norm architecture of the Transformer encoder" + ), + ) + parser.add_argument( + "--apply-spec-augment", + default=True, + action=ActionYesNo, + help=( + "whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder" + ), + ) + parser.add_argument( + "--mask-time-prob", + default=0.05, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the time axis which will be masked" + ), + ) + parser.add_argument( + "--mask-time-length", + default=10, + type=int, + help=("length of vector span along the time axis"), + ) + parser.add_argument( + "--mask-time-min-masks", + default=2, + type=int, + help=( + "the minimum number of masks of length `mask_time_length` generated along the time axis" + ), + ) + parser.add_argument( + "--mask-feature-prob", + default=0.0, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked" + ), + ) + parser.add_argument( + "--mask-feature-length", + default=10, + type=int, + help=(" length of vector span along the feature axis"), + ) + parser.add_argument( + "--mask-feature-min-masks", + default=0, + type=int, + help=( + "The minimum number of masks of length `mask_feature_length` generated along the feature axis" + ), + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + args_base = HFWav2VecBase.filter_finetune_args(**kwargs) + valid_args = ( + "hidden_dropout", + "activation_dropout", + "attention_dropout", + "feat_proj_dropout", + "apply_spec_augment", + "mask_time_prob", + "mask_time_length", + "mask_time_min_masks", + "mask_feature_prob", + "mask_feature_length", + "mask_feature_min_masks", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args.update(args_base) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2VecBase.add_finetune_args(parser) + parser.add_argument( + "--hidden-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "fully connected layers in the embeddings, encoder, and pooler" + ), + ) + parser.add_argument( + "--activation-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "intermediate layer in feedforward transformer layers" + ), + ) + parser.add_argument( + "--attention-dropout", + default=0.1, + type=float, + help=("the dropout ratio for the attention probabilities"), + ) + parser.add_argument( + "--apply-spec-augment", + default=True, + action=ActionYesNo, + help=( + "whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder" + ), + ) + parser.add_argument( + "--mask-time-prob", + default=0.05, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the time axis which will be masked" + ), + ) + parser.add_argument( + "--mask-time-length", + default=10, + type=int, + help=("length of vector span along the time axis"), + ) + parser.add_argument( + 
"--mask-time-min-masks", + default=2, + type=int, + help=( + "the minimum number of masks of length `mask_time_length` generated along the time axis" + ), + ) + parser.add_argument( + "--mask-feature-prob", + default=0.0, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked" + ), + ) + parser.add_argument( + "--mask-feature-length", + default=10, + type=int, + help=(" length of vector span along the feature axis"), + ) + parser.add_argument( + "--mask-feature-min-masks", + default=0, + type=int, + help=( + "The minimum number of masks of length `mask_feature_length` generated along the feature axis" + ), + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py new file mode 100644 index 00000000..5b59d79a --- /dev/null +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -0,0 +1,864 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +from typing import Callable, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import Wav2Vec2Config, Wav2Vec2Model + +from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs +from .hf_wav2vec_base import HFWav2VecBase + + +class HFWav2Vec2(HFWav2VecBase): + r"""This is wrapper over HuggingFace Wav2Vec2 model. + See documentation: https://huggingface.co/docs/transformers/model_doc/wav2vec2 + + This wrapper makes the HugginFace model to have the same interface + as other hyperion models. It also add extra functionalities. + + The config. parameters are the same as in the HuggingFace Wav2Vec2Config class. + + Attributes: + pretrained_model_path (`str`, defaults to None): file path or HuggingFace Hub path to + pre-trained model. + normalize_input (`bool`, defaults to True): whether or not to zero-mean unit-variance + normalize the input. + use_input_attention_mask (`bool`, defaults to False): whether we should input an + attention mask to the wav2vec model. + vocab_size (`int`, defaults to 32): vocabulary size of the + model. Defines the different tokens that can be represented by the + *inputs_ids* passed to the forward method. + hidden_size (`int`, defaults to 768): dimensionality of the encoder layers and + the pooler layer. + num_hidden_layers (`int`, defaults to 12): number of hidden layers in the + Transformer encoder. + num_attention_heads (`int`, defaults to 12): number of attention heads for + each attention layer in the Transformer encoder. + intermediate_size (`int`, defaults to 3072): dimensionality of the + feed-forward layer in the Transformer encoder. + hidden_act (`str` or `function`, defaults to `"gelu"`): the non-linear + activation function (function or string) in the encoder and pooler. + If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout (`float`, defaults to 0.1): the dropout probability for all + fully connected layers in the embeddings, encoder, and pooler. + activation_dropout (`float`, defaults to 0.1): the dropout probability for all + intermediate layer in feedforward transformer layers. + attention_dropout (`float`, defaults to 0.1): the dropout ratio for the + attention probabilities. + layerdrop (`float`, defaults to 0.1): prob. of dropping a layer. 
+ initializer_range (`float`, defaults to 0.02): the standard deviation of the + truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, defaults to 1e-12): the epsilon used by the layer + normalization layers. + feat_extract_norm (`str`, defaults to `"group"`): + the norm to be applied to 1D convolutional layers in feature encoder. + One of `"group"` for group normalization of only the first 1D convolutional + layer or `"layer"` for layer normalization of all 1D convolutional layers. + feat_proj_dropout (`float`, defaults to 0.0): the dropout probability for output + of the feature encoder. + feat_extract_activation (`str, `optional`, defaults to `"gelu"`): the non-linear + activation function (function or string) in the 1D convolutional layers of the feature + extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + conv_dim (`Tuple[int]`, defaults to `(512, 512, 512, 512, 512, 512, 512)`): + a tuple of integers defining the number of input and output channels of each 1D convolutional layer in the + feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. + conv_stride (`Tuple[int]`, defaults to `(5, 2, 2, 2, 2, 2, 2)`): + a tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length + of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. + conv_kernel (`Tuple[int]`, defaults to `(10, 3, 3, 3, 3, 3, 3)`): + a tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The + length of *conv_kernel* defines the number of convolutional layers and has to match the length of + *conv_dim*. + conv_bias (`bool`, defaults to `False`): whether the 1D convolutional layers have a bias. + num_conv_pos_embeddings (`int`, defaults to 128): + number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional + embeddings layer. + num_conv_pos_embedding_groups (`int`, defaults to 16): + number of groups of 1D convolutional positional embeddings layer. + do_stable_layer_norm (`bool`, defaults to `False`): + whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is + True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is + False` corresponds to applying layer norm after the attention layer. + apply_spec_augment (`bool`, defaults to `True`): + whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see + [SpecAugment: A Simple Data Augmentation Method for Automatic Speech + Recognition](https://arxiv.org/abs/1904.08779). + mask_time_prob (`float`, defaults to 0.05): + percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If + reasoning from the probability of each feature vector to be chosen as the start of the vector span to be + masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the + actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`. + mask_time_length (`int`, defaults to 10): + length of vector span along the time axis.
+ mask_time_min_masks (`int`, defaults to 2): + the minimum number of masks of length `mask_time_length` generated along the time axis, each time step, + irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length < + mask_time_min_masks'' + mask_feature_prob (`float`, defaults to 0.0): + percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over + the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector + span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap + may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is + True`. + mask_feature_length (`int`, defaults to 10): + length of vector span along the feature axis. + mask_feature_min_masks (`int`, defaults to 0): + The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time + step, irrespectively of `mask_feature_prob`. Only relevant if + ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' + add_adapter (`bool`, defaults to `False`): + whether a convolutional network should be stacked on top of the Wav2Vec2 Encoder. Can be very useful for + warm-starting Wav2Vec2 for SpeechEncoderDecoder models. + adapter_kernel_size (`int`, defaults to 3): + kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + adapter_stride (`int`, defaults to 2): + stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + num_adapter_layers (`int`, defaults to 3): + number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is + True`. + output_hidden_size (`int`, defaults to None): + dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant + if `add_adapter is True`. + cache_dir (str or os.PathLike): path to a directory in which a downloaded pretrained + model configuration should be cached if the standard cache should not be used. + force_download (`bool`, defaults to `False`): whether or not to force the (re-)download of + the model weights and configuration files and override the + cached versions if they exist. + resume_download (`bool`, defaults to `False`): whether or not to delete incompletely + received files. Will attempt to resume the download if such a file exists. + revision (`str`, defaults to `"main"`): the specific model version to use. + It can be a branch name, a tag name, or a commit id. + ignore_pretrained (`bool` defaults to False): if True, it ignores the pretrained_model_path + and inits the model from the configuration. This is set to True for models that have already + been finetuned. + override_dropouts (`bool` defaults to False): if True, it ignores the dropout probs. in the pretrained model + and uses the ones passed as arguments. + override_spec_augment (`bool` defaults to False): if True, it ignores the spec. augment. + configuration in the pretrained model and uses the ones passed in the arguments. + left_encoder_context (`int`): past context frames used by the transformer encoder when the signal is evaluated + chunk by chunk, if it is too long to fit in GPU.
+ right_encoder_context: (`int`): future context frames used by the transformer encoder. + sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. + encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. + """ + + def __init__( + self, + pretrained_model_path: Optional[Union[str, os.PathLike]] = None, + normalize_input: bool = True, + use_input_attention_mask: bool = False, + vocab_size: int = 32, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: Union[str, Callable] = "gelu", + hidden_dropout: float = 0.1, + activation_dropout: float = 0.1, + attention_dropout: float = 0.1, + layerdrop: float = 0.1, + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-12, + feat_extract_norm: str = "group", + feat_proj_dropout: float = 0.0, + feat_extract_activation: Union[str, Callable] = "gelu", + conv_dim: Tuple[int] = (512, 512, 512, 512, 512, 512, 512), + conv_stride: Tuple[int] = (5, 2, 2, 2, 2, 2, 2), + conv_kernel: Tuple[int] = (10, 3, 3, 3, 3, 3, 3), + conv_bias: bool = False, + num_conv_pos_embeddings: int = 128, + num_conv_pos_embedding_groups: int = 16, + do_stable_layer_norm: bool = False, + apply_spec_augment: bool = True, + mask_time_prob: float = 0.05, + mask_time_length: int = 10, + mask_time_min_masks: int = 2, + mask_feature_prob: float = 0.0, + mask_feature_length: int = 10, + mask_feature_min_masks: int = 0, + add_adapter: bool = False, + adapter_kernel_size: int = 3, + adapter_stride: int = 2, + num_adapter_layers: int = 3, + output_hidden_size: Optional[int] = None, + cache_dir: Union[str, os.PathLike] = "./.cache/hyperion_hf", + force_download: bool = False, + resume_download: bool = False, + revision: str = "main", + drop_layers_gt: Optional[int] = None, + ignore_pretrained: bool = False, + override_dropouts: bool = False, + override_spec_augment: bool = False, + left_encoder_context: int = 16, + right_encoder_context: int = 16, + sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, + ): + super().__init__( + pretrained_model_path=pretrained_model_path, + normalize_input=normalize_input, + use_input_attention_mask=use_input_attention_mask, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + revision=revision, + drop_layers_gt=drop_layers_gt, + ignore_pretrained=ignore_pretrained, + override_dropouts=override_dropouts, + override_spec_augment=override_spec_augment, + left_encoder_context=left_encoder_context, + right_encoder_context=right_encoder_context, + sample_frequency=sample_frequency, + feat_extract_lr=feat_extract_lr, + encoder_lr=encoder_lr, + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, + ) + + if 
pretrained_model_path is not None and not ignore_pretrained: + rank = ddp_get_rank() + if rank == 0: + # rank 0 downloads the model from HF web + logging.info(f"Downloading HF model from {pretrained_model_path}") + self.hf_model = Wav2Vec2Model.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + revision=revision, + ) + # all ranks wait until the model is downloaded + ddp_wait_for_all_procs() + if rank > 0: + # the rest of ranks should read the configuration from the cache. + self.hf_model = Wav2Vec2Model.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=False, + resume_download=False, + revision=revision, + ) + ddp_wait_for_all_procs() + self.hf_model.config.layerdrop = 0.0 + self.change_config( + override_dropouts=self.override_dropouts, + override_spec_augment=self.override_spec_augment, + hidden_dropout=hidden_dropout, + activation_dropout=activation_dropout, + attention_dropout=attention_dropout, + feat_proj_dropout=feat_proj_dropout, + mask_time_prob=mask_time_prob, + mask_time_length=mask_time_length, + mask_time_min_masks=mask_time_min_masks, + mask_feature_prob=mask_feature_prob, + mask_feature_length=mask_feature_length, + mask_feature_min_masks=mask_feature_min_masks, + ) + else: + hf_config = Wav2Vec2Config( + vocab_size=vocab_size, + hidden_size=hidden_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + hidden_dropout=hidden_dropout, + activation_dropout=activation_dropout, + attention_dropout=attention_dropout, + feat_proj_dropout=feat_proj_dropout, + layerdrop=0.0, # layerdrop, + initializer_range=initializer_range, + layer_norm_eps=layer_norm_eps, + feat_extract_norm=feat_extract_norm, + feat_extract_activation=feat_extract_activation, + conv_dim=conv_dim, + conv_stride=conv_stride, + conv_kernel=conv_kernel, + conv_bias=conv_bias, + num_conv_pos_embeddings=num_conv_pos_embeddings, + num_conv_pos_embedding_groups=num_conv_pos_embedding_groups, + do_stable_layer_norm=do_stable_layer_norm, + apply_spec_augment=apply_spec_augment, + mask_time_prob=mask_time_prob, + mask_time_length=mask_time_length, + mask_time_min_masks=mask_time_min_masks, + mask_feature_prob=mask_feature_prob, + mask_feature_length=mask_feature_length, + mask_feature_min_masks=mask_feature_min_masks, + add_adapter=add_adapter, + adapter_kernel_size=adapter_kernel_size, + adapter_stride=adapter_stride, + num_adapter_layers=num_adapter_layers, + output_hidden_size=output_hidden_size, + ) + self.hf_model = Wav2Vec2Model(hf_config) + + if drop_layers_gt is not None: + self.drop_upper_layers(drop_layers_gt) + + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + + self.ignore_pretrained = True + + @property + def num_encoder_layers(self): + return self.hf_config.num_hidden_layers + + @property + def hidden_size(self): + return self.hf_config.hidden_size + + def change_dropouts( + self, + hidden_dropout: float = 0.1, + activation_dropout: float = 0.1, + attention_dropout: float = 0.1, + feat_proj_dropout: float = 0.0, + **kwargs, + ): + import transformers.models.wav2vec2.modeling_wav2vec2 as t + + self.hf_model.config.hidden_dropout = hidden_dropout + self.hf_model.config.activation_dropout = activation_dropout + self.hf_model.config.attention_dropout = attention_dropout + self.hf_model.config.feat_proj_dropout = 
feat_proj_dropout + + self.hf_model.feature_projection.dropout.p = feat_proj_dropout + for module in self.hf_model.encoder.modules(): + if isinstance(module, nn.Dropout): + module.p = hidden_dropout + + for module in self.hf_model.encoder.modules(): + if isinstance(module, t.Wav2Vec2Attention): + module.dropout = activation_dropout + if isinstance(module, t.Wav2Vec2FeatureProjection): + module.intermediate_dropout.p = activation_dropout + + def drop_upper_layers(self, max_layers: int): + if max_layers >= self.hf_config.num_hidden_layers: + return + + layers = self.hf_model.encoder.layers + self.hf_model.encoder.layers = nn.ModuleList( + [l for i, l in enumerate(layers) if i < max_layers] + ) + self.hf_config.num_hidden_layers = max_layers + + if self.hf_model.adapter is not None: + del self.hf_model.adapter + self.hf_model.adapter = None + self.hf_config.add_adapter = False + + def get_config(self): + """Returns the configuration arguments for the object in a dictionary.""" + config = self.hf_model.config.to_dict() + config = self.filter_args(**config) + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def filter_args(**kwargs): + args_base = HFWav2VecBase.filter_args(**kwargs) + valid_args = ( + "vocab_size", + "hidden_size", + "num_hidden_layers", + "num_attention_heads", + "intermediate_size", + "hidden_act", + "hidden_dropout", + "activation_dropout", + "attention_dropout", + "feat_proj_dropout", + "layerdrop", + "initializer_range", + "layer_norm_eps", + "feat_extract_norm", + "feat_extract_activation", + "conv_dim", + "conv_stride", + "conv_kernel", + "conv_bias", + "num_conv_pos_embeddings", + "num_conv_pos_embedding_groups", + "do_stable_layer_norm", + "apply_spec_augment", + "mask_time_prob", + "mask_time_length", + "mask_time_min_masks", + "mask_feature_prob", + "mask_feature_length", + "mask_feature_min_masks", + "add_adapter", + "adapter_kernel_size", + "adapter_stride", + "num_adapter_layers", + "output_hidden_size", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args.update(args_base) + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2VecBase.add_class_args(parser) + + parser.add_argument( + "--vocab-size", + default=32, + type=int, + help=( + "vocabulary size of the " + "model. Defines the different tokens that can be represented by the " + "*inputs_ids* passed to the forward method." 
+ ), + ) + parser.add_argument( + "--hidden-size", + default=768, + type=int, + help=("dimensionality of the encoder layers and the pooler layer."), + ) + parser.add_argument( + "--num-hidden-layers", + default=12, + type=int, + help=("number of hidden layers in the Transformer encoder"), + ) + parser.add_argument( + "--num-attention-heads", + default=12, + type=int, + help=( + "number of attention heads for " + "each attention layer in the Transformer encoder" + ), + ) + parser.add_argument( + "--intermediate-size", + default=3072, + type=int, + help=( + "dimensionality of the " "feed-forward layer in the Transformer encoder" + ), + ) + parser.add_argument( + "--hidden-act", + default="gelu", + choices=["gelu", "relu", "selu", "gelu_new"], + help=( + "the non-linear " + "activation function (function or string) in the encoder and pooler" + ), + ) + parser.add_argument( + "--hidden-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "fully connected layers in the embeddings, encoder, and pooler" + ), + ) + parser.add_argument( + "--activation-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "intermediate layer in feedforward transformer layers" + ), + ) + parser.add_argument( + "--attention-dropout", + default=0.1, + type=float, + help=("the dropout ratio for the attention probabilities"), + ) + parser.add_argument( + "--layerdrop", + default=0.1, + type=float, + help=("prob. of dropping a layer"), + ) + parser.add_argument( + "--initializer-range", + default=0.02, + type=float, + help=( + "the standard deviation of the " + "truncated_normal_initializer for initializing all weight matrices" + ), + ) + parser.add_argument( + "--layer-norm-eps", + default=1e-12, + type=float, + help=("the epsilon used by the layer normalization layers"), + ) + parser.add_argument( + "--feat-extract-norm", + default="group", + choices=["group", "layer"], + help=( + "the norm to be applied to 1D convolutional layers in feature encoder. " + "One of `group` for group normalization of only the first 1D convolutional " + "layer or `layer` for layer normalization of all 1D convolutional layers" + ), + ) + parser.add_argument( + "--feat-proj-dropout", + default=0.1, + type=float, + help=("the dropout probability for output of the feature encoder"), + ) + parser.add_argument( + "--feat-extract-activation", + default="gelu", + choices=["gelu", "relu", "selu", "gelu_new"], + help=( + "the non-linear activation function (function or string) in the 1D " + "convolutional layers of the feature extractor" + ), + ) + parser.add_argument( + "--conv-dim", + default=[512, 512, 512, 512, 512, 512, 512], + nargs="+", + type=int, + help=( + "a tuple of integers defining the number of input and output channels of each 1D convolutional layer in the " + "feature encoder.
The length of *conv_dim* defines the number of 1D convolutional layers" + ), + ) + parser.add_argument( + "--conv-stride", + default=[5, 2, 2, 2, 2, 2, 2], + nargs="+", + type=int, + help=( + "a tuple of integers defining the stride of each 1D convolutional layer in the feature encoder" + ), + ) + parser.add_argument( + "--conv-kernel", + default=[10, 3, 3, 3, 3, 3, 3], + nargs="+", + type=int, + help=( + "a tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder" + ), + ) + parser.add_argument( + "--conv-bias", + default=False, + action=ActionYesNo, + help=("whether the 1D convolutional layers have a bias"), + ) + parser.add_argument( + "--num-conv-pos-embeddings", + default=128, + type=int, + help=( + "number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional " + "embeddings layer" + ), + ) + parser.add_argument( + "--num-conv-pos-embedding-groups", + default=16, + type=int, + help=("number of groups of 1D convolutional positional embeddings layer"), + ) + parser.add_argument( + "--do-stable-layer-norm", + default=False, + action=ActionYesNo, + help=( + "whether to apply *stable* layer norm architecture of the Transformer encoder" + ), + ) + parser.add_argument( + "--apply-spec-augment", + default=True, + action=ActionYesNo, + help=( + "whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder" + ), + ) + parser.add_argument( + "--mask-time-prob", + default=0.05, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the time axis which will be masked" + ), + ) + parser.add_argument( + "--mask-time-length", + default=10, + type=int, + help=("length of vector span along the time axis"), + ) + parser.add_argument( + "--mask-time-min-masks", + default=2, + type=int, + help=( + "the minimum number of masks of length `mask_time_length` generated along the time axis" + ), + ) + parser.add_argument( + "--mask-feature-prob", + default=0.0, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked" + ), + ) + parser.add_argument( + "--mask-feature-length", + default=10, + type=int, + help=(" length of vector span along the feature axis"), + ) + parser.add_argument( + "--mask-feature-min-masks", + default=0, + type=int, + help=( + "The minimum number of masks of length `mask_feature_length` generated along the feature axis" + ), + ) + parser.add_argument( + "--add-adapter", + default=False, + action=ActionYesNo, + help=( + "whether a convolutional network should be stacked on top of the Wav2Vec2 Encoder" + ), + ) + parser.add_argument( + "--adapter-kernel-size", + default=3, + type=int, + help=("kernel size of the convolutional layers in the adapter network"), + ) + parser.add_argument( + "--adapter-stride", + default=2, + type=int, + help=("stride of the convolutional layers in the adapter network"), + ) + parser.add_argument( + "--num-adapter-layers", + default=3, + type=int, + help=( + "number of convolutional layers that should be used in the adapter network" + ), + ) + parser.add_argument( + "--output-hidden-size", + default=None, + type=int, + help=( + "dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*." 
+ " Only relevant if `add_adapter is True" + ), + ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + args_base = HFWav2VecBase.filter_finetune_args(**kwargs) + valid_args = ( + "hidden_dropout", + "activation_dropout", + "attention_dropout", + "feat_proj_dropout", + "apply_spec_augment", + "mask_time_prob", + "mask_time_length", + "mask_time_min_masks", + "mask_feature_prob", + "mask_feature_length", + "mask_feature_min_masks", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args.update(args_base) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2VecBase.add_finetune_args(parser) + parser.add_argument( + "--hidden-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "fully connected layers in the embeddings, encoder, and pooler" + ), + ) + parser.add_argument( + "--activation-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "intermediate layer in feedforward transformer layers" + ), + ) + parser.add_argument( + "--attention-dropout", + default=0.1, + type=float, + help=("the dropout ratio for the attention probabilities"), + ) + parser.add_argument( + "--apply-spec-augment", + default=True, + action=ActionYesNo, + help=( + "whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder" + ), + ) + parser.add_argument( + "--mask-time-prob", + default=0.05, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the time axis which will be masked" + ), + ) + parser.add_argument( + "--mask-time-length", + default=10, + type=int, + help=("length of vector span along the time axis"), + ) + parser.add_argument( + "--mask-time-min-masks", + default=2, + type=int, + help=( + "the minimum number of masks of length `mask_time_length` generated along the time axis" + ), + ) + parser.add_argument( + "--mask-feature-prob", + default=0.0, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked" + ), + ) + parser.add_argument( + "--mask-feature-length", + default=10, + type=int, + help=(" length of vector span along the feature axis"), + ) + parser.add_argument( + "--mask-feature-min-masks", + default=0, + type=int, + help=( + "The minimum number of masks of length `mask_feature_length` generated along the feature axis" + ), + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + """ + Things I think I don't need: + feat_quantizer_dropout (`float`, defaults to 0.0): the dropout probabilitiy for quantized feature encoder states. + final_dropout (`float`, defaults to 0.1): the dropout probability for the + final projection layer of [`Wav2Vec2ForCTC`]. + um_codevectors_per_group (`int`, defaults to 320): + number of entries in each quantization codebook (group). + num_codevector_groups (`int`, defaults to 2): + number of codevector groups for product codevector quantization. + contrastive_logits_temperature (`float`, defaults to 0.1): + the temperature *kappa* in the contrastive loss. + feat_quantizer_dropout (`float`, defaults to 0.0): + the dropout probabilitiy for the output of the feature encoder that's used by the quantizer. 
+    num_negatives (`int`, defaults to 100):
+        number of negative samples for the contrastive loss.
+    codevector_dim (`int`, defaults to 256):
+        dimensionality of the quantized feature vectors.
+    proj_codevector_dim (`int`, defaults to 256):
+        dimensionality of the final projection of both the quantized and the transformer features.
+    diversity_loss_weight (`int`, defaults to 0.1):
+        the weight of the codebook diversity loss component.
+    ctc_loss_reduction (`str`, defaults to `"sum"`):
+        Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+        instance of [`Wav2Vec2ForCTC`].
+    ctc_zero_infinity (`bool`, defaults to `False`):
+        whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
+        occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
+        of [`Wav2Vec2ForCTC`].
+    use_weighted_layer_sum (`bool`, defaults to `False`):
+        whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
+        instance of [`Wav2Vec2ForSequenceClassification`].
+    classifier_proj_size (`int`, defaults to 256):
+        dimensionality of the projection before token mean-pooling for classification.
+    tdnn_dim (`Tuple[int]`, defaults to `(512, 512, 512, 512, 1500)`):
+        a tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
+        module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
+    tdnn_kernel (`Tuple[int]`, defaults to `(5, 3, 3, 1, 1)`):
+        a tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
+        *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
+    tdnn_dilation (`Tuple[int]`, defaults to `(1, 2, 3, 1, 1)`):
+        a tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
+        *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
+    xvector_output_dim (`int`, defaults to 512):
+        dimensionality of the *XVector* embedding vectors.
+    """
diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py
new file mode 100644
index 00000000..2cb95a53
--- /dev/null
+++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py
@@ -0,0 +1,964 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import logging
+import os
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor
+
+from ....utils.misc import filter_func_args
+from ...layers import LoRAFactory
+from ...torch_model import TorchModel
+from ...utils import scale_seq_lengths, seq_lengths_to_mask
+from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs
+
+
+class HFWav2VecBase(TorchModel):
+    """Base class for Wav2Vec style models (Wav2Vec2, Hubert, WavLM, ...) in HuggingFace.
+
+    This class includes the preprocessing steps common to all models.
+
+    Attributes:
+      pretrained_model_path (`str`, or os.PathLike, defaults to None): file path or
+          HuggingFace Hub path to pre-trained model.
+      normalize_input (`bool`, defaults to True): whether or not to zero-mean unit-variance
+          normalize the input.
+      use_input_attention_mask (`bool`, defaults to False): whether we should input an
+          attention mask to the wav2vec model.
+      cache_dir (str or os.PathLike): path to a directory in which a downloaded pretrained
+          model configuration should be cached if the standard cache should not be used.
+      force_download (`bool`, defaults to `False`): whether or not to force the (re-)download
+          of the model weights and configuration files and override the
+          cached versions if they exist.
+      resume_download (`bool`, defaults to `False`): whether or not to delete incompletely
+          received files. Will attempt to resume the download if such a file exists.
+      revision (`str`, defaults to `"main"`): the specific model version to use.
+          It can be a branch name, a tag name, or a commit id.
+      drop_layers_gt (`int` defaults to None): drop encoder layers greater than this value (in [1, num_encoder_layers]).
+          If None, the model is not changed.
+      ignore_pretrained (`bool` defaults to False): if True, it ignores the pretrained_model_path
+          and inits the model from the configuration. This is set to True for models that have already
+          been finetuned.
+      override_dropouts (`bool` defaults to False): if True, it ignores the dropout probs. in the pretrained model
+          and uses the ones passed as arguments.
+      override_spec_augment (`bool` defaults to False): if True, it ignores the spec. augment.
+          configuration in the pretrained model and uses the ones passed in the arguments.
+      left_encoder_context (`int`): past context frames used by the transformer encoder when the signal is evaluated
+          chunk by chunk, if it is too long to fit in GPU.
+      right_encoder_context: (`int`): future context frames used by the transformer encoder.
+      sample_frequency: (`int`) waveform sample frequency used to train the model.
+      feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one.
+      encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one.
+      use_lora: use low-rank adapters
+      lora_components: list of components where we apply LoRA, e.g. [Wq, Wv]
+      lora_rank: rank of LoRA
+      lora_alpha: scale for LoRA
+      lora_dropout: dropout rate for LoRA
+      lora_merge_weights: lora weights are merged with the pretrained weights at inference.
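+
+    Example:
+      a minimal usage sketch, assuming a concrete subclass such as HFWav2Vec2
+      and a batch of 16 kHz waveforms `x` with sample lengths `x_lengths`::
+
+          model = HFWav2Vec2(pretrained_model_path="facebook/wav2vec2-base")
+          output = model(x, x_lengths)
+          h = output.last_hidden_state  # (batch, num_frames, hidden_size)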
+ """ + + def __init__( + self, + pretrained_model_path: Optional[Union[str, os.PathLike]] = None, + normalize_input: bool = True, + use_input_attention_mask: bool = False, + cache_dir: Union[str, os.PathLike] = "./.cache/hyperion_hf", + force_download: bool = False, + resume_download: bool = False, + revision: str = "main", + drop_layers_gt: Optional[int] = None, + ignore_pretrained: bool = False, + override_dropouts: bool = False, + override_spec_augment: bool = False, + left_encoder_context: int = 16, + right_encoder_context: int = 16, + sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, + ): + super().__init__() + self.pretrained_model_path = pretrained_model_path + self.cache_dir = cache_dir + self.force_download = force_download + self.resume_download = resume_download + self.revision = revision + self.drop_layers_gt = drop_layers_gt + self.ignore_pretrained = ignore_pretrained + self.override_dropouts = override_dropouts + self.override_spec_augment = override_spec_augment + self.right_encoder_context = right_encoder_context + self.left_encoder_context = left_encoder_context + self.feat_extract_lr = feat_extract_lr + self.encoder_lr = encoder_lr + self.use_lora = use_lora + self.lora_components = lora_components + self.lora_rank = lora_rank + self.lora_alpha = lora_alpha + self.lora_dropout = lora_dropout + self.lora_merge_weights = lora_merge_weights + + if pretrained_model_path is not None and not ignore_pretrained: + rank = ddp_get_rank() + if rank == 0: + logging.info( + f"Downloading config for HF preprocessor from {pretrained_model_path}" + ) + # rank 0 downloads the model from HF web + try: + # some models donot have config for processor because do not have + # tokenizer, first we try to donwload feature_extractor config + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + revision=revision, + ) + except: + # if fails, we try to download full processor config + processor = Wav2Vec2Processor.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + revision=revision, + ) + feature_extractor = processor.feature_extractor + + # all ranks wait until the model is downloaded + ddp_wait_for_all_procs() + if rank > 0: + # the rest of ranks should read the configuration from the cache. 
+                try:
+                    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
+                        pretrained_model_path,
+                        cache_dir=cache_dir,
+                        force_download=False,
+                        resume_download=False,
+                        revision=revision,
+                    )
+                except Exception:
+                    # if that fails, we read the full processor config instead
+                    processor = Wav2Vec2Processor.from_pretrained(
+                        pretrained_model_path,
+                        cache_dir=cache_dir,
+                        force_download=False,
+                        resume_download=False,
+                        revision=revision,
+                    )
+                    feature_extractor = processor.feature_extractor
+
+            ddp_wait_for_all_procs()
+            normalize_input = feature_extractor.do_normalize
+            use_input_attention_mask = feature_extractor.return_attention_mask
+            sample_frequency = feature_extractor.sampling_rate
+
+        self.normalize_input = normalize_input
+        self.use_input_attention_mask = use_input_attention_mask
+        self.sample_frequency = sample_frequency
+
+        self._feature_encoder_context = None
+        self._frame_shift = None
+        self.hf_model = None
+
+    def __deepcopy__(self, memo):
+        """Reimplementation of deepcopy for Hugging Face models.
+        The weight_norm in the Conv. Pos. Encoder of Wav2Vec models makes the default deepcopy fail.
+        """
+        cls = self.__class__  # Extract the class of the object
+        cfg = self.get_config()
+        del cfg["class_name"]
+        # Create a new instance of the object based on extracted class
+        new_obj = cls(**cfg)
+        memo[id(self)] = new_obj
+        new_obj.load_state_dict(self.state_dict())
+        device = next(self.parameters()).device
+        new_obj.to(device)
+        return new_obj
+
+    @property
+    def feature_encoder_context(self):
+        if self._feature_encoder_context is not None:
+            return self._feature_encoder_context
+
+        total_context = 0
+        total_stride = 1
+        for kernel, stride in zip(
+            self.hf_model.config.conv_kernel, self.hf_model.config.conv_stride
+        ):
+            total_context += total_stride * (kernel - 1) / 2
+            total_stride *= stride
+
+        self._feature_encoder_context = (int(total_context + 0.5), int(total_context))
+        return self._feature_encoder_context
+
+    @property
+    def frame_shift(self):
+        if self._frame_shift is not None:
+            return self._frame_shift
+
+        total_stride = 1
+        for stride in self.hf_model.config.conv_stride:
+            total_stride *= stride
+
+        self._frame_shift = total_stride
+        return total_stride
+
+    @property
+    def context(self):
+        left, right = self.feature_encoder_context
+        left += self.left_encoder_context
+        right += self.right_encoder_context
+        return left, right
+
+    def max_out_length(self, max_in_length):
+        return self.hf_model._get_feat_extract_output_lengths(max_in_length).item()
+
+    def out_lengths(self, in_lengths):
+        return self.hf_model._get_feat_extract_output_lengths(in_lengths)
+
+    def out_shape(self, in_shape):
+        out_length = self.max_out_length(in_shape[1])
+        C = self.hf_model.config.hidden_size
+        return (in_shape[0], out_length, C)
+
+    def change_config(
+        self,
+        override_dropouts: bool,
+        override_spec_augment: bool,
+        override_lora: bool = False,
+        feat_extract_lr: Optional[float] = None,
+        encoder_lr: Optional[float] = None,
+        use_lora: bool = False,
+        lora_components: List[str] = ["q_proj", "v_proj"],
+        lora_rank: int = 4,
+        lora_alpha: int = 1,
+        lora_dropout: float = 0.0,
+        lora_merge_weights: bool = True,
+        **kwargs,
+    ):
+        if
override_spec_augment: + logging.info(f"overriding speech augment with args={kwargs}") + self.change_spec_augment(**kwargs) + + if override_dropouts: + logging.info(f"overriding hf model dropouts with args={kwargs}") + self.change_dropouts(**kwargs) + + if override_lora: + logging.info("overriding LoRA config") + self.change_lora( + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, + ) + + self.feat_extract_lr = feat_extract_lr + self.encoder_lr = encoder_lr + + def change_spec_augment( + self, + apply_spec_augment: bool = True, + mask_time_prob: float = 0.05, + mask_time_length: int = 10, + mask_time_min_masks: int = 2, + mask_feature_prob: float = 0.0, + mask_feature_length: int = 10, + mask_feature_min_masks: int = 0, + **kwargs, + ): + self.hf_model.config.apply_spec_augment = apply_spec_augment + self.hf_model.config.mask_time_prob = mask_time_prob + self.hf_model.config.mask_time_length = mask_time_length + self.hf_model.config.mask_time_min_masks = mask_time_min_masks + self.hf_model.config.mask_feature_prob = mask_feature_prob + self.hf_model.config.mask_feature_length = mask_feature_length + self.hf_model.config.mask_feature_min_masks = mask_feature_min_masks + + def change_lora( + self, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, + ): + if not self.use_lora: + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + pass + else: + # TODO + pass + else: + if use_lora: + # TODO + pass + else: + # TODO + pass + + self.use_lora = use_lora + self.lora_components = lora_components + self.lora_rank = lora_rank + self.lora_alpha = lora_alpha + self.lora_dropout = lora_dropout + self.lora_merge_weights = lora_merge_weights + + def _make_lora_layers( + self, + lora_components: List[str], + lora_rank: int, + lora_alpha: int, + lora_dropout: float, + lora_merge_weights: bool, + ): + counts = {k: 0 for k in lora_components} + self._recursive_replace_layer_by_lora( + self.hf_model, + counts, + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + for k, v in counts.items(): + logging.info("count of LoRA layers for %s = %d", k, v) + assert v > 0, f"did not make any {k} LoRA" + + @staticmethod + def _recursive_replace_layer_by_lora( + model: nn.Module, + counts: dict, + lora_components: List[str], + lora_rank: int, + lora_alpha: int, + lora_dropout: float, + lora_merge_weights: bool, + ): + for name, module in model.named_children(): + if len(list(module.children())) > 0: + HFWav2VecBase._recursive_replace_layer_by_lora( + module, + counts, + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + if isinstance(module, nn.Linear) and name in lora_components: + lora_layer = LoRAFactory.create_from_pretrained( + module, + r=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=lora_merge_weights, + ) + setattr(model, name, lora_layer) + counts[name] += 1 + + def change_dropouts(self, **kwargs): + pass # needs to be overloaded + + def freeze_feature_encoder(self): + self.hf_model.freeze_feature_encoder() + + def freeze_except_lora(self, bias=None): + bias = "none" if bias is None else bias + from ...layers.lora import mark_only_lora_as_trainable + + 
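+        # `bias` selects which bias terms remain trainable besides the LoRA
+        # A/B matrices; following the loralib convention, this is assumed to
+        # be one of "none", "all", or "lora_only".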
mark_only_lora_as_trainable(self.hf_model, bias=bias)
+
+    def has_param_groups(self):
+        return self.feat_extract_lr is not None or self.encoder_lr is not None
+
+    def trainable_param_groups(self):
+        if not self.has_param_groups():
+            return self.trainable_parameters()
+
+        if self.feat_extract_lr == self.encoder_lr:
+            return [{"params": self.trainable_parameters(), "lr": self.encoder_lr}]
+
+        param_groups = [
+            {"params": self.hf_model.feature_extractor.parameters()},
+            {"params": self.hf_model.feature_projection.parameters()},
+            {"params": self.hf_model.encoder.parameters()},
+        ]
+        if self.hf_model.adapter is not None:
+            param_groups.append({"params": self.hf_model.adapter.parameters()})
+
+        if self.feat_extract_lr is not None:
+            param_groups[0]["lr"] = self.feat_extract_lr
+            param_groups[1]["lr"] = self.feat_extract_lr
+
+        if self.encoder_lr is not None:
+            param_groups[2]["lr"] = self.encoder_lr
+            if len(param_groups) == 4:
+                param_groups[3]["lr"] = self.encoder_lr
+
+        return param_groups
+
+    @property
+    def hf_config(self):
+        return self.hf_model.config
+
+    def _normalize(self, x, x_mask=None):
+        """Normalizes the audio to have zero mean and unit variance."""
+        if x_mask is None:
+            x = x - x.mean(dim=1, keepdim=True)
+            std = torch.sqrt((x**2).mean(dim=1, keepdim=True) + 1e-7)
+            x = x / std
+        else:
+            x_mask = x_mask.to(dtype=x.dtype)
+            x_samples = torch.mean(x_mask, dim=1, keepdim=True)
+            x_mean = torch.mean(x * x_mask, dim=1, keepdim=True) / x_samples
+            x2_mean = torch.mean(x**2 * x_mask, dim=1, keepdim=True) / x_samples
+            std = torch.sqrt(x2_mean - x_mean**2 + 1e-7)
+            x = (x - x_mean) / std
+        return x
+
+    def _preprocess(self, x, x_lengths=None):
+        """Prepares input audio to be used as input to wav2vec style model."""
+        x_mask = seq_lengths_to_mask(x_lengths, x.size(-1), dtype=torch.long)
+        if self.normalize_input:
+            x = self._normalize(x, x_mask)
+
+        # only forward the mask to the HF model when it expects one
+        if not self.use_input_attention_mask:
+            x_mask = None
+
+        return x, x_mask
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lengths: Optional[torch.LongTensor] = None,
+        return_attentions: bool = False,
+        return_hid_states: bool = False,
+        chunk_length: float = 0,
+        detach_chunks: bool = True,
+    ):
+        r"""Forward function; utterances too long to fit in GPU memory can be
+        evaluated in chunks by setting chunk_length > 0.
+
+        Args:
+          x: input audio of shape = (batch, sequence_length).
+          x_lengths: lengths of the audio waveforms in samples with shape = (batch,).
+          return_attentions: whether or not to return the attentions tensors of
+            all attention layers.
+          return_hid_states: whether or not to return the hidden states of all layers.
+          chunk_length: chunk length in seconds; 0 disables chunking.
+          detach_chunks: whether to detach each chunk's output from the autograd graph.
+
+        Returns:
+          Dictionary with:
+            last_hidden_state: sequence of hidden-states at the output of the last
+              layer of the model (torch.FloatTensor of shape
+              (batch_size, sequence_length, hidden_size)).
+            extract_features: sequence of extracted feature vectors of the last
+              convolutional layer of the model. (torch.FloatTensor of shape
+              (batch_size, sequence_length, conv_dim[-1])
+            hidden_states: hidden-states of the model at the output of each layer
+              plus the initial embedding outputs (tuple(torch.FloatTensor)).
+            attentions: Attentions weights after the attention softmax, used to
+              compute the weighted average in the self-attention heads
+              (tuple(torch.FloatTensor)).
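+
+        Example:
+          a sketch, assuming `model` is a concrete subclass instance; inputs
+          longer than 2 minutes are evaluated in 120 s chunks to bound GPU memory::
+
+              output = model(x, x_lengths, return_hid_states=True, chunk_length=120)
+              h = output.hidden_states[-1]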
+ """ + if chunk_length == 0 or x.size(1) < chunk_length * self.sample_frequency: + return self.forward_impl(x, x_lengths, return_attentions, return_hid_states) + else: + return self.forward_long_impl( + x, + x_lengths, + return_attentions, + return_hid_states, + chunk_length, + detach_chunks, + ) + + def forward_impl( + self, + x: torch.Tensor, + x_lengths: Optional[torch.LongTensor] = None, + return_attentions: bool = False, + return_hid_states: bool = False, + ): + r"""Forward function for wav2vec style models. + + Args: + x: input audio of shape = (batch, sequence_length). + x_lengths: lengths of the audio waveforms in samples with shape = (batch,). + return_attentions: whether or not to return the attentions tensors of + all attention layers. + return_hid_states: whether or not to return the hidden states of all layers. + + Returns: + Dictionary with: + last_hidden_state: sequence of hidden-states at the output of the last + layer of the model (torch.FloatTensor of shape + (batch_size, sequence_length, hidden_size)). + extract_features: sequence of extracted feature vectors of the last + convolutional layer of the model. (torch.FloatTensor of shape + (batch_size, sequence_length, conv_dim[-1]) + hidden_states: hidden-states of the model at the output of each layer + plus the initial embedding outputs (tuple(torch.FloatTensor)). + attentions: Attentions weights after the attention softmax, used to + compute the weighted average in the self-attention heads + (tuple(torch.FloatTensor)). + """ + max_in_length = x.size(-1) + x, x_mask = self._preprocess(x, x_lengths) + # if ddp_get_rank() == 0: + # lora_layer = self.hf_model.encoder.layers[0].attention.v_proj + # print( + # "lora\nw=", + # lora_layer.weight[:3, :3], + # "\na=", + # lora_layer.lora_A[:3, :3], + # "\nb=", + # lora_layer.lora_B[:3, :3], + # "\n", + # "merged=", + # lora_layer.merged, + # "training=", + # lora_layer.training, + # flush=True, + # ) + # assert self.training == lora_layer.training + # assert self.training == (not lora_layer.merged) + output = self.hf_model( + x, + x_mask, + output_attentions=return_attentions, + output_hidden_states=return_hid_states, + ) + max_out_length = output.last_hidden_state.size(1) + feat_lengths = ( + None + if x_lengths is None + else scale_seq_lengths(x_lengths, max_out_length, max_in_length) + ) + output["hidden_states_lengths"] = feat_lengths + + return output + + def forward_long_impl( + self, + x: torch.Tensor, + x_lengths: Optional[torch.LongTensor] = None, + return_attentions: bool = False, + return_hid_states: bool = False, + chunk_length: float = 120.0, + detach_chunks: bool = True, + ): + r"""Forward function for long utterances that do not fit in GPU memory. + + Args: + x: input audio of shape = (batch, sequence_length). + x_lengths: lengths of the audio waveforms in samples with shape = (batch,). + return_attentions: whether or not to return the attentions tensors of + all attention layers. + return_hid_states: whether or not to return the hidden states of all layers. + chunk_size: chunk size in seconds. + + Returns: + Dictionary with: + last_hidden_state: sequence of hidden-states at the output of the last + layer of the model (torch.FloatTensor of shape + (batch_size, sequence_length, hidden_size)). + extract_features: sequence of extracted feature vectors of the last + convolutional layer of the model. 
(torch.FloatTensor of shape
+              (batch_size, sequence_length, conv_dim[-1])
+            hidden_states: hidden-states of the model at the output of each layer
+              plus the initial embedding outputs (tuple(torch.FloatTensor)).
+            attentions: Attentions weights after the attention softmax, used to
+              compute the weighted average in the self-attention heads
+              (tuple(torch.FloatTensor)).
+        """
+        max_in_length = x.size(-1)
+        x, x_mask = self._preprocess(x, x_lengths)
+        # we transform the chunk length from seconds to samples,
+        # making sure that the chunk_length corresponds to an integer number of output frames.
+        chunk_frames = int(chunk_length * self.sample_frequency) // self.frame_shift
+        chunk_length = chunk_frames * self.frame_shift
+        num_chunks = (x.size(1) + chunk_length - 1) // chunk_length
+        left_context, right_context = self.context
+        max_out_length = self.max_out_length(x.size(1))
+        start = 0
+        outputs = []
+        for i in range(num_chunks):
+            if i < num_chunks - 1:
+                start_i = max(start - left_context, 0)
+            else:
+                # last chunk has special treatment, we forward pass
+                # a chunk with chunk_length size ending at the end,
+                # but we will just use the output frames that don't overlap
+                # with the second last chunk.
+                start_i = max(x.size(1) - chunk_length - left_context, 0)
+
+            stop_i = min(start + chunk_length + right_context, x.size(1))
+            x_i = x[:, start_i:stop_i]
+            # slice the mask along the time axis, keeping the batch dimension
+            x_mask_i = None if x_mask is None else x_mask[:, start_i:stop_i]
+            output_i = self.hf_model(
+                x_i,
+                x_mask_i,
+                output_attentions=return_attentions,
+                output_hidden_states=return_hid_states,
+            )
+
+            if i < num_chunks - 1:
+                start_out_i = max(
+                    output_i.last_hidden_state.size(1)
+                    - chunk_frames
+                    - self.right_encoder_context,
+                    0,
+                )
+                stop_out_i = start_out_i + chunk_frames
+            else:
+                # we just use the frames that do not overlap
+                # with the second last chunk
+                remaining_frames = max_out_length - i * chunk_frames
+                start_out_i = -remaining_frames
+                stop_out_i = output_i.last_hidden_state.size(1)
+
+            output_i.last_hidden_state = output_i.last_hidden_state[
+                :, start_out_i:stop_out_i
+            ]
+            if detach_chunks:
+                output_i.last_hidden_state.detach_()
+
+            if return_hid_states:
+                output_i.hidden_states = [
+                    h[:, start_out_i:stop_out_i] for h in output_i.hidden_states
+                ]
+                if detach_chunks:
+                    output_i.hidden_states = [
+                        h.detach() for h in output_i.hidden_states
+                    ]
+
+            outputs.append(output_i)
+            start += chunk_length
+
+        # concatenate outputs from different chunks
+        output = outputs[0]
+        output.last_hidden_state = torch.cat(
+            [o.last_hidden_state for o in outputs], dim=1
+        )
+        if return_hid_states:
+            hidden_states = []
+            for j in range(len(outputs[0].hidden_states)):
+                hidden_states_j = torch.cat(
+                    [o.hidden_states[j] for o in outputs], dim=1
+                )
+                hidden_states.append(hidden_states_j)
+            output.hidden_states = hidden_states
+
+        if return_attentions:
+            attentions = []
+            for j in range(len(outputs[0].attentions)):
+                attentions_j = [o.attentions[j] for o in outputs]
+                attentions.append(attentions_j)
+            output.attentions = attentions
+
+        feat_lengths = (
+            None
+            if x_lengths is None
+            else scale_seq_lengths(x_lengths, max_out_length, max_in_length)
+        )
+        output["hidden_states_lengths"] = feat_lengths
+        return output
+
+    def get_config(self):
+        """Returns the configuration arguments for the object in a dictionary."""
+
+        config = {
+            "pretrained_model_path": self.pretrained_model_path,
+            "normalize_input": self.normalize_input,
+
"use_input_attention_mask": self.use_input_attention_mask, + "cache_dir": self.cache_dir, + "force_download": self.force_download, + "resume_download": self.resume_download, + "revision": self.revision, + "drop_layers_gt": self.drop_layers_gt, + "ignore_pretrained": self.ignore_pretrained, + "override_dropouts": self.override_dropouts, + "override_spec_augment": self.override_spec_augment, + "left_encoder_context": self.left_encoder_context, + "right_encoder_context": self.right_encoder_context, + "sample_frequency": self.sample_frequency, + "feat_extract_lr": self.feat_extract_lr, + "encoder_lr": self.encoder_lr, + "use_lora": self.use_lora, + "lora_components": self.lora_components, + "lora_rank": self.lora_rank, + "lora_alpha": self.lora_alpha, + "lora_dropout": self.lora_dropout, + "lora_merge_weights": self.lora_merge_weights, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def save(self, file_path: str): + """Saves the model to disk.""" + self.ignore_pretrained = True + self.save(file_path) + + @staticmethod + def filter_args(**kwargs): + return filter_func_args(HFWav2VecBase.__init__, kwargs) + # valid_args = ( + # "pretrained_model_path", + # "normalize_input", + # "use_input_attention_mask", + # "cache_dir", + # "force_download", + # "resume_download", + # "revision", + # "drop_layers_gt", + # "ignore_pretrained", + # "override_dropouts", + # "override_spec_augment", + # "left_encoder_context", + # "right_encoder_context", + # "sample_frequency", + # ) + # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + # return args + + @staticmethod + def _add_lr_args(parser): + parser.add_argument( + "--feat-extract-lr", + default=None, + type=float, + help=( + "lr for conv feature extractor, it serves to set a lr " + "different than the global one." + ), + ) + parser.add_argument( + "--encoder-lr", + default=None, + type=float, + help=( + "lr for transformer encoder, it serves to set a lr " + "different than the global one." 
+    @staticmethod
+    def _add_lora_args(parser):
+        parser.add_argument(
+            "--use-lora",
+            default=False,
+            action=ActionYesNo,
+            help="use low-rank adapters",
+        )
+        parser.add_argument(
+            "--lora-components",
+            default=["q_proj", "v_proj"],
+            nargs="+",
+            choices=[
+                "k_proj",
+                "q_proj",
+                "v_proj",
+                "out_proj",
+                "intermediate_dense",
+                "output_dense",
+            ],
+            help="list of components where we apply LoRA, e.g. [Wq, Wv]",
+        )
+        parser.add_argument("--lora-rank", default=4, type=int, help="rank of LoRA")
+        parser.add_argument(
+            "--lora-alpha", default=1.0, type=float, help="scale for LoRA"
+        )
+        parser.add_argument(
+            "--lora-dropout", default=0.0, type=float, help="dropout rate for LoRA"
+        )
+        parser.add_argument(
+            "--lora-merge-weights",
+            default=True,
+            action=ActionYesNo,
+            help="lora weights are merged with the pretrained weights at inference.",
+        )
+
+    @staticmethod
+    def add_class_args(parser, prefix=None, skip=set()):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        parser.add_argument(
+            "--pretrained-model-path",
+            default=None,
+            help=("file path or HuggingFace Hub path to pre-trained model"),
+        )
+
+        parser.add_argument(
+            "--normalize-input",
+            default=True,
+            action=ActionYesNo,
+            help=("whether or not to zero-mean unit-variance normalize the input"),
+        )
+        parser.add_argument(
+            "--use-input-attention-mask",
+            default=False,
+            action=ActionYesNo,
+            help=("whether we should input an attention mask to the wav2vec model"),
+        )
+        parser.add_argument(
+            "--cache-dir",
+            default="./.cache/hyperion_hf",
+            help=(
+                "path to a directory in which a downloaded pretrained model "
+                "configuration should be cached if the standard cache should not be used"
+            ),
+        )
+        parser.add_argument(
+            "--force-download",
+            default=False,
+            action=ActionYesNo,
+            help=(
+                "whether or not to force the (re-)download of the model weights "
+                "and configuration files and override the cached versions if they exist"
+            ),
+        )
+        parser.add_argument(
+            "--resume-download",
+            default=False,
+            action=ActionYesNo,
+            help=(
+                "whether or not to delete incompletely received files. "
+                "Will attempt to resume the download if such a file exists"
+            ),
+        )
+        parser.add_argument(
+            "--revision",
+            default="main",
+            help=(
+                "the specific model version to use. It can be a branch name, "
+                "a tag name, or a commit id. "
+            ),
+        )
+        parser.add_argument(
+            "--drop-layers-gt",
+            default=None,
+            type=int,
+            help=("drop encoder layers greater than this value."),
+        )
+        parser.add_argument(
+            "--override-dropouts",
+            default=False,
+            action=ActionYesNo,
+            help=(
+                "whether to use the dropout probabilities passed in the "
+                "arguments instead of the defaults in the pretrained model."
+            ),
+        )
+        parser.add_argument(
+            "--override-spec-augment",
+            default=False,
+            action=ActionYesNo,
+            help=(
+                "whether to use the spec augment config. passed in the "
+                "arguments instead of the defaults in the pretrained model."
+            ),
+        )
+        parser.add_argument(
+            "--left-encoder-context",
+            default=16,
+            type=int,
+            help=(
+                "past context frames used by the transformer encoder "
+                "when the signal is evaluated chunk by chunk."
+            ),
+        )
+        parser.add_argument(
+            "--right-encoder-context",
+            default=16,
+            type=int,
+            help=(
+                "future context frames used by the transformer encoder "
+                "when the signal is evaluated chunk by chunk."
+            ),
+        )
+
+        HFWav2VecBase._add_lr_args(parser)
+        HFWav2VecBase._add_lora_args(parser)
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
+
+    @staticmethod
+    def filter_finetune_args(**kwargs):
+        return filter_func_args(HFWav2VecBase.change_config, kwargs)
+
+    @staticmethod
+    def add_finetune_args(parser, prefix=None, skip=set()):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        parser.add_argument(
+            "--override-dropouts",
+            default=False,
+            action=ActionYesNo,
+            help=(
+                "whether to use the dropout probabilities passed in the "
+                "arguments instead of the defaults in the pretrained model."
+            ),
+        )
+        parser.add_argument(
+            "--override-spec-augment",
+            default=False,
+            action=ActionYesNo,
+            help=(
+                "whether to use the spec augment config. passed in the "
+                "arguments instead of the defaults in the pretrained model."
+            ),
+        )
+        parser.add_argument(
+            "--override-lora",
+            default=False,
+            action=ActionYesNo,
+            help=("whether to change the config of LoRA layers in the model."),
+        )
+
+        HFWav2VecBase._add_lr_args(parser)
+        HFWav2VecBase._add_lora_args(parser)
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py
new file mode 100644
index 00000000..1db5fa23
--- /dev/null
+++ b/hyperion/torch/tpm/hf/hf_wavlm.py
@@ -0,0 +1,818 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+import os
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+from transformers import WavLMConfig, WavLMModel
+
+from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs
+from .hf_wav2vec_base import HFWav2VecBase
+
+
+class HFWavLM(HFWav2VecBase):
+    r"""This is a wrapper over the HuggingFace WavLM model.
+    See documentation: https://huggingface.co/docs/transformers/model_doc/wavlm
+
+    This wrapper makes the HuggingFace model have the same interface
+    as other hyperion models. It also adds extra functionality.
+
+    The config. parameters are the same as in the HuggingFace WavLMConfig class.
+
+    Attributes:
+      pretrained_model_path (`str`, defaults to None): file path or HuggingFace Hub path to
+          pre-trained model.
+      normalize_input (`bool`, defaults to True): whether or not to zero-mean unit-variance
+          normalize the input.
+      use_input_attention_mask (`bool`, defaults to False): whether we should input an
+          attention mask to the wav2vec model.
+      vocab_size (`int`, defaults to 32): vocabulary size of the
+          model. Defines the different tokens that can be represented by the
+          *inputs_ids* passed to the forward method.
+      hidden_size (`int`, defaults to 768): dimensionality of the encoder layers and
+          the pooler layer.
+      num_hidden_layers (`int`, defaults to 12): number of hidden layers in the
+          Transformer encoder.
+      num_attention_heads (`int`, defaults to 12): number of attention heads for
+          each attention layer in the Transformer encoder.
+      intermediate_size (`int`, defaults to 3072): dimensionality of the
+          feed-forward layer in the Transformer encoder.
+ hidden_act (`str` or `function`, defaults to `"gelu"`): the non-linear + activation function (function or string) in the encoder and pooler. + If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout (`float`, defaults to 0.1): the dropout probability for all + fully connected layers in the embeddings, encoder, and pooler. + activation_dropout (`float`, defaults to 0.1): the dropout probability for all + intermediate layer in feedforward transformer layers. + attention_dropout (`float`, defaults to 0.1): the dropout ratio for the + attention probabilities. + layerdrop (`float`, defaults to 0.1): prob. of dropping a layer. + initializer_range (`float`, defaults to 0.02): the standard deviation of the + truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, defaults to 1e-12): the epsilon used by the layer + normalization layers. + feat_extract_norm (`str`, defaults to `"group"`): + the norm to be applied to 1D convolutional layers in feature encoder. + One of `"group"` for group normalization of only the first 1D convolutional + layer or `"layer"` for layer normalization of all 1D convolutional layers. + feat_proj_dropout (`float`, defaults to 0.0): the dropout probability for output + of the feature encoder. + feat_extract_activation (`str, `optional`, defaults to `"gelu"`): the non-linear + activation function (function or string) in the 1D convolutional layers of the feature + extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + conv_dim (`Tuple[int]`, defaults to `(512, 512, 512, 512, 512, 512, 512)`): + a tuple of integers defining the number of input and output channels of each 1D convolutional layer in the + feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. + conv_stride (`Tuple[int]`, defaults to `(5, 2, 2, 2, 2, 2, 2)`): + a tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length + of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. + conv_kernel (`Tuple[int]`, defaults to `(10, 3, 3, 3, 3, 3, 3)`): + a tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The + length of *conv_kernel* defines the number of convolutional layers and has to match the length of + *conv_dim*. + conv_bias (`bool`, defaults to `False`): whether the 1D convolutional layers have a bias. + num_conv_pos_embeddings (`int`, defaults to 128): + number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional + embeddings layer. + num_conv_pos_embedding_groups (`int`, defaults to 16): + number of groups of 1D convolutional positional embeddings layer. + do_stable_layer_norm (`bool`, defaults to `False`): + whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is + True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is + False` corresponds to applying layer norm after the attention layer. + apply_spec_augment (`bool`, defaults to `True`): + whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see + [SpecAugment: A Simple Data Augmentation Method for Automatic Speech + Recognition](https://arxiv.org/abs/1904.08779). + mask_time_prob (`float`, defaults to 0.05): + percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. 
The masking
+          procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
+          reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
+          masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
+          actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+      mask_time_length (`int`, defaults to 10):
+          length of vector span along the time axis.
+      mask_time_min_masks (`int`, defaults to 2):
+          the minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
+          irrespectively of `mask_feature_prob`. Only relevant if `mask_time_prob*len(time_axis)/mask_time_length <
+          mask_time_min_masks`
+      mask_feature_prob (`float`, defaults to 0.0):
+          percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
+          masking procedure generates `mask_feature_prob*len(feature_axis)/mask_feature_length` independent masks over
+          the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
+          span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
+          may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
+          True`.
+      mask_feature_length (`int`, defaults to 10):
+          length of vector span along the feature axis.
+      mask_feature_min_masks (`int`, defaults to 0):
+          The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+          step, irrespectively of `mask_feature_prob`. Only relevant if
+          `mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`
+      add_adapter (`bool`, defaults to `False`):
+          whether a convolutional network should be stacked on top of the WavLM Encoder. Can be very useful for
+          warm-starting WavLM for SpeechEncoderDecoder models.
+      adapter_kernel_size (`int`, defaults to 3):
+          kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+      adapter_stride (`int`, defaults to 2):
+          stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+      num_adapter_layers (`int`, defaults to 3):
+          number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is
+          True`.
+      output_hidden_size (`int`, defaults to None):
+          dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant
+          if `add_adapter is True`.
+      cache_dir (str or os.PathLike): path to a directory in which a downloaded pretrained
+          model configuration should be cached if the standard cache should not be used.
+      force_download (`bool`, defaults to `False`): whether or not to force the (re-)download
+          of the model weights and configuration files and override the
+          cached versions if they exist.
+      resume_download (`bool`, defaults to `False`): whether or not to delete incompletely
+          received files. Will attempt to resume the download if such a file exists.
+      revision (`str`, defaults to `"main"`): the specific model version to use.
+          It can be a branch name, a tag name, or a commit id.
+      ignore_pretrained (`bool` defaults to False): if True, it ignores the pretrained_model_path
+          and inits the model from the configuration. This is set to True for models that have already
+          been finetuned.
+      override_dropouts (`bool` defaults to False): if True, it ignores the dropout probs. in the pretrained model
+          and uses the ones passed as arguments.
+      override_spec_augment (`bool` defaults to False): if True, it ignores the spec. augment.
+          configuration in the pretrained model and uses the ones passed in the arguments.
+      left_encoder_context (`int`): past context frames used by the transformer encoder when the signal is evaluated
+          chunk by chunk, if it is too long to fit in GPU.
+      right_encoder_context: (`int`): future context frames used by the transformer encoder.
+      sample_frequency: (`int`) waveform sample frequency used to train the model.
+      feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one.
+      encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one.
+      use_lora: use low-rank adapters
+      lora_components: list of components where we apply LoRA, e.g. [Wq, Wv]
+      lora_rank: rank of LoRA
+      lora_alpha: scale for LoRA
+      lora_dropout: dropout rate for LoRA
+      lora_merge_weights: lora weights are merged with the pretrained weights at inference.
+    """
+
+    def __init__(
+        self,
+        pretrained_model_path: Optional[Union[str, os.PathLike]] = None,
+        normalize_input: bool = True,
+        use_input_attention_mask: bool = False,
+        vocab_size: int = 32,
+        hidden_size: int = 768,
+        num_hidden_layers: int = 12,
+        num_attention_heads: int = 12,
+        intermediate_size: int = 3072,
+        hidden_act: Union[str, Callable] = "gelu",
+        hidden_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        layerdrop: float = 0.1,
+        initializer_range: float = 0.02,
+        layer_norm_eps: float = 1e-12,
+        feat_extract_norm: str = "group",
+        feat_proj_dropout: float = 0.0,
+        feat_extract_activation: Union[str, Callable] = "gelu",
+        conv_dim: Tuple[int] = (512, 512, 512, 512, 512, 512, 512),
+        conv_stride: Tuple[int] = (5, 2, 2, 2, 2, 2, 2),
+        conv_kernel: Tuple[int] = (10, 3, 3, 3, 3, 3, 3),
+        conv_bias: bool = False,
+        num_conv_pos_embeddings: int = 128,
+        num_conv_pos_embedding_groups: int = 16,
+        do_stable_layer_norm: bool = False,
+        apply_spec_augment: bool = True,
+        mask_time_prob: float = 0.05,
+        mask_time_length: int = 10,
+        mask_time_min_masks: int = 2,
+        mask_feature_prob: float = 0.0,
+        mask_feature_length: int = 10,
+        mask_feature_min_masks: int = 0,
+        add_adapter: bool = False,
+        adapter_kernel_size: int = 3,
+        adapter_stride: int = 2,
+        num_adapter_layers: int = 3,
+        output_hidden_size: Optional[int] = None,
+        cache_dir: Union[str, os.PathLike] = "./.cache/hyperion_hf",
+        force_download: bool = False,
+        resume_download: bool = False,
+        revision: str = "main",
+        drop_layers_gt: Optional[int] = None,
+        ignore_pretrained: bool = False,
+        override_dropouts: bool = False,
+        override_spec_augment: bool = False,
+        left_encoder_context: int = 16,
+        right_encoder_context: int = 16,
+        sample_frequency: int = 16000,
+        feat_extract_lr: Optional[float] = None,
+        encoder_lr: Optional[float] = None,
+        use_lora: bool = False,
+        lora_components: List[str] = ["q_proj", "v_proj"],
+        lora_rank: int = 4,
+        lora_alpha: int = 1,
+        lora_dropout: float = 0.0,
+        lora_merge_weights: bool = True,
+    ):
+        super().__init__(
+            pretrained_model_path=pretrained_model_path,
+            normalize_input=normalize_input,
+            use_input_attention_mask=use_input_attention_mask,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            resume_download=resume_download,
+            revision=revision,
+            drop_layers_gt=drop_layers_gt,
+
ignore_pretrained=ignore_pretrained, + override_dropouts=override_dropouts, + override_spec_augment=override_spec_augment, + left_encoder_context=left_encoder_context, + right_encoder_context=right_encoder_context, + sample_frequency=sample_frequency, + feat_extract_lr=feat_extract_lr, + encoder_lr=encoder_lr, + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, + ) + + if pretrained_model_path is not None and not ignore_pretrained: + rank = ddp_get_rank() + if rank == 0: + logging.info(f"Downloading HF model from {pretrained_model_path}") + # rank 0 downloads the model from HF web + self.hf_model = WavLMModel.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + revision=revision, + ) + # all ranks wait until the model is downloaded + ddp_wait_for_all_procs() + if rank > 0: + # the rest of ranks should read the configuration from the cache. + self.hf_model = WavLMModel.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=False, + resume_download=False, + revision=revision, + ) + ddp_wait_for_all_procs() + self.hf_model.config.layerdrop = 0.0 + self.change_config( + override_dropouts=self.override_dropouts, + override_spec_augment=self.override_spec_augment, + hidden_dropout=hidden_dropout, + activation_dropout=activation_dropout, + attention_dropout=attention_dropout, + feat_proj_dropout=feat_proj_dropout, + mask_time_prob=mask_time_prob, + mask_time_length=mask_time_length, + mask_time_min_masks=mask_time_min_masks, + mask_feature_prob=mask_feature_prob, + mask_feature_length=mask_feature_length, + mask_feature_min_masks=mask_feature_min_masks, + ) + else: + hf_config = WavLMConfig( + vocab_size=vocab_size, + hidden_size=hidden_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + hidden_dropout=hidden_dropout, + activation_dropout=activation_dropout, + attention_dropout=attention_dropout, + feat_proj_dropout=feat_proj_dropout, + layerdrop=0.0, # layerdrop, + initializer_range=initializer_range, + layer_norm_eps=layer_norm_eps, + feat_extract_norm=feat_extract_norm, + feat_extract_activation=feat_extract_activation, + conv_dim=conv_dim, + conv_stride=conv_stride, + conv_kernel=conv_kernel, + conv_bias=conv_bias, + num_conv_pos_embeddings=num_conv_pos_embeddings, + num_conv_pos_embedding_groups=num_conv_pos_embedding_groups, + do_stable_layer_norm=do_stable_layer_norm, + apply_spec_augment=apply_spec_augment, + mask_time_prob=mask_time_prob, + mask_time_length=mask_time_length, + mask_time_min_masks=mask_time_min_masks, + mask_feature_prob=mask_feature_prob, + mask_feature_length=mask_feature_length, + mask_feature_min_masks=mask_feature_min_masks, + add_adapter=add_adapter, + adapter_kernel_size=adapter_kernel_size, + adapter_stride=adapter_stride, + num_adapter_layers=num_adapter_layers, + output_hidden_size=output_hidden_size, + ) + self.hf_model = WavLMModel(hf_config) + + if drop_layers_gt is not None: + self.drop_upper_layers(drop_layers_gt) + + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + + self.ignore_pretrained = True + + @property + def num_encoder_layers(self): + return self.hf_config.num_hidden_layers + + @property + def hidden_size(self): + return 
self.hf_config.hidden_size
+
+    def change_dropouts(
+        self,
+        hidden_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        feat_proj_dropout: float = 0.1,
+        **kwargs,
+    ):
+        import transformers.models.wavlm.modeling_wavlm as t
+
+        self.hf_model.config.hidden_dropout = hidden_dropout
+        self.hf_model.config.activation_dropout = activation_dropout
+        self.hf_model.config.attention_dropout = attention_dropout
+        self.hf_model.config.feat_proj_dropout = feat_proj_dropout
+
+        self.hf_model.feature_projection.dropout.p = feat_proj_dropout
+        for module in self.hf_model.encoder.modules():
+            if isinstance(module, nn.Dropout):
+                module.p = hidden_dropout
+
+        # second pass: attention stores its rate as a float, and the feed-forward
+        # intermediate dropout uses activation_dropout rather than hidden_dropout
+        for module in self.hf_model.encoder.modules():
+            if isinstance(module, t.WavLMAttention):
+                module.dropout = attention_dropout
+            if isinstance(module, t.WavLMFeedForward):
+                module.intermediate_dropout.p = activation_dropout
+
+    def drop_upper_layers(self, max_layers: int):
+        if max_layers >= self.hf_config.num_hidden_layers:
+            return
+
+        layers = self.hf_model.encoder.layers
+        self.hf_model.encoder.layers = nn.ModuleList(
+            [l for i, l in enumerate(layers) if i < max_layers]
+        )
+        self.hf_config.num_hidden_layers = max_layers
+
+        if self.hf_model.adapter is not None:
+            del self.hf_model.adapter
+            self.hf_model.adapter = None
+            self.hf_config.add_adapter = False
+
+    def get_config(self):
+        """Returns the configuration arguments for the object in a dictionary."""
+        config = self.hf_model.config.to_dict()
+        config = self.filter_args(**config)
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    @staticmethod
+    def filter_args(**kwargs):
+        args_base = HFWav2VecBase.filter_args(**kwargs)
+        valid_args = (
+            "vocab_size",
+            "hidden_size",
+            "num_hidden_layers",
+            "num_attention_heads",
+            "intermediate_size",
+            "hidden_act",
+            "hidden_dropout",
+            "activation_dropout",
+            "attention_dropout",
+            "feat_proj_dropout",
+            "layerdrop",
+            "initializer_range",
+            "layer_norm_eps",
+            "feat_extract_norm",
+            "feat_extract_activation",
+            "conv_dim",
+            "conv_stride",
+            "conv_kernel",
+            "conv_bias",
+            "num_conv_pos_embeddings",
+            "num_conv_pos_embedding_groups",
+            "do_stable_layer_norm",
+            "apply_spec_augment",
+            "mask_time_prob",
+            "mask_time_length",
+            "mask_time_min_masks",
+            "mask_feature_prob",
+            "mask_feature_length",
+            "mask_feature_min_masks",
+            "add_adapter",
+            "adapter_kernel_size",
+            "adapter_stride",
+            "num_adapter_layers",
+            "output_hidden_size",
+        )
+        args = dict((k, kwargs[k]) for k in valid_args if k in kwargs)
+        args.update(args_base)
+        return args
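+
+    # Sketch of the assumed CLI flow: add_class_args() registers the options
+    # below under a prefix and filter_args() picks the valid subset back out:
+    #     parser = ArgumentParser()
+    #     HFWavLM.add_class_args(parser, prefix="hf_feats")
+    #     args = parser.parse_args()
+    #     model = HFWavLM(**HFWavLM.filter_args(**vars(args.hf_feats)))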
+ ), + ) + parser.add_argument( + "--hidden-size", + default=768, + type=int, + help=("dimensionality of the encoder layers and the pooler layer."), + ) + parser.add_argument( + "--num-hidden-layers", + default=12, + type=int, + help=("number of hidden layers in the Transformer encoder"), + ) + parser.add_argument( + "--num-attention-heads", + default=12, + type=int, + help=( + "number of attention heads for " + "each attention layer in the Transformer encoder" + ), + ) + parser.add_argument( + "--intermediate-size", + default=3072, + type=int, + help=( + "dimensionality of the " "feed-forward layer in the Transformer encoder" + ), + ) + parser.add_argument( + "--hidden-act", + default="gelu", + choices=["gelu", "relu", "selu", "gelu_new"], + help=( + "the non-linear " + "activation function (function or string) in the encoder and pooler" + ), + ) + parser.add_argument( + "--hidden-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "fully connected layers in the embeddings, encoder, and pooler" + ), + ) + parser.add_argument( + "--activation-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "intermediate layer in feedforward transformer layers" + ), + ) + parser.add_argument( + "--attention-dropout", + default=0.1, + type=float, + help=("the dropout ratio for the attention probabilities"), + ) + parser.add_argument( + "--layerdrop", + default=0.1, + type=float, + help=("prob. of dropping a layer"), + ) + parser.add_argument( + "--initializer-range", + default=0.02, + type=float, + help=( + "the standard deviation of the " + "truncated_normal_initializer for initializing all weight matrices" + ), + ) + parser.add_argument( + "--layer-norm-eps", + default=1e-12, + type=float, + help=( + "the standard deviation of the " + "truncated_normal_initializer for initializing all weight matrices" + ), + ) + parser.add_argument( + "--feat-extract-norm", + default="group", + choices=["group", "layer"], + help=( + "the norm to be applied to 1D convolutional layers in feature encoder. " + "One of `group` for group normalization of only the first 1D convolutional " + "layer or `layer` for layer normalization of all 1D convolutional layers" + ), + ) + parser.add_argument( + "--feat-proj-dropout", + default=0.1, + type=float, + help=("the dropout probability for output of the feature encoder"), + ) + parser.add_argument( + "--feat-extract-activation", + default="gelu", + choices=["gelu", "relu", "selu", "gelu_new"], + help=( + "the non-linear activation function (function or string) in the 1D " + "convolutional layers of the feature extractor" + ), + ) + parser.add_argument( + "--conv-dim", + default=[512, 512, 512, 512, 512, 512, 512], + nargs="+", + type=int, + help=( + "a tuple of integers defining the number of input and output channels of each 1D convolutional layer in the " + "feature encoder. 
The length of *conv_dim* defines the number of 1D convolutional layers" + ), + ) + parser.add_argument( + "--conv-stride", + default=[5, 2, 2, 2, 2, 2, 2], + nargs="+", + type=int, + help=( + "a tuple of integers defining the stride of each 1D convolutional layer in the feature encoder" + ), + ) + parser.add_argument( + "--conv-kernel", + default=[10, 3, 3, 3, 3, 3, 3], + nargs="+", + type=int, + help=( + "a tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder" + ), + ) + parser.add_argument( + "--conv-bias", + default=False, + action=ActionYesNo, + help=("whether the 1D convolutional layers have a bias"), + ) + parser.add_argument( + "--num-conv-pos-embeddings", + default=128, + type=int, + help=( + "number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional " + "embeddings layer" + ), + ) + parser.add_argument( + "--num-conv-pos-embedding-groups", + default=16, + type=int, + help=("number of groups of 1D convolutional positional embeddings layer"), + ) + parser.add_argument( + "--do-stable-layer-norm", + default=False, + action=ActionYesNo, + help=( + "whether to apply *stable* layer norm architecture of the Transformer encoder" + ), + ) + parser.add_argument( + "--apply-spec-augment", + default=True, + action=ActionYesNo, + help=( + "whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder" + ), + ) + parser.add_argument( + "--mask-time-prob", + default=0.05, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the time axis which will be masked" + ), + ) + parser.add_argument( + "--mask-time-length", + default=10, + type=int, + help=("length of vector span along the time axis"), + ) + parser.add_argument( + "--mask-time-min-masks", + default=2, + type=int, + help=( + "the minimum number of masks of length `mask_time_length` generated along the time axis" + ), + ) + parser.add_argument( + "--mask-feature-prob", + default=0.0, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked" + ), + ) + parser.add_argument( + "--mask-feature-length", + default=10, + type=int, + help=(" length of vector span along the feature axis"), + ) + parser.add_argument( + "--mask-feature-min-masks", + default=0, + type=int, + help=( + "The minimum number of masks of length `mask_feature_length` generated along the feature axis" + ), + ) + parser.add_argument( + "--add-adapter", + default=False, + action=ActionYesNo, + help=( + "whether a convolutional network should be stacked on top of the WavLM Encoder" + ), + ) + parser.add_argument( + "--adapter-kernel-size", + default=3, + type=int, + help=("kernel size of the convolutional layers in the adapter network"), + ) + parser.add_argument( + "--adapter-stride", + default=2, + type=int, + help=("stride of the convolutional layers in the adapter network"), + ) + parser.add_argument( + "--num-adapter-layers", + default=3, + type=int, + help=( + "number of convolutional layers that should be used in the adapter network" + ), + ) + parser.add_argument( + "--output-hidden-size", + default=None, + type=int, + help=( + "dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*." 
+ " Only relevant if `add_adapter is True" + ), + ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + args_base = HFWav2VecBase.filter_finetune_args(**kwargs) + valid_args = ( + "hidden_dropout", + "activation_dropout", + "attention_dropout", + "feat_proj_dropout", + "apply_spec_augment", + "mask_time_prob", + "mask_time_length", + "mask_time_min_masks", + "mask_feature_prob", + "mask_feature_length", + "mask_feature_min_masks", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args.update(args_base) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2VecBase.add_finetune_args(parser) + parser.add_argument( + "--hidden-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "fully connected layers in the embeddings, encoder, and pooler" + ), + ) + parser.add_argument( + "--activation-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "intermediate layer in feedforward transformer layers" + ), + ) + parser.add_argument( + "--attention-dropout", + default=0.1, + type=float, + help=("the dropout ratio for the attention probabilities"), + ) + parser.add_argument( + "--apply-spec-augment", + default=True, + action=ActionYesNo, + help=( + "whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder" + ), + ) + parser.add_argument( + "--mask-time-prob", + default=0.05, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the time axis which will be masked" + ), + ) + parser.add_argument( + "--mask-time-length", + default=10, + type=int, + help=("length of vector span along the time axis"), + ) + parser.add_argument( + "--mask-time-min-masks", + default=2, + type=int, + help=( + "the minimum number of masks of length `mask_time_length` generated along the time axis" + ), + ) + parser.add_argument( + "--mask-feature-prob", + default=0.0, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked" + ), + ) + parser.add_argument( + "--mask-feature-length", + default=10, + type=int, + help=(" length of vector span along the feature axis"), + ) + parser.add_argument( + "--mask-feature-min-masks", + default=0, + type=int, + help=( + "The minimum number of masks of length `mask_feature_length` generated along the feature axis" + ), + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/__init__.py b/hyperion/torch/trainers/__init__.py index 8fef7df5..94326857 100644 --- a/hyperion/torch/trainers/__init__.py +++ b/hyperion/torch/trainers/__init__.py @@ -3,17 +3,16 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from .dino_xvector_trainer import DINOXVectorTrainer +from .dvae_trainer import DVAETrainer from .torch_trainer import TorchTrainer - +from .transducer_trainer import TransducerTrainer +from .vae_trainer import VAETrainer +from .vq_dvae_trainer import VQDVAETrainer +from .vq_vae_trainer import VQVAETrainer +from .xvector_adv_trainer import XVectorAdvTrainer +from .xvector_adv_trainer_from_wav import XVectorAdvTrainerFromWav from .xvector_trainer import XVectorTrainer from .xvector_trainer_deep_feat_reg import XVectorTrainerDeepFeatReg -from 
.xvector_adv_trainer import XVectorAdvTrainer - -from .xvector_trainer_from_wav import XVectorTrainerFromWav from .xvector_trainer_deep_feat_reg_from_wav import XVectorTrainerDeepFeatRegFromWav -from .xvector_adv_trainer_from_wav import XVectorAdvTrainerFromWav - -from .vae_trainer import VAETrainer -from .dvae_trainer import DVAETrainer -from .vq_vae_trainer import VQVAETrainer -from .vq_dvae_trainer import VQDVAETrainer +from .xvector_trainer_from_wav import XVectorTrainerFromWav diff --git a/hyperion/torch/trainers/ae_trainer.py b/hyperion/torch/trainers/ae_trainer.py index 8646c79f..4004a565 100644 --- a/hyperion/torch/trainers/ae_trainer.py +++ b/hyperion/torch/trainers/ae_trainer.py @@ -3,16 +3,18 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os from collections import OrderedDict as ODict -import logging - import torch +import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser -from ..utils import MetricAcc -from .torch_trainer import TorchTrainer +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset +from .torch_trainer import AMPDType, TorchTrainer class AETrainer(TorchTrainer): @@ -34,6 +36,7 @@ class AETrainer(TorchTrainer): ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -43,8 +46,10 @@ class AETrainer(TorchTrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp - + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. """ def __init__( @@ -56,15 +61,18 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", - train_mode="train", + train_mode="full", use_amp=False, - log_interval=10, + amp_dtype=AMPDType.FLOAT16, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -73,38 +81,16 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, + input_key="x", + target_key="x", ): - if loss is None: loss = nn.MSELoss() - super().__init__( - model, - loss, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) def train_epoch(self, data_loader): """Training epoch loop @@ -112,26 +98,21 @@ def train_epoch(self, data_loader): Args: data_loader: pytorch data loader returning features and class labels. 
""" - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() - self.set_train_mode() + self.model.train() for batch, data in enumerate(data_loader): - - if isinstance(data, (tuple, list)): - data, _ = data - self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data = data.to(self.device) - batch_size = data.shape[0] - - with self.amp_autocast(): - output = self.model(data) - loss = self.loss(output, data).mean() / self.grad_acc_steps + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) + with amp.autocast(enabled=self.use_amp): + output = self.model(input_data) + loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -139,53 +120,68 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, data) + batch_metrics[k] = metric(output, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) - # total_batches += 1 logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() with torch.no_grad(): if swa_update_bn: log_tag = "train_" - self.set_train_mode() + self.model.train() else: log_tag = "val_" self.model.eval() for batch, data in enumerate(data_loader): - if isinstance(data, (tuple, list)): - data, _ = data - - data = data.to(self.device) - batch_size = data.shape[0] - with self.amp_autocast(): - output = self.model(data) - loss = self.loss(output, data) + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) + with amp.autocast(enabled=self.use_amp): + output = self.model(input_data) + loss = self.loss(output, target) batch_metrics["loss"] = loss.mean().item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, data) + batch_metrics[k] = metric(output, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics logs = ODict((log_tag + k, v) for k, v in logs.items()) return logs + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + TorchTrainer.add_class_args( + parser, train_modes, skip=skip.union({"target_key"}) + ) + if "target_key" not in skip: + parser.add_argument( + "--target-key", default="x", help="dict. 
key for nnet targets"
+            )
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
diff --git a/hyperion/torch/trainers/dino_xvector_trainer.py b/hyperion/torch/trainers/dino_xvector_trainer.py
new file mode 100644
index 00000000..6573c21a
--- /dev/null
+++ b/hyperion/torch/trainers/dino_xvector_trainer.py
@@ -0,0 +1,485 @@
+"""
+ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import logging
+import os
+from collections import OrderedDict as ODict
+
+import torch
+import torch.cuda.amp as amp
+import torch.nn as nn
+from jsonargparse import ActionParser, ArgumentParser
+from torch.distributed.elastic.multiprocessing.errors import record
+
+from ...utils.misc import filter_func_args
+from ..optim import ExpMovingAvg as EMA
+from ..utils import MetricAcc, TorchDDP, tensors_subset
+from .torch_trainer import AMPDType, DDPType, TorchTrainer
+
+
+class DINOXVectorTrainer(TorchTrainer):
+    """Trainer for DINO-style self-supervised training of x-vector models.
+
+    Attributes:
+      student_model: student x-vector model object.
+      teacher_model: teacher x-vector model object, updated as an EMA of the student.
+      loss: DINO cross-entropy loss object.
+      optim: pytorch optimizer object or options dict for the student model.
+      teacher_optim: options dict for the teacher EMA update.
+      cosine_loss: optional cosine loss between student and teacher embeddings.
+      epochs: max. number of epochs
+      exp_path: experiment output path
+      cur_epoch: current epoch
+      grad_acc_steps: gradient accumulation steps to simulate larger batch size.
+      device: cpu/gpu device
+      metrics: extra metrics to compute besides cxe.
+      lrsched: learning rate scheduler object or options dict
+      loggers: LoggerList object, loggers write training progress to std. output and file.
+        If None, it uses default loggers.
+      ddp: if True use distributed data parallel training
+      ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp)
+      train_mode: training mode in ['full', 'frozen']
+      freeze_output_layer_steps: freeze the output layer during the first optim. steps.
+      use_amp: uses mixed precision training.
+      amp_dtype: float16 | bfloat16
+      log_interval: number of optim. steps between log outputs
+      use_tensorboard: use tensorboard logger
+      use_wandb: use wandb logger
+      wandb: wandb dictionary of options
+      grad_clip: norm to clip gradients, if 0 there is no clipping
+      grad_clip_norm: norm type to clip gradients
+      swa_start: epoch to start doing swa
+      swa_lr: SWA learning rate
+      swa_anneal_epochs: SWA learning rate anneal epochs
+      save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch
+      cpu_offload: CPU offload of gradients when using fully sharded ddp
+      input_key: dict. key for nnet input.
+ """ + + def __init__( + self, + student_model, + teacher_model, + loss, + optim, + teacher_optim, + cosine_loss=None, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + eff_batch_size=None, + device=None, + metrics=None, + lrsched=None, + wdsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + train_mode="full", + freeze_output_layer_steps=3000, + use_amp=False, + amp_dtype=AMPDType.FLOAT16, + log_interval=1000, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + save_interval_steps=None, + cpu_offload=False, + input_key="x", + ): + super_args = filter_func_args(super().__init__, locals()) + self.teacher_model = teacher_model + self.teacher_optim = teacher_optim + self.freeze_output_layer_steps = freeze_output_layer_steps + self.cosine_loss = cosine_loss + super().__init__(student_model, **super_args) + + def prepare_models_for_training(self): + super().prepare_models_for_training() + self.teacher_model, self.teacher_optimizer = self._prepare_model_for_ema( + self.teacher_model, + self.teacher_optim, + self.device, + self.ddp, + ) + + def _prepare_model_for_ema(self, model, optim, device, ddp): + if device is not None: + model.to(device) + + optimizer = EMA(model.parameters(), **optim) + + if ddp: + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + + return model, optimizer + + def set_train_mode(self): + super().set_train_mode() + self.teacher_model.freeze() + + @torch.no_grad() + def update_teacher_model(self): + self.teacher_optimizer.step(self.model.parameters()) + + @staticmethod + def get_augs_keys(batch, base_key, subset, skip=set()): + base_key = f"{base_key}_{subset}" + keys = [] + + chunk_idx = 0 + while True: + found_chunk = 0 + chunk_key = f"{base_key}_{chunk_idx}" + if chunk_key in batch: + if chunk_key not in skip: + keys.append(chunk_key) + found_chunk = True + aug_idx = 0 + while True: + aug_key = f"{chunk_key}_aug_{aug_idx}" + if aug_key in batch: + if aug_key not in skip: + keys.append(aug_key) + + aug_idx += 1 + found_chunk = True + else: + break + + if not found_chunk: + break + + chunk_idx += 1 + + return keys + + @record + def train_epoch(self, data_loader): + """Training epoch loop + + Args: + data_loader: pytorch data loader returning features and class labels. 
+ """ + metric_acc = MetricAcc(device=self.device) + batch_metrics = ODict() + self.model.train() + self.teacher_model.train() + self.loss.update_temp(self.cur_epoch) + self.loss.train() + if self.cosine_loss is not None: + self.cosine_loss.update_scale(self.cur_epoch) + + for batch, data in enumerate(data_loader): + self.loggers.on_batch_begin(batch) + + if batch % self.grad_acc_steps == 0: + self.optimizer.zero_grad() + + teacher_keys = self.get_augs_keys(data, self.input_key, "teacher") + student_keys = self.get_augs_keys(data, self.input_key, "student") + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): + with torch.no_grad(): + teacher_data = tensors_subset(data, teacher_keys, self.device) + batch_size = teacher_data[0].size(0) + num_teacher_crops = len(teacher_data) + teacher_data = torch.cat(teacher_data, dim=0) + teacher_out = self.teacher_model(teacher_data) + assert not torch.any( + torch.isnan(teacher_out.logits) + ), "teacher is nan" + assert not torch.any( + torch.isinf(teacher_out.logits) + ), "teacher is inf" + + if num_teacher_crops > 1: + student_out1 = self.model(teacher_data) + assert not torch.any(torch.isnan(student_out1.logits)), "s1 is nan" + assert not torch.any(torch.isinf(student_out1.logits)), "s1 is inf" + + student_data = tensors_subset(data, student_keys, self.device) + num_student_crops = len(student_data) + student_data = torch.cat(student_data, dim=0) + student_out2 = self.model(student_data) + assert not torch.any(torch.isnan(student_out2.logits)), "s2 is nan" + assert not torch.any(torch.isinf(student_out2.logits)), "s2 is inf" + if num_teacher_crops > 1: + student_out_logits = torch.cat( + (student_out1.logits, student_out2.logits), dim=0 + ) + if self.cosine_loss is not None: + student_out_embeds = torch.cat( + (student_out1.xvector, student_out2.xvector), dim=0 + ) + num_student_crops += num_teacher_crops + else: + student_out_logits = student_out2.logits + student_out_embeds = student_out2.xvector + + loss_dino = self.loss( + student_out_logits, + teacher_out.logits, + num_student_crops, + num_teacher_crops, + ) + loss = loss_dino + if self.cosine_loss is not None: + scaled_loss_cosine, loss_cosine = self.cosine_loss( + student_out_embeds, + teacher_out.xvector, + num_student_crops, + num_teacher_crops, + ) + loss = loss_dino + scaled_loss_cosine + + loss = loss / self.grad_acc_steps + assert not torch.isnan( + loss + ), f"loss is nan {batch} {torch.mean(teacher_out)} {torch.mean(student_out1)} {torch.mean(student_out2)}" + + if self.use_amp: + self.grad_scaler.scale(loss).backward() + else: + loss.backward() + + if (batch + 1) % self.grad_acc_steps == 0: + self.cur_batch = batch + 1 + if self.freeze_output_layer_steps > self.global_step: + self.model.cancel_output_layer_grads() + + self.update_model() + self.update_teacher_model() + self.save_checkpoint(partial=True) + + batch_metrics["loss"] = loss.item() * self.grad_acc_steps + if self.cosine_loss is not None: + batch_metrics["loss_dino"] = loss_dino.item() + batch_metrics["loss_cosine"] = loss_cosine.item() + + metric_acc.update(batch_metrics, batch_size) + logs = metric_acc.metrics + lrs = self._get_lrs() + logs.update(lrs) + logs["ema_momentum"] = self.teacher_optimizer.momentum + self.loggers.on_batch_end(logs=logs, batch_size=batch_size) + + logs = metric_acc.metrics + logs = ODict(("train_" + k, v) for k, v in logs.items()) + lrs = self._get_lrs() + logs.update(lrs) + logs.update(self._get_wds()) + logs["ema_momentum"] = self.teacher_optimizer.momentum + return logs + + 
+    @torch.no_grad()
+    def validation_epoch(self, data_loader, swa_update_bn=False):
+        """Validation epoch loop
+
+        Args:
+          data_loader: PyTorch data loader returning input/output pairs.
+          swa_update_bn: whether or not to update batch-norm layers in SWA.
+        """
+        metric_acc = MetricAcc(self.device)
+        batch_metrics = ODict()
+        self.teacher_model.eval()
+        self.loss.eval()
+
+        if swa_update_bn:
+            log_tag = "train_"
+            self.model.train()
+        else:
+            log_tag = "val_"
+            self.model.eval()
+
+        for batch, data in enumerate(data_loader):
+            teacher_keys = self.get_augs_keys(data, self.input_key, "teacher")
+            student_keys = self.get_augs_keys(data, self.input_key, "student")
+            with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype):
+                teacher_data = tensors_subset(data, teacher_keys, self.device)
+                batch_size = teacher_data[0].size(0)
+                num_teacher_crops = len(teacher_data)
+                teacher_data = torch.cat(teacher_data, dim=0)
+                teacher_out = self.teacher_model(teacher_data)
+                assert not torch.any(torch.isnan(teacher_out.logits)), "teacher is nan"
+                assert not torch.any(torch.isinf(teacher_out.logits)), "teacher is inf"
+
+                if num_teacher_crops > 1:
+                    student_out1 = self.model(teacher_data)
+                    assert not torch.any(torch.isnan(student_out1.logits)), "s1 is nan"
+                    assert not torch.any(torch.isinf(student_out1.logits)), "s1 is inf"
+
+                student_data = tensors_subset(data, student_keys, self.device)
+                num_student_crops = len(student_data)
+                student_data = torch.cat(student_data, dim=0)
+                student_out2 = self.model(student_data)
+                assert not torch.any(torch.isnan(student_out2.logits)), "s2 is nan"
+                assert not torch.any(torch.isinf(student_out2.logits)), "s2 is inf"
+                if num_teacher_crops > 1:
+                    student_out_logits = torch.cat(
+                        (student_out1.logits, student_out2.logits), dim=0
+                    )
+                    if self.cosine_loss is not None:
+                        student_out_embeds = torch.cat(
+                            (student_out1.xvector, student_out2.xvector), dim=0
+                        )
+                    num_student_crops += num_teacher_crops
+                else:
+                    student_out_logits = student_out2.logits
+                    student_out_embeds = student_out2.xvector
+
+                loss_dino = self.loss(
+                    student_out_logits,
+                    teacher_out.logits,
+                    num_student_crops,
+                    num_teacher_crops,
+                )
+                loss = loss_dino
+                if self.cosine_loss is not None:
+                    scaled_loss_cosine, loss_cosine = self.cosine_loss(
+                        student_out_embeds,
+                        teacher_out.xvector,
+                        num_student_crops,
+                        num_teacher_crops,
+                    )
+                    loss = loss_dino + scaled_loss_cosine
+
+            batch_metrics["loss"] = loss.item()
+            if self.cosine_loss is not None:
+                batch_metrics["loss_dino"] = loss_dino.item()
+                batch_metrics["loss_cosine"] = loss_cosine.item()
+
+            metric_acc.update(batch_metrics, batch_size)
+
+        logs = metric_acc.metrics
+        logs = ODict((log_tag + k, v) for k, v in logs.items())
+        return logs
+
+    def _old_load_checkpoint(self, checkpoint):
+        self.teacher_model.load_state_dict(checkpoint["teacher_model_state_dict"])
+        self.teacher_optimizer.load_state_dict(
+            checkpoint["teacher_optimizer_state_dict"]
+        )
+        return super()._load_checkpoint(checkpoint)
+
+    def _load_checkpoint(self, checkpoint, teacher_checkpoint, loss_checkpoint=None):
+        self.teacher_model.load_state_dict(teacher_checkpoint["model_state_dict"])
+        self.teacher_optimizer.load_state_dict(
+            teacher_checkpoint["optimizer_state_dict"]
+        )
+        if loss_checkpoint is not None:
+            self.loss.load_state_dict(loss_checkpoint["model_state_dict"])
+        return super()._load_checkpoint(checkpoint)
+    def load_checkpoint(self, epoch, step):
+        checkpoint = self.load_model_checkpoint("model", epoch, step)
+        teacher_checkpoint = self.load_model_checkpoint("teacher_model", epoch, step)
+        try:
+            loss_checkpoint = self.load_model_checkpoint("dino_loss", epoch, step)
+        except Exception:
+            logging.warning(
+                "dino loss checkpoint not found, initial center will be zero-vector"
+            )
+            loss_checkpoint = None
+        return self._load_checkpoint(checkpoint, teacher_checkpoint, loss_checkpoint)
+
+    def checkpoint(self, logs=None):
+        checkpoint = super().checkpoint(logs)
+        self.teacher_model.train()
+        checkpoint["teacher_model_state_dict"] = self.teacher_model.state_dict()
+        checkpoint["teacher_optimizer_state_dict"] = self.teacher_optimizer.state_dict()
+        return checkpoint
+
+    def teacher_checkpoint(self, logs=None):
+        """Creates a checkpoint of the teacher model, for saving and later recovery.
+
+        Args:
+          logs: logs containing the current value of the metrics.
+        """
+        self.teacher_model.train()
+        checkpoint = {
+            "epoch": self.cur_epoch,
+            "batch": self.cur_batch,
+            "global_step": self.global_step,
+            "model_cfg": self.teacher_model.get_config(),
+            "model_state_dict": self.teacher_model.state_dict(),
+            "optimizer_state_dict": self.teacher_optimizer.state_dict(),
+        }
+
+        if logs is not None:
+            checkpoint["logs"] = logs
+
+        return checkpoint
+
+    def dino_loss_checkpoint(self, logs=None):
+        self.loss.train()
+        checkpoint = {
+            "epoch": self.cur_epoch,
+            "batch": self.cur_batch,
+            "global_step": self.global_step,
+            "model_state_dict": self.loss.state_dict(),
+        }
+        return checkpoint
+
+    def save_checkpoint(self, logs=None, partial: bool = False):
+        """Saves a checkpoint of the training status
+
+        Args:
+          logs: logs containing the current value of the metrics.
+          partial: if True, it is saving in the middle of the epoch
+        """
+        if partial and not self.save_partial_checkpoint():
+            return
+
+        if self.ddp and (
+            self.ddp_type == DDPType.OSS_DDP or self.ddp_type == DDPType.OSS_SHARDED_DDP
+        ):
+            # Not sure what this does, just copying from the example in
+            # https://github.com/facebookresearch/fairscale/blob/master/benchmarks/oss.py
+            # Check the checkpointing in the case of the OSS optimizer
+            # Memory usage could spill over from there
+            # optimizer = cast(OSS, optimizer)
+            self.optimizer.consolidate_state_dict()
+
+        if self.rank != 0:
+            return
+
+        checkpoint = self.checkpoint(logs)
+        self.save_model_checkpoint("model", checkpoint, partial=partial)
+
+        teacher_checkpoint = self.teacher_checkpoint(logs)
+        self.save_model_checkpoint("teacher_model", teacher_checkpoint, partial=partial)
+
+        loss_checkpoint = self.dino_loss_checkpoint()
+        self.save_model_checkpoint("dino_loss", loss_checkpoint, partial=partial)
+
+    @staticmethod
+    def filter_args(**kwargs):
+        args = filter_func_args(DINOXVectorTrainer.__init__, kwargs)
+        return args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None, train_modes=None, skip=set()):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        skip.add("teacher_key")
+        TorchTrainer.add_class_args(parser, train_modes=train_modes)
+        EMA.add_class_args(parser, prefix="teacher_optim")
+        parser.add_argument(
+            "--freeze-output-layer-steps",
+            default=1500,
+            type=int,
+            help="freeze the output layer during the first updates of the model",
+        )
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
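Before moving on to the refactored DVAE trainer, note how DINO batches are keyed: `get_augs_keys` above scans `<input_key>_<subset>_<chunk>` entries plus optional `_aug_<i>` variants until a key is missing. A hypothetical batch showing which keys it would collect:

```python
import torch

def wav():  # stand-in for a (batch, samples) waveform tensor
    return torch.randn(4, 32000)

batch = {
    "x_teacher_0": wav(),        # teacher crop 0
    "x_teacher_1": wav(),        # teacher crop 1
    "x_student_0": wav(),        # student crop 0
    "x_student_0_aug_0": wav(),  # augmented copy of student crop 0
    "x_student_1": wav(),        # student crop 1
}
# get_augs_keys(batch, "x", "teacher") -> ["x_teacher_0", "x_teacher_1"]
# get_augs_keys(batch, "x", "student")
#   -> ["x_student_0", "x_student_0_aug_0", "x_student_1"]
```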
diff --git a/hyperion/torch/trainers/dvae_trainer.py b/hyperion/torch/trainers/dvae_trainer.py
index 0d9b1de3..6b391912 100644
--- a/hyperion/torch/trainers/dvae_trainer.py
+++ b/hyperion/torch/trainers/dvae_trainer.py
@@ -3,16 +3,18 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
+import logging
 import os
 from collections import OrderedDict as ODict
 
-import logging
-
 import torch
+import torch.cuda.amp as amp
 import torch.nn as nn
+from jsonargparse import ActionParser, ArgumentParser
 
-from ..utils import MetricAcc
-from .torch_trainer import TorchTrainer
+from ...utils.misc import filter_func_args
+from ..utils import MetricAcc, tensors_subset
+from .torch_trainer import AMPDType, TorchTrainer
 
 
 class DVAETrainer(TorchTrainer):
@@ -33,6 +35,7 @@ class DVAETrainer(TorchTrainer):
       ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp)
       train_mode: training mode in ['train', 'ft-full', 'ft-last-layer']
       use_amp: uses mixed precision training.
+      amp_dtype: "float16" | "bfloat16"
       log_interval: number of optim. steps between log outputs
       use_tensorboard: use tensorboard logger
       use_wandb: use wandb logger
@@ -42,8 +45,10 @@ class DVAETrainer(TorchTrainer):
       swa_start: epoch to start doing swa
       swa_lr: SWA learning rate
       swa_anneal_epochs: SWA learning rate anneal epochs
+      save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch
       cpu_offload: CPU offload of gradients when using fully sharded ddp
-
+      input_key: dict. key for nnet input.
+      target_key: dict. key for nnet targets.
     """
 
     def __init__(
@@ -54,15 +59,18 @@ def __init__(
         exp_path="./train",
         cur_epoch=0,
         grad_acc_steps=1,
+        eff_batch_size=None,
         device=None,
         metrics=None,
         lrsched=None,
+        wdsched=None,
         loggers=None,
         ddp=False,
         ddp_type="ddp",
-        train_mode="train",
+        train_mode="full",
         use_amp=False,
-        log_interval=10,
+        amp_dtype=AMPDType.FLOAT16,
+        log_interval=1000,
         use_tensorboard=False,
         use_wandb=False,
         wandb={},
@@ -71,36 +79,42 @@ def __init__(
         swa_start=0,
         swa_lr=1e-3,
         swa_anneal_epochs=10,
+        save_interval_steps=None,
         cpu_offload=False,
+        input_key="x_aug",
+        target_key="x",
     ):
-
-        super().__init__(
-            model,
-            None,
-            optim,
-            epochs,
-            exp_path,
-            cur_epoch=cur_epoch,
-            grad_acc_steps=grad_acc_steps,
-            device=device,
-            metrics=metrics,
-            lrsched=lrsched,
-            loggers=loggers,
-            ddp=ddp,
-            ddp_type=ddp_type,
-            train_mode=train_mode,
-            use_amp=use_amp,
-            log_interval=log_interval,
-            use_tensorboard=use_tensorboard,
-            use_wandb=use_wandb,
-            wandb=wandb,
-            grad_clip=grad_clip,
-            grad_clip_norm=grad_clip_norm,
-            swa_start=swa_start,
-            swa_lr=swa_lr,
-            swa_anneal_epochs=swa_anneal_epochs,
-            cpu_offload=cpu_offload,
-        )
+        super_args = filter_func_args(super().__init__, locals())
+        super().__init__(**super_args)
 
     def train_epoch(self, data_loader):
         """Training epoch loop
 
         Args:
           data_loader: pytorch data loader returning noisy and clean features
         """
-
+        batch_keys = [self.input_key, self.target_key]
         metric_acc = MetricAcc(device=self.device)
         batch_metrics = ODict()
-        self.set_train_mode()
+        self.model.train()
         for batch, data in enumerate(data_loader):
-
-            assert isinstance(data, (tuple, list))
-            x = data[0]
-            x_target = data[1]
-
             self.loggers.on_batch_begin(batch)
             if batch % self.grad_acc_steps == 0:
                 self.optimizer.zero_grad()
 
-            x = x.to(self.device)
-            x_target = x_target.to(self.device)
-            batch_size = x.shape[0]
-
-            with self.amp_autocast():
-                output = self.model(x, x_target=x_target, return_x_mean=True)
+            input_data, target = tensors_subset(data, batch_keys, self.device)
+            batch_size = input_data.size(0)
+            with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype):
+                output = self.model(input_data, x_target=target, return_x_mean=True)
                 elbo = output["elbo"].mean()
                 loss = -elbo / self.grad_acc_steps
+                x_hat = output["x_mean"]
@@ -141,24 +148,26 @@ def train_epoch(self, data_loader):
                 loss.backward()
 
             if (batch + 1) % self.grad_acc_steps == 0:
-                if self.lr_scheduler is not None and not self.in_swa:
-                    self.lr_scheduler.on_opt_step()
+                self.cur_batch = batch + 1
                 self.update_model()
+                self.save_checkpoint(partial=True)
 
             batch_metrics["elbo"] = elbo.item()
             for metric in ["log_px", "kldiv_z"]:
                 batch_metrics[metric] = output[metric].mean().item()
             for k, metric in self.metrics.items():
-                batch_metrics[k] = metric(x_hat, x_target)
+                batch_metrics[k] = metric(x_hat, target)
 
             metric_acc.update(batch_metrics, batch_size)
             logs = metric_acc.metrics
-            logs["lr"] = self._get_lr()
+            lrs = self._get_lrs()
+            logs.update(lrs)
             self.loggers.on_batch_end(logs=logs, batch_size=batch_size)
 
         logs = metric_acc.metrics
         logs = ODict(("train_" + k, v) for k, v in logs.items())
-        logs["lr"] = self._get_lr()
+        lrs = self._get_lrs()
+        logs.update(lrs)
         return logs
 
     def validation_epoch(self, data_loader, swa_update_bn=False):
@@ -167,37 +176,53 @@ def validation_epoch(self, data_loader, swa_update_bn=False):
 
         Args:
           data_loader: PyTorch data loader return input/output pairs
         """
+        batch_keys = [self.input_key, self.target_key]
         metric_acc = MetricAcc(device=self.device)
         batch_metrics = ODict()
         with torch.no_grad():
             if swa_update_bn:
                 log_tag = "train_"
-                self.set_train_mode()
+                self.model.train()
             else:
                 log_tag = "val_"
                 self.model.eval()
 
             for batch, data in enumerate(data_loader):
-
-                assert isinstance(data, (tuple, list))
-                x = data[0]
-                x_target = data[1]
-
-                x = x.to(self.device)
-                x_target = x_target.to(self.device)
-                batch_size = x.shape[0]
-
-                with self.amp_autocast():
-                    output = self.model(x, x_target=x_target, return_x_mean=True)
+                input_data, target = tensors_subset(data, batch_keys, self.device)
+                batch_size = input_data.size(0)
+                with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype):
+                    output = self.model(input_data, x_target=target, return_x_mean=True)
 
                 x_hat = output["x_mean"]
                 for metric in ["elbo", "log_px", "kldiv_z"]:
                     batch_metrics[metric] = output[metric].mean().item()
                 for k, metric in self.metrics.items():
-                    batch_metrics[k] = metric(x_hat, x_target)
+                    batch_metrics[k] = metric(x_hat, target)
 
                 metric_acc.update(batch_metrics, batch_size)
 
         logs = metric_acc.metrics
         logs = ODict((log_tag + k, v) for k, v in logs.items())
         return logs
+
+    @staticmethod
+    def add_class_args(parser, prefix=None, train_modes=None, skip=set()):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        TorchTrainer.add_class_args(
+            parser, train_modes, skip=skip.union({"input_key", "target_key"})
+        )
+        if "input_key" not in skip:
+            parser.add_argument(
+                "--input-key", default="x_aug", help="dict. key for nnet input"
+            )
+
+        if "target_key" not in skip:
+            parser.add_argument(
+                "--target-key", default="x", help="dict. key for nnet targets"
+            )
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
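All of these refactored epoch loops replace positional tuple unpacking with named lookups: `tensors_subset(data, keys, device)` takes the batch dict, a key list, and a device, and returns the tensors in key order already moved to that device. A behavioral sketch of what the helper does (not the actual hyperion implementation):

```python
import torch

def tensors_subset(data, keys, device):
    """Pick data[k] for each key and move it to the target device."""
    return [data[k].to(device) for k in keys]

# e.g. the DVAE trainer pulls the noisy input and the clean target by name
batch = {"x": torch.randn(8, 80, 200), "x_aug": torch.randn(8, 80, 200)}
input_data, target = tensors_subset(batch, ["x_aug", "x"], torch.device("cpu"))
```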
diff --git a/hyperion/torch/trainers/plda_trainer.py b/hyperion/torch/trainers/plda_trainer.py
index 4365ed56..cd0b17e8 100644
--- a/hyperion/torch/trainers/plda_trainer.py
+++ b/hyperion/torch/trainers/plda_trainer.py
@@ -2,18 +2,20 @@
 Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import os
-from collections import OrderedDict as ODict
 import logging
+import os
+from collections import OrderedDict as ODict
 
 import torch
+import torch.cuda.amp as amp
 import torch.nn as nn
 
-from ..utils import MetricAcc
-from ..utils.misc import get_selfsim_tarnon
+from ...utils.misc import filter_func_args
 from ..losses import BCEWithLLR
-from .torch_trainer import TorchTrainer
+from ..utils import MetricAcc, tensors_subset
+from ..utils.misc import get_selfsim_tarnon
+from .torch_trainer import AMPDType, TorchTrainer
 
 
 class PLDATrainer(TorchTrainer):
@@ -35,9 +37,9 @@ class PLDATrainer(TorchTrainer):
       ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp)
       loss: if None, it uses cross-entropy
       loss_weights: dictionary with weights for multiclass and binary cross-entropies
-      train_mode: training mode in ['train', 'ft-full', 'ft-last-layer']
       use_amp: uses mixed precision training.
+      amp_dtype: "float16" | "bfloat16"
       log_interval: number of optim. steps between log outputs
       use_tensorboard: use tensorboard logger
       use_wandb: use wandb logger
@@ -47,7 +49,10 @@ class PLDATrainer(TorchTrainer):
       swa_start: epoch to start doing swa
       swa_lr: SWA learning rate
       swa_anneal_epochs: SWA learning rate anneal epochs
+      save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch
       cpu_offload: CPU offload of gradients when using fully sharded ddp
+      input_key: dict. key for nnet input.
+      target_key: dict. key for nnet targets.
""" def __init__( @@ -58,9 +63,11 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", @@ -69,7 +76,8 @@ def __init__( p_tar=0.5, train_mode="train", use_amp=False, - log_interval=10, + amp_dtype=AMPDType.FLOAT16, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -78,38 +86,16 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, + input_key="x", + target_key="class_id", ): - if loss is None: loss = nn.CrossEntropyLoss() - super().__init__( - model, - loss, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) self.loss_bce = BCEWithLLR(p_tar) self.loss_weights = loss_weights @@ -120,7 +106,7 @@ def train_epoch(self, data_loader): Args: data_loader: pytorch data loader returning features and class labels. """ - + batch_keys = [self.input_key, self.target_key] self.model.update_margin(self.cur_epoch) return_multi = self.loss_weights["multi"] > 0 @@ -129,21 +115,22 @@ def train_epoch(self, data_loader): metric_acc = MetricAcc() batch_metrics = ODict() - self.set_train_mode() - for batch, (data, target) in enumerate(data_loader): + self.model.train() + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) if return_bin: target_bin, mask_bin = get_selfsim_tarnon(target, return_mask=True) - with self.amp_autocast(): + + with amp.autocast(enabled=self.use_amp): output = self.model( - data, + input_data, target, return_multi=return_multi, return_bin=return_bin, @@ -167,9 +154,9 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * self.grad_acc_steps if return_bin: @@ -181,11 +168,13 @@ def train_epoch(self, data_loader): metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -194,7 +183,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): Args: data_loader: PyTorch data loader return input/output pairs """ - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc() batch_metrics = ODict() return_multi = self.loss_weights["multi"] > 0 @@ -203,20 +192,21 @@ def 
validation_epoch(self, data_loader, swa_update_bn=False): with torch.no_grad(): if swa_update_bn: log_tag = "" - self.set_train_mode() + self.model.train() else: log_tag = "val_" self.model.eval() - for batch, (data, target) in enumerate(data_loader): - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + for batch, data in enumerate(data_loader): + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) if return_bin: target_bin, mask_bin = get_selfsim_tarnon(target, return_mask=True) - with self.amp_autocast(): + + with amp.autocast(enabled=self.use_amp): output = self.model( - data, return_multi=return_multi, return_bin=return_bin + input_data, return_multi=return_multi, return_bin=return_bin ) loss = 0 if return_multi: diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 72f6d164..4d8adcf4 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -3,27 +3,39 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os import contextlib +import glob +import logging +import math +import os +import re from collections import OrderedDict as ODict from enum import Enum -from jsonargparse import ArgumentParser, ActionParser -import logging from pathlib import Path +from typing import Any, Dict, Optional import torch -import torch.nn as nn import torch.cuda.amp as amp -from torch.optim.swa_utils import AveragedModel, SWALR import torch.distributed as dist - +import torch.nn as nn from fairscale.optim.grad_scaler import ShardedGradScaler +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from torch.optim.swa_utils import SWALR, AveragedModel -from ..utils import MetricAcc, TorchDDP, FairShardedDDP, FairFullyShardedDDP -from ..loggers import LoggerList, CSVLogger, ProgLogger, TensorBoardLogger, WAndBLogger -from ..optim import OptimizerFactory as OF -from ..lr_schedulers import LRSchedulerFactory as LRSF +from ...utils.misc import filter_func_args +from ..loggers import CSVLogger, LoggerList, ProgLogger, TensorBoardLogger, WAndBLogger from ..lr_schedulers import LRScheduler as LRS +from ..lr_schedulers import LRSchedulerFactory as LRSF +from ..optim import OptimizerFactory as OF +from ..utils import ( + FairFullyShardedDDP, + FairShardedDDP, + MetricAcc, + TorchDDP, + tensors_subset, +) +from ..wd_schedulers import WDScheduler as WDS +from ..wd_schedulers import WDSchedulerFactory as WDSF class DDPType(str, Enum): @@ -32,6 +44,23 @@ class DDPType(str, Enum): OSS_SHARDED_DDP = "oss_sharded_ddp" FULLY_SHARDED_DDP = "fully_sharded_ddp" + @staticmethod + def choices(): + return [o.value for o in DDPType] + + +class AMPDType(str, Enum): + FLOAT16 = "float16" + BFLOAT16 = "bfloat16" + + @staticmethod + def choices(): + return [o.value for o in AMPDType] + + @staticmethod + def to_dtype(dtype): + return torch.float16 if dtype == AMPDType.FLOAT16 else torch.bfloat16 + ddp_choices = [o.value for o in DDPType] @@ -53,8 +82,9 @@ class TorchTrainer(object): loggers: LoggerList object, loggers write training progress to std. output and file. ddp: if True use distributed data parallel training ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) - train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + train_mode: training mode in ['full', 'frozen'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. 
steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -64,7 +94,10 @@ class TorchTrainer(object): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. """ def __init__( @@ -76,15 +109,18 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", - train_mode="train", + train_mode="full", use_amp=False, - log_interval=10, + amp_dtype=AMPDType.FLOAT16, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -93,16 +129,22 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, + input_key="x", + target_key="class_id", ): - self.model = model - # self.optimizer = optim self.loss = loss self.epochs = epochs self.cur_epoch = cur_epoch + self.cur_batch = 0 self.grad_acc_steps = grad_acc_steps + self.eff_batch_size = eff_batch_size self.exp_path = Path(exp_path) + self.optim = optim + self.lrsched = lrsched + self.wdsched = wdsched if loggers is None: self.loggers = self._default_loggers( @@ -113,96 +155,173 @@ def __init__( else: self.loggers = loggers - # self.lr_scheduler = lr_scheduler - self.metrics = metrics self.device = device self.train_mode = train_mode self.use_amp = use_amp + self.amp_dtype = AMPDType.to_dtype(amp_dtype) self.grad_clip = grad_clip self.grad_clip_norm = grad_clip_norm self.swa_start = swa_start self.do_swa = swa_start > 0 self.swa_lr = swa_lr self.swa_anneal_epochs = swa_anneal_epochs - self.amp_args = {} - - if device is not None: - self.model.to(device) - if loss is not None: - self.loss.to(device) - + self.input_key = input_key + self.target_key = target_key self.ddp = ddp self.ddp_type = ddp_type + self.cpu_offload = cpu_offload self.rank = 0 self.world_size = 1 + self.in_swa = False + self.global_step = 0 + self.save_interval_steps = save_interval_steps if ddp: self.rank = dist.get_rank() self.world_size = dist.get_world_size() + + self.set_train_mode() + self.prepare_models_for_training() + + def prepare_models_for_training(self): + self.loss = self._prepare_loss_for_training(self.loss, self.device) + ( + self.model, + self.optimizer, + self.lr_scheduler, + self.wd_scheduler, + self.grad_scaler, + self.swa_model, + self.swa_scheduler, + ) = self._prepare_model_for_training( + self.model, + self.optim, + self.lrsched, + self.wdsched, + self.device, + self.use_amp, + self.ddp, + self.ddp_type, + self.cpu_offload, + self.do_swa, + self.swa_lr, + self.swa_anneal_epochs, + ) + + def _prepare_loss_for_training(self, loss, device): + if loss is not None: + loss.to(device) + + return loss + + def _prepare_model_for_training( + self, + model, + optim, + lrsched, + wdsched, + device, + use_amp, + ddp, + ddp_type, + cpu_offload, + do_swa, + swa_lr, + swa_anneal_epochs, + ): + if device is not None: + model.to(device) + + if ddp: if ddp_type == DDPType.DDP or ddp_type == DDPType.OSS_DDP: - self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) if self.rank == 0: logging.info( "training in multiple gpus with distributed-data-parallel" ) oss = False if ddp_type == 
DDPType.DDP else True - self.optimizer = self._make_optimizer(optim, self.model, oss=oss) - self.model = TorchDDP( - self.model, device_ids=[device], output_device=device + optimizer = self._make_optimizer(optim, model, oss=oss) + model = TorchDDP( + model, + device_ids=[device], + output_device=device, ) elif ddp_type == DDPType.OSS_SHARDED_DDP: - self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) if self.rank == 0: logging.info( "training in multiple gpus with fair sharded-distributed-data-parallel" ) - self.optimizer = self._make_optimizer(optim, self.model, oss=True) - self.model = FairShardedDDP(self.model, self.optimizer) + optimizer = self._make_optimizer(optim, model, oss=True) + model = FairShardedDDP(model, optimizer) else: if self.rank == 0: logging.info( "training in multiple gpus with fair fully-sharded-distributed-data-parallel" ) # syncbathcnorm is not supported here, it raises exception - self.model = FairFullyShardedDDP( - self.model, - mixed_precision=self.use_amp, + model = FairFullyShardedDDP( + model, + mixed_precision=use_amp, move_params_to_cpu=cpu_offload, ) - self.optimizer = self._make_optimizer(optim, self.model, oss=False) + optimizer = self._make_optimizer(optim, model, oss=False) else: - self.optimizer = self._make_optimizer(optim, self.model) + optimizer = self._make_optimizer(optim, model) # make the learning rate scheduler - self.lr_scheduler = self._make_lr_sched(lrsched, self.optimizer) + lr_scheduler = self._make_lr_sched(lrsched, optimizer) - if self.use_amp: + # make weight decay scheduler if needed + wd_scheduler = self._make_wd_sched(wdsched, optimizer) + + grad_scaler = None + if use_amp: if ddp and ddp_type != DDPType.DDP: if self.rank == 0: logging.info( "using automatic mixed precision training with sharded-grad-scaler" ) - self.grad_scaler = ShardedGradScaler() + grad_scaler = ShardedGradScaler() else: if self.rank == 0: logging.info( "using automatic mixed precision training with grad-scaler" ) - self.grad_scaler = amp.GradScaler() - self.amp_autocast = amp.autocast - else: - self.amp_autocast = contextlib.nullcontext + grad_scaler = amp.GradScaler() - self.in_swa = False - if self.do_swa: + swa_model = None + swa_scheduler = None + if do_swa: if self.rank == 0: logging.info("init SWA model") - self.swa_model = AveragedModel(self.model) - self.swa_scheduler = SWALR( - self.optimizer, swa_lr=self.swa_lr, anneal_epochs=self.swa_anneal_epochs + swa_model = AveragedModel(model) + swa_scheduler = SWALR( + optimizer, swa_lr=swa_lr, anneal_epochs=swa_anneal_epochs ) + return ( + model, + optimizer, + lr_scheduler, + wd_scheduler, + grad_scaler, + swa_model, + swa_scheduler, + ) + + def set_epoch(self, data_loader, cur_batch: int = 0): + try: + data_loader.dataset.set_epoch(self.cur_epoch) + except AttributeError: + logging.warning("dataset doesn't have set_epoch member function") + + try: + data_loader.batch_sampler.set_epoch(self.cur_epoch, cur_batch) + except AttributeError: + logging.warning("sampler doesn't have set_epoch member function") + def fit(self, train_data, val_data=None): """Training function, it performs the training and validation epochs @@ -211,8 +330,7 @@ def fit(self, train_data, val_data=None): val_data: PyTorch data loader for the validation loop """ self.exp_path.mkdir(parents=True, exist_ok=True) - # if not os.path.exists(self.exp_path): - # os.makedirs(self.exp_path) + self._compute_grad_acc_steps(train_data) if self.do_swa and self.cur_epoch >= 
self.swa_start: self.in_swa = True @@ -220,15 +338,20 @@ def fit(self, train_data, val_data=None): val_logs = {} self.loggers.on_train_begin(epochs=self.epochs) for epoch in range(self.cur_epoch, self.epochs): - + self.set_epoch(train_data, self.cur_batch) self.loggers.on_epoch_begin(epoch, batches=len(train_data)) if self.lr_scheduler is not None: # this is needed by cosine scheduler epoch_updates = int(len(train_data) / self.grad_acc_steps) self.lr_scheduler.on_epoch_begin(epoch, epoch_updates=epoch_updates) + if self.wd_scheduler is not None: + self.wd_scheduler.on_epoch_begin(epoch) + logs = self.train_epoch(train_data) + self.cur_batch = 0 if val_data is not None: + self.set_epoch(val_data) val_logs = self.validation_epoch(val_data) logs.update(val_logs) @@ -242,6 +365,8 @@ def fit(self, train_data, val_data=None): else: if self.lr_scheduler is not None: self.lr_scheduler.on_epoch_end(logs) + if self.wd_scheduler is not None: + self.wd_scheduler.on_epoch_end() self.save_checkpoint(logs) @@ -259,10 +384,10 @@ def fit(self, train_data, val_data=None): self.save_swa_model(logs) def set_train_mode(self): - if self.train_mode == "train": - self.model.train() - else: - self.model.train_mode(self.train_mode) + self.model.set_train_mode(self.train_mode) + if self.rank == 0: + self.model.parameter_summary(verbose=True) + self.model.print_parameter_list() def train_epoch(self, data_loader): """Training epoch loop @@ -270,19 +395,21 @@ def train_epoch(self, data_loader): Args: data_loader: PyTorch data loader return input/output pairs """ + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() - self.set_train_mode() - for batch, (data, target) in enumerate(data_loader): + self.model.train() + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] - with self.amp_autocast(): - output = self.model(data) - loss = self.loss(output, target).mean() / self.grad_acc_steps + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) + + with amp.autocast(enabled=self.use_amp): + output = self.model(input_data) + loss = self.loss(output, target) / self.grad_acc_steps if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -290,49 +417,50 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) - self._reduce_metric(loss) batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) - # total_batches += 1 logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) + logs.update(self._get_wds()) return logs def validation_epoch(self, data_loader, swa_update_bn=False): """Validation epoch loop Args: - data_loader: PyTorch data loader return input/output pairs + data_loader: PyTorch data loader return input/output pairs. 
+ swa_update_bn: whether or not to update batch-norm layers in SWA. """ - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(self.device) batch_metrics = ODict() with torch.no_grad(): if swa_update_bn: log_tag = "train_" - self.set_train_mode() + self.model.train() else: log_tag = "val_" self.model.eval() - for batch, (data, target) in enumerate(data_loader): - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] - - with self.amp_autocast(): - output = self.model(data, **self.amp_args) + for batch, data in enumerate(data_loader): + x, target = tensors_subset(data, batch_keys, self.device) + batch_size = x.size(0) + with amp.autocast(enabled=self.use_amp): + output = self.model(x) loss = self.loss(output, target) batch_metrics["loss"] = loss.mean().item() @@ -347,7 +475,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): def bn_update_epoch(self, data_loader): logs = self.validation_epoch(data_loader, swa_update_bn=True) - logs["lr"] = self._get_lr() + logs.update(self._get_lrs()) return logs def _clip_grad_norm(self, model, optim, grad_clip, grad_clip_norm): @@ -371,26 +499,57 @@ def _clip_grad_norm(self, model, optim, grad_clip, grad_clip_norm): model.parameters(), grad_clip, norm_type=grad_clip_norm ) - def update_model(self): + def _update_model_by_optim( + self, model, optimizer, grad_clip, grad_clip_norm, use_amp, grad_scaler + ): + """Updates the model and does gradient clipping.""" + if use_amp: + if grad_clip > 0: + grad_scaler.unscale_(optimizer) + self._clip_grad_norm(model, optimizer, grad_clip, grad_clip_norm) + + grad_scaler.step(optimizer) + grad_scaler.update() + else: + if grad_clip > 0: + self._clip_grad_norm(model, optimizer, grad_clip, grad_clip_norm) - if self.use_amp: - if self.grad_clip > 0: - self.grad_scaler.unscale_(self.optimizer) - self._clip_grad_norm( - self.model, self.optimizer, self.grad_clip, self.grad_clip_norm - ) + optimizer.step() - self.grad_scaler.step(self.optimizer) - self.grad_scaler.update() - else: - if self.grad_clip > 0: - self._clip_grad_norm( - self.model, self.optimizer, self.grad_clip, self.grad_clip_norm - ) + def update_model(self): + """Updates the model and does gradient clipping.""" + if self.lr_scheduler is not None and not self.in_swa: + self.lr_scheduler.on_opt_step() + + self._update_model_by_optim( + self.model, + self.optimizer, + self.grad_clip, + self.grad_clip_norm, + self.use_amp, + self.grad_scaler, + ) + self.global_step += 1 - self.optimizer.step() def _make_optimizer(self, optim, model, oss=False): + """Makes an optimizer object.""" if isinstance(optim, torch.optim.Optimizer): return optim @@ -399,20 +558,34 @@ def _make_optimizer(self, optim, model, oss=False): opt_args["oss"] = oss if self.rank == 0: logging.info("optimizer args={}".format(opt_args)) - optimizer = OF.create(model.parameters(), **opt_args) + + optimizer = OF.create(model.trainable_param_groups(), **opt_args) return optimizer def _make_lr_sched(self, lr_sched, optim): + """Makes a learning rate scheduler object.""" if lr_sched is None or isinstance(lr_sched, LRS): return lr_sched assert isinstance(lr_sched, dict) args = LRSF.filter_args(**lr_sched) if self.rank == 0: - logging.info("lr scheduler args={}".format(args)) + logging.info(f"lr scheduler args={args}") lr_sched = LRSF.create(optim, **args) return lr_sched + def _make_wd_sched(self, wd_sched, optim): + """Makes a weight decay scheduler object.""" + if wd_sched is None or isinstance(wd_sched, WDS): + return wd_sched + + assert isinstance(wd_sched, dict) + args = WDSF.filter_args(**wd_sched) + if self.rank == 0: + logging.info(f"wd scheduler args={args}") + wd_sched = WDSF.create(optim, **args) + return wd_sched + def _default_loggers(self, log_interval, use_tensorboard, use_wandb, wandb): """Creates the default loggers.""" prog_log = ProgLogger(interval=log_interval) @@ -432,8 +605,74 @@ def _default_loggers(self, log_interval, use_tensorboard, use_wandb, wandb): def _get_lr(self): """Returns the current learning rate to show in the loggers""" - for param_group in self.optimizer.param_groups: - return param_group["lr"] + lrs = [param_group["lr"] for param_group in self.optimizer.param_groups] + return max(lrs) + + def _get_lrs(self): + """Returns the current learning rates of all param groups to show in the loggers""" + lrs = { + f"lr_{i}": param_group["lr"] + for i, param_group in enumerate(self.optimizer.param_groups) + } + if len(lrs) == 1: + lrs["lr"] = lrs.pop("lr_0") + + return lrs + + def _get_wd(self): + """Returns the current weight decay to show in the loggers""" + wds = [ + param_group["weight_decay"] for param_group in self.optimizer.param_groups + ] + return max(wds) + + def _get_wds(self, if_scheduler=True): + """Returns the current weight decays of all param groups to show in the loggers""" + if if_scheduler and self.wd_scheduler is None: + return {} + + wds = { + f"wd_{i}": param_group["weight_decay"] + for i, param_group in enumerate(self.optimizer.param_groups) + } + if len(wds) == 1: + wds["wd"] = wds.pop("wd_0") + + return wds + + def _compute_grad_acc_steps(self, data_loader): + if self.eff_batch_size is None: + return + + if data_loader.batch_sampler is not None: + try: + batch_size = data_loader.batch_sampler.avg_batch_size + except AttributeError: + logging.warning( + "batch sampler doesn't have avg_batch_size property, " + "we cannot estimate grad_acc_steps, using grad_acc_steps=%d", + self.grad_acc_steps, + ) + return + + self.grad_acc_steps = int( + math.ceil(self.eff_batch_size / batch_size / self.world_size) + ) + logging.info( + "Setting grad_acc_steps=%d for " + "eff_batch_size=%d, avg_batch_size=%d, world_size=%d", + self.grad_acc_steps, + self.eff_batch_size, + batch_size, + self.world_size, + ) + return + + logging.warning( + "We cannot determine the batch_size, " + "we cannot estimate grad_acc_steps, using grad_acc_steps=%d", + self.grad_acc_steps, + ) def checkpoint(self, logs=None): """Creates a checkpoint of the training, for saving and later recovery. Args: logs: logs containing the current value of the metrics.
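A worked example of the `_compute_grad_acc_steps` estimate above, applying the same formula to assumed numbers:

```python
import math

# eff_batch_size=1024 with an average sampler batch of 64 on 4 ranks:
# each rank accumulates ceil(1024 / 64 / 4) = 4 batches per optimizer step.
eff_batch_size, avg_batch_size, world_size = 1024, 64, 4
grad_acc_steps = int(math.ceil(eff_batch_size / avg_batch_size / world_size))
assert grad_acc_steps == 4
```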
""" + self.model.train() checkpoint = { "epoch": self.cur_epoch, + "batch": self.cur_batch, + "global_step": self.global_step, "rng_state": torch.get_rng_state(), "model_cfg": self.model.get_config(), "model_state_dict": self.model.state_dict(), "optimizer_state_dict": self.optimizer.state_dict(), - "loss_state_dict": self.loss.state_dict() - if self.loss is not None - else None, + "loss_state_dict": ( + self.loss.state_dict() if self.loss is not None else None + ), } if self.lr_scheduler is not None: checkpoint["lr_scheduler_state_dict"] = self.lr_scheduler.state_dict() + if self.wd_scheduler is not None: + checkpoint["wd_scheduler_state_dict"] = self.wd_scheduler.state_dict() + if logs is not None: checkpoint["logs"] = logs @@ -463,12 +708,67 @@ def checkpoint(self, logs=None): return checkpoint - def save_checkpoint(self, logs=None): + def save_partial_checkpoint(self): + return ( + self.save_interval_steps is not None + and self.global_step % self.save_interval_steps == 0 + ) + + def save_checkpoint(self, logs=None, partial: bool = False): + """Saves a checkpoint of the training status + + Args: + logs: logs containing the current value of the metrics. + partial: if True, it is saving in the middle of the epoch + """ + if partial and not self.save_partial_checkpoint(): + return + + if self.ddp and ( + self.ddp_type == DDPType.OSS_DDP or self.ddp_type == DDPType.OSS_SHARDED_DDP + ): + # Not sure what this does, just copying from the example in + # https://github.com/facebookresearch/fairscale/blob/master/benchmarks/oss.py + # Check the checkpointing in the case of the OSS optimizer + # Memory usage could spill over from there + # optimizer = cast(OSS, optimizer) + self.optimizer.consolidate_state_dict() + + if self.rank != 0: + return + + checkpoint = self.checkpoint(logs) + self.save_model_checkpoint("model", checkpoint, partial=partial) + + def save_model_checkpoint( + self, model_name: str, checkpoint: Dict[str, Any], partial: bool = False + ): + if partial: + file_path = "%s/%s_ep%04d_step%010d.pth" % ( + self.exp_path, + model_name, + self.cur_epoch, + self.global_step, + ) + else: + file_path = "%s/%s_ep%04d.pth" % (self.exp_path, model_name, self.cur_epoch) + + logging.info("saving %s to %s", model_name, file_path) + torch.save(checkpoint, file_path) + + def old_save_checkpoint(self, logs=None, partial: bool = False): """Saves a checkpoint of the training status Args: logs: logs containing the current value of the metrics. + partial: if True, it is saving in the middle of the epoch """ + if partial and ( + self.save_interval_steps is None + or self.global_step % self.save_interval_steps != 0 + ): + return + if self.ddp and ( self.ddp_type == DDPType.OSS_DDP or self.ddp_type == DDPType.OSS_SHARDED_DDP ): @@ -481,8 +781,16 @@ def save_checkpoint(self, logs=None): if self.rank != 0: return + checkpoint = self.checkpoint(logs) - file_path = "%s/model_ep%04d.pth" % (self.exp_path, self.cur_epoch) + if partial: + file_path = "%s/model_ep%04d_step%010d.pth" % ( + self.exp_path, + self.cur_epoch, + self.global_step, + ) + else: + file_path = "%s/model_ep%04d.pth" % (self.exp_path, self.cur_epoch) torch.save(checkpoint, file_path) @@ -502,13 +810,7 @@ def save_swa_model(self, logs=None): torch.save(checkpoint, file_path) - def load_checkpoint(self, file_path): - """Loads a training checkpoint from file. 
- - Args: - file_path: checkpoint file path - """ - checkpoint = torch.load(file_path, map_location=torch.device("cpu")) + def _load_checkpoint(self, checkpoint): rng_state = checkpoint["rng_state"] torch.set_rng_state(rng_state) if self.rank > 0: @@ -518,6 +820,11 @@ def load_checkpoint(self, file_path): del dummy self.cur_epoch = checkpoint["epoch"] + if "batch" in checkpoint: + self.cur_batch = checkpoint["batch"] + else: + self.cur_batch = 0 + try: self.model.load_state_dict(checkpoint["model_state_dict"]) except: @@ -527,6 +834,14 @@ def load_checkpoint(self, file_path): self.loss.load_state_dict(checkpoint["loss_state_dict"]) if self.lr_scheduler is not None: self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"]) + if self.wd_scheduler is not None: + self.wd_scheduler.load_state_dict(checkpoint["wd_scheduler_state_dict"]) + + if "global_step" in checkpoint: + self.global_step = checkpoint["global_step"] + elif self.lr_scheduler is not None: + # this is for older models that didn't save the global step + self.global_step = self.lr_scheduler.step # if self.use_amp: # amp.load_state_dict(checkpoint['amp']) @@ -548,46 +863,114 @@ def load_checkpoint(self, file_path): logs = checkpoint["logs"] del checkpoint - if self.device is not None: - torch.cuda.empty_cache() + # this was added before to try to release as much GPU memory as possible, + # but recently it has started to cause CUDA device-not-available errors, + # so it is commented out for now. + # if self.device is not None: + # torch.cuda.empty_cache() return logs + def find_last_checkpoint(self, model_name="model"): + """Finds the last checkpoint epoch and step in the experiment dir.""" + last_epoch = 0 + last_step = 0 + file_pattern = "%s/%s_ep[0-9]*.pth" % (self.exp_path, model_name) + file_paths = sorted(glob.glob(file_pattern)) + if len(file_paths) > 0: + last_epoch = int(re.search(r"ep[0-9]*", file_paths[-1]).group()[2:]) + + file_pattern = "%s/%s_ep%04d_step[0-9]*.pth" % ( + self.exp_path, + model_name, + last_epoch, + ) + file_paths = sorted(glob.glob(file_pattern)) + if len(file_paths) > 0: + last_step = int(re.search(r"step[0-9]*", file_paths[-1]).group()[4:]) + + return last_epoch, last_step + def load_last_checkpoint(self): + """Loads the last training checkpoint in the experiment dir.""" + last_epoch, last_step = self.find_last_checkpoint() + if last_epoch > 0 or last_step > 0: + return self.load_checkpoint(last_epoch, last_step) + + return None + + def load_model_checkpoint(self, model_name="model", epoch=0, step=0): + if step == 0: + file_path = "%s/%s_ep%04d.pth" % (self.exp_path, model_name, epoch) + else: + file_path = "%s/%s_ep%04d_step%010d.pth" % ( + self.exp_path, + model_name, + epoch, + step, + ) + logging.info("loading %s from %s", model_name, file_path) + return torch.load(file_path, map_location=torch.device("cpu")) + + def load_checkpoint(self, epoch, step): + checkpoint = self.load_model_checkpoint("model", epoch, step) + return self._load_checkpoint(checkpoint) + + def old_load_checkpoint(self, file_path): + """Loads a training checkpoint from file.
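`find_last_checkpoint` recovers epoch and step back out of those file names with two small regexes; a standalone check on an assumed path:

```python
import re

name = "exp/run1/model_ep0007_step0000012000.pth"
last_epoch = int(re.search(r"ep[0-9]*", name).group()[2:])   # "ep0007" -> 7
last_step = int(re.search(r"step[0-9]*", name).group()[4:])  # "step0000012000" -> 12000
assert (last_epoch, last_step) == (7, 12000)
```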
+ + Args: + file_path: checkpoint file path + """ + checkpoint = torch.load(file_path, map_location=torch.device("cpu")) + return self._load_checkpoint(checkpoint) + + def old_load_last_checkpoint(self): """Loads the last training checkpoint in the experiment dir.""" for epoch in range(self.epochs, 0, -1): - file_path = "%s/model_ep%04d.pth" % (self.exp_path, epoch) - if os.path.isfile(file_path): + file_path = Path("%s/model_ep%04d.pth" % (self.exp_path, epoch)) + if file_path.is_file(): + steps_pattern = "%s/model_ep%04d_steps*.pth" % (self.exp_path, epoch) + steps_file_paths = sorted(glob.glob(steps_pattern)) + if len(steps_file_paths) > 0: + file_path = steps_file_paths[-1] + return self.load_checkpoint(file_path) return None @staticmethod - def filter_args(**kwargs): - valid_args = ( - "grad_acc_steps", - "epochs", - "log_interval", - "use_amp", - "ddp_type", - "grad_clip", - "swa_start", - "swa_lr", - "swa_anneal_epochs", - "exp_path", - "optim", - "lrsched", - "cpu_offload", - "use_tensorboard", - "use_wandb", - "wandb", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + def get_augs_keys(batch, base_key, skip=set()): + keys = [] + if base_key in batch and base_key not in skip: + keys.append(base_key) + + aug_idx_1 = 0 + while True: + aug_idx_2 = 0 + while True: + aug_key = f"{base_key}_aug_{aug_idx_1}_{aug_idx_2}" + if aug_key in batch: + if aug_key not in skip: + keys.append(aug_key) + aug_idx_2 += 1 + else: + break + + if aug_idx_2 == 0: + break + + aug_idx_1 += 1 + + return keys + @staticmethod + def filter_args(**kwargs): + args = filter_func_args(TorchTrainer.__init__, kwargs) return args @staticmethod - def add_class_args(parser, prefix=None, skip=[]): + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") @@ -598,27 +981,49 @@ def add_class_args(parser, prefix=None, skip=[]): if "lrsched" not in skip: LRSF.add_class_args(parser, prefix="lrsched") + if "wdsched" not in skip: + WDSF.add_class_args(parser, prefix="wdsched") + parser.add_argument( "--grad-acc-steps", type=int, default=1, - help="gradient accumulation batches before weigth update", + help="gradient accumulation batches before weight update", + ) + parser.add_argument( + "--eff-batch-size", + type=int, + default=None, + help="effective total batch size, if given, it overrides grad_acc_steps", ) parser.add_argument("--epochs", type=int, default=200, help="number of epochs") + if train_modes is not None: + parser.add_argument( + "--train-mode", + default="full", + choices=train_modes, + help=f"Available train modes for the model in {train_modes}", + ) parser.add_argument( "--log-interval", type=int, - default=10, + default=1000, help="how many batches to wait before logging training status", ) + parser.add_argument( + "--save-interval-steps", + default=None, + type=int, + help="number of steps between model saves, if None only saves at the end of the epoch", + ) parser.add_argument( "--use-tensorboard", - action="store_true", + action=ActionYesNo, default=False, help="use tensorboard logger", ) parser.add_argument( - "--use-wandb", action="store_true", default=False, help="use wandb logger" + "--use-wandb", action=ActionYesNo, default=False, help="use wandb logger" ) parser.add_argument("--wandb.project", default=None, help="wandb project name") parser.add_argument("--wandb.group", default=None, help="wandb group name") @@ -636,18 +1041,21 @@ def add_class_args(parser, prefix=None, skip=[]): 
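The `prefix`/`ActionParser` dance used by all of these `add_class_args` methods is standard jsonargparse composition; a minimal sketch (hypothetical `--trainer` prefix, assuming jsonargparse's documented nesting behavior):

```python
from jsonargparse import ActionParser, ArgumentParser

# Build the class options on an inner parser, then mount it on the outer
# parser under a prefix, so they become --trainer.epochs, etc.
inner = ArgumentParser(prog="")
inner.add_argument("--epochs", type=int, default=200)
outer = ArgumentParser()
outer.add_argument("--trainer", action=ActionParser(parser=inner))
args = outer.parse_args(["--trainer.epochs", "10"])
assert args.trainer.epochs == 10
```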
parser.add_argument( "--ddp-type", default="ddp", - choices=ddp_choices, + choices=DDPType.choices(), help="DDP type in {}".format(ddp_choices), ) parser.add_argument( "--use-amp", - action="store_true", + action=ActionYesNo, default=False, help="use mixed precision training", ) + parser.add_argument( + "--amp-dtype", default=AMPDType.FLOAT16.value, choices=AMPDType.choices() + ) parser.add_argument( "--cpu-offload", - action="store_true", + action=ActionYesNo, default=False, help="CPU offload of gradients when using fully_sharded_ddp", ) @@ -677,9 +1085,16 @@ def add_class_args(parser, prefix=None, skip=[]): ) parser.add_argument("--exp-path", help="experiment path") + if "input_key" not in skip: + parser.add_argument( + "--input-key", default="x", help="dict. key for nnet input" + ) + if "target_key" not in skip: + parser.add_argument( + "--target-key", default="class_id", help="dict. key for nnet targets" + ) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='trainer options') add_argparse_args = add_class_args diff --git a/hyperion/torch/trainers/transducer_trainer.py b/hyperion/torch/trainers/transducer_trainer.py new file mode 100644 index 00000000..c9cbb60b --- /dev/null +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -0,0 +1,217 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +from collections import OrderedDict as ODict + +import torch +import torch.cuda.amp as amp +import torch.nn as nn +import torchaudio +from jsonargparse import ActionParser, ArgumentParser +from torch.distributed.elastic.multiprocessing.errors import record + +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset +from .torch_trainer import AMPDType, TorchTrainer + + +class TransducerTrainer(TorchTrainer): + """Trainer to train ASR style models. + + Attributes: + model: ASR model object. + optim: pytorch optimizer object or options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object or options dict + loggers: LoggerList object, loggers write training progress to std. output and file. + If None, it uses default loggers. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + loss: if None, it uses cross-entropy + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" + log_interval: number of optim. 
steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch + cpu_offload: CPU offload of gradients when using fully sharded ddp + """ + + def __init__( + self, + model, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + eff_batch_size=None, + device=None, + metrics=None, + lrsched=None, + wdsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + loss=None, + train_mode="full", + use_amp=False, + amp_dtype=AMPDType.FLOAT16, + log_interval=1000, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + save_interval_steps=None, + cpu_offload=False, + input_key="x", + target_key="text", + ): + loss = None # the transducer model computes its own loss + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + @record + def train_epoch(self, data_loader): + """Training epoch loop + + Args: + data_loader: pytorch data loader returning features and text labels. + """ + batch_keys = [self.input_key, f"{self.input_key}_lengths", self.target_key] + metric_acc = MetricAcc(device=self.device) + batch_metrics = ODict() + self.model.train() + + for batch, data in enumerate(data_loader): + self.loggers.on_batch_begin(batch) + + if batch % self.grad_acc_steps == 0: + self.optimizer.zero_grad() + + input_data, input_lengths, target = tensors_subset( + data, batch_keys, self.device + ) + batch_size = input_data.shape[0] + + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): + output = self.model(input_data, x_lengths=input_lengths, y=target) + loss = output.loss + loss = loss.mean() / self.grad_acc_steps + + if self.use_amp: + self.grad_scaler.scale(loss).backward() + else: + loss.backward() + + if (batch + 1) % self.grad_acc_steps == 0: + self.cur_batch = batch + 1 + self.update_model() + self.save_checkpoint(partial=True) + + for k, v in output.items(): + if "loss" in k and v is not None: + batch_metrics[k] = output[k].item() + + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + logs = metric_acc.metrics + lrs = self._get_lrs() + logs.update(lrs) + self.loggers.on_batch_end(logs=logs, batch_size=batch_size) + + logs = metric_acc.metrics + logs = ODict(("train_" + k, v) for k, v in logs.items()) + lrs = self._get_lrs() + logs.update(lrs) + return logs + + def validation_epoch(self, data_loader, swa_update_bn=False): + """Validation epoch loop + + Args: + data_loader: PyTorch data loader returning input/output pairs. + swa_update_bn: whether or not to update batch-norm layers in SWA. + """ + batch_keys = [self.input_key, f"{self.input_key}_lengths", self.target_key] + metric_acc = MetricAcc(self.device) + batch_metrics = ODict() + with torch.no_grad(): + if swa_update_bn: + log_tag = "train_" + self.model.train() + else: + log_tag = "val_" + self.model.eval() + + for batch, data in enumerate(data_loader): + input_data, input_lengths, target = tensors_subset( + data, batch_keys, self.device + ) + batch_size = input_data.shape[0] + + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): + output = self.model(input_data, x_lengths=input_lengths, y=target) + + for k, v in output.items(): + if "loss" in k and v is not None: + batch_metrics[k] = output[k].item() + + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + + logs = metric_acc.metrics + logs = ODict((log_tag + k, v) for k, v in logs.items()) + return logs + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + super_skip = skip.copy() + super_skip.add("target_key") + TorchTrainer.add_class_args(parser, train_modes=train_modes, skip=super_skip) + if "target_key" not in skip: + parser.add_argument( + "--target-key", default="text", help="dict. key for nnet targets" + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/vae_trainer.py b/hyperion/torch/trainers/vae_trainer.py index 53486c7b..27d485ff 100644 --- a/hyperion/torch/trainers/vae_trainer.py +++ b/hyperion/torch/trainers/vae_trainer.py @@ -3,16 +3,18 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os from collections import OrderedDict as ODict -import logging - import torch +import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser -from ..utils import MetricAcc -from .torch_trainer import TorchTrainer +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset +from .torch_trainer import AMPDType, TorchTrainer class VAETrainer(TorchTrainer): @@ -33,6 +35,7 @@ class VAETrainer(TorchTrainer): ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. steps between log outputs use_tensorboard: use tensorboard logger @@ -42,8 +45,10 @@ class VAETrainer(TorchTrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp - + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets.
""" def __init__( @@ -54,15 +59,18 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", - train_mode="train", + train_mode="full", use_amp=False, - log_interval=10, + amp_dtype=AMPDType.FLOAT16, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -71,58 +79,65 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, + input_key="x", + target_key="x", ): - - super().__init__( - model, - None, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + # super().__init__( + # model, + # None, + # optim, + # epochs, + # exp_path, + # cur_epoch=cur_epoch, + # grad_acc_steps=grad_acc_steps, + # eff_batch_size=eff_batch_size, + # device=device, + # metrics=metrics, + # lrsched=lrsched, + # loggers=loggers, + # ddp=ddp, + # ddp_type=ddp_type, + # train_mode=train_mode, + # use_amp=use_amp, + # log_interval=log_interval, + # use_tensorboard=use_tensorboard, + # use_wandb=use_wandb, + # wandb=wandb, + # grad_clip=grad_clip, + # grad_clip_norm=grad_clip_norm, + # swa_start=swa_start, + # swa_lr=swa_lr, + # swa_anneal_epochs=swa_anneal_epochs, + # cpu_offload=cpu_offload, + # ) def train_epoch(self, data_loader): + """Training epoch loop + + Args: + data_loader: pytorch data loader returning noisy and clean features + """ + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() - self.set_train_mode() + self.model.train() for batch, data in enumerate(data_loader): - - if isinstance(data, (tuple, list)): - data, _ = data - self.loggers.on_batch_begin(batch) - if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data = data.to(self.device) - batch_size = data.shape[0] + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) - with self.amp_autocast(): - output = self.model(data, return_x_mean=True) + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): + output = self.model(input_data, x_target=target, return_x_mean=True) elbo = output["elbo"].mean() loss = -elbo / self.grad_acc_steps x_hat = output["x_mean"] @@ -133,57 +148,78 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["elbo"] = elbo.item() for metric in ["log_px", "kldiv_z"]: batch_metrics[metric] = output[metric].mean().item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(x_hat, data) + batch_metrics[k] = metric(x_hat, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + 
logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + logs = ODict(("train_" + k, v) for k, v in logs.items()) + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): + """Validation epoch loop + + Args: + data_loader: PyTorch data loader return input/output pairs + """ + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() with torch.no_grad(): if swa_update_bn: log_tag = "train_" - self.set_train_mode() + self.model.train() else: log_tag = "val_" self.model.eval() for batch, data in enumerate(data_loader): - if isinstance(data, (tuple, list)): - data, _ = data - - data = data.to(self.device) - batch_size = data.shape[0] - - with self.amp_autocast(): - output = self.model(data, return_x_mean=True) + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): + output = self.model(input_data, x_target=target, return_x_mean=True) x_hat = output["x_mean"] for metric in ["elbo", "log_px", "kldiv_z"]: batch_metrics[metric] = output[metric].mean().item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(x_hat, data) + batch_metrics[k] = metric(x_hat, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics logs = ODict((log_tag + k, v) for k, v in logs.items()) return logs + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + TorchTrainer.add_class_args( + parser, train_modes, skip=skip.union({"target_key"}) + ) + if "target_key" not in skip: + parser.add_argument( + "--target-key", default="x", help="dict. key for nnet targets" + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/vq_dvae_trainer.py b/hyperion/torch/trainers/vq_dvae_trainer.py index a2da616c..1488f5e5 100644 --- a/hyperion/torch/trainers/vq_dvae_trainer.py +++ b/hyperion/torch/trainers/vq_dvae_trainer.py @@ -2,17 +2,20 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os -from collections import OrderedDict as ODict import logging import math +import os +from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser -from ..utils import MetricAcc -from .dvae_trainer import DVAETrainer +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset +from .dvae_trainer import AMPDType, DVAETrainer class VQDVAETrainer(DVAETrainer): @@ -33,6 +36,7 @@ class VQDVAETrainer(DVAETrainer): ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. 
steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -42,8 +46,10 @@ class VQDVAETrainer(DVAETrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp - + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. """ def __init__( @@ -54,15 +60,18 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", - train_mode="train", + train_mode="full", use_amp=False, - log_interval=10, + amp_dtype=AMPDType.FLOAT16, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -71,62 +80,31 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, + input_key="x_aug", + target_key="x", ): - - super().__init__( - model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) def train_epoch(self, data_loader): - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() - self.set_train_mode() + self.model.train() for batch, data in enumerate(data_loader): - - assert isinstance(data, (tuple, list)) - x = data[0] - x_target = data[1] - self.loggers.on_batch_begin(batch) - if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - x = x.to(self.device) - x_target = x_target.to(self.device) - batch_size = x.shape[0] - - with self.amp_autocast(): - output = self.model(x, x_target=x_target, return_x_mean=True) - loss = output["loss"] + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): + output = self.model(input_data, x_target=target, return_x_mean=True) + loss = output["loss"] / self.grad_acc_steps x_hat = output["x_mean"] - loss = loss.mean() / self.grad_acc_steps if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -134,9 +112,9 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * self.grad_acc_steps for metric in ["elbo", "log_px", "kldiv_z", "vq_loss"]: @@ -145,42 +123,37 @@ def train_epoch(self, data_loader): output["log_perplexity"].mean().item() ) for k, metric in self.metrics.items(): - batch_metrics[k] = metric(x_hat, x_target) + batch_metrics[k] = metric(x_hat, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) 
self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() with torch.no_grad(): if swa_update_bn: - log_tag = "train" - self.set_train_mode() + log_tag = "train_" + self.model.train() else: log_tag = "val_" self.model.eval() for batch, data in enumerate(data_loader): - - assert isinstance(data, (tuple, list)) - x = data[0] - x_target = data[1] - - x = x.to(self.device) - x_target = x_target.to(self.device) - batch_size = x.shape[0] - - with self.amp_autocast(): - output = self.model(x, x_target=x_target, return_x_mean=True) + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): + output = self.model(input_data, x_target=target, return_x_mean=True) x_hat = output["x_mean"] for metric in ["loss", "elbo", "log_px", "kldiv_z", "vq_loss"]: @@ -190,10 +163,32 @@ def validation_epoch(self, data_loader, swa_update_bn=False): ) for k, metric in self.metrics.items(): - batch_metrics[k] = metric(x_hat, x_target) + batch_metrics[k] = metric(x_hat, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics logs = ODict((log_tag + k, v) for k, v in logs.items()) return logs + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + DVAETrainer.add_class_args( + parser, train_modes, skip=skip.union({"input_key", "target_key"}) + ) + if "input_key" not in skip: + parser.add_argument( + "--input-key", default="x_aug", help="dict. key for nnet input" + ) + + if "target_key" not in skip: + parser.add_argument( + "--target-key", default="x", help="dict. key for nnet targets" + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/vq_vae_trainer.py b/hyperion/torch/trainers/vq_vae_trainer.py index d187af79..2331a2b8 100644 --- a/hyperion/torch/trainers/vq_vae_trainer.py +++ b/hyperion/torch/trainers/vq_vae_trainer.py @@ -2,17 +2,20 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os -from collections import OrderedDict as ODict import logging import math +import os +from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser -from ..utils import MetricAcc -from .vae_trainer import VAETrainer +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset +from .vae_trainer import AMPDType, VAETrainer class VQVAETrainer(VAETrainer): @@ -33,6 +36,7 @@ class VQVAETrainer(VAETrainer): ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. 
steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -42,8 +46,10 @@ class VQVAETrainer(VAETrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp - + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. """ def __init__( @@ -54,15 +60,18 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", - train_mode="train", + train_mode="full", use_amp=False, - log_interval=10, + amp_dtype=AMPDType.FLOAT16, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -71,62 +80,32 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, + input_key="x", + target_key="x", ): - - super().__init__( - model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) def train_epoch(self, data_loader): - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() - self.set_train_mode() + self.model.train() for batch, data in enumerate(data_loader): - - if isinstance(data, (tuple, list)): - x = data[0] - else: - x = data - self.loggers.on_batch_begin(batch) - if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - x = x.to(self.device) - batch_size = x.shape[0] - - with self.amp_autocast(): - output = self.model(x, return_x_mean=True) + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): + output = self.model(input_data, x_target=target, return_x_mean=True) loss = output["loss"] x_hat = output["x_mean"] - loss = loss.mean() / self.grad_acc_steps + loss = loss / self.grad_acc_steps if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -134,9 +113,9 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * self.grad_acc_steps for metric in ["elbo", "log_px", "kldiv_z", "vq_loss"]: @@ -145,42 +124,37 @@ def train_epoch(self, data_loader): output["log_perplexity"].mean().item() ) for k, metric in self.metrics.items(): - batch_metrics[k] = metric(x_hat, x) + batch_metrics[k] = metric(x_hat, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + 
k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() with torch.no_grad(): if swa_update_bn: log_tag = "train_" - self.set_train_mode() + self.model.train() else: log_tag = "val_" self.model.eval() for batch, data in enumerate(data_loader): - - if isinstance(data, (tuple, list)): - x = data[0] - else: - x = data - - x = x.to(self.device) - batch_size = x.shape[0] - - with self.amp_autocast(): - output = self.model(x, return_x_mean=True) + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): + output = self.model(input_data, x_target=target, return_x_mean=True) x_hat = output["x_mean"] for metric in ["loss", "elbo", "log_px", "kldiv_z", "vq_loss"]: @@ -190,10 +164,25 @@ def validation_epoch(self, data_loader, swa_update_bn=False): ) for k, metric in self.metrics.items(): - batch_metrics[k] = metric(x_hat, x) + batch_metrics[k] = metric(x_hat, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics logs = ODict((log_tag + k, v) for k, v in logs.items()) return logs + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + VAETrainer.add_class_args(parser, train_modes, skip=skip.union({"target_key"})) + if "target_key" not in skip: + parser.add_argument( + "--target-key", default="x", help="dict. key for nnet targets" + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py b/hyperion/torch/trainers/xvector_adv_trainer.py index 0784a2ea..12ff506a 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer.py +++ b/hyperion/torch/trainers/xvector_adv_trainer.py @@ -2,16 +2,20 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os -from collections import OrderedDict as ODict -import time import logging +import os +import time +from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser -from ..utils import MetricAcc +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset +from .torch_trainer import AMPDType from .xvector_trainer import XVectorTrainer @@ -37,6 +41,7 @@ class XVectorAdvTrainer(XVectorTrainer): loss: if None, it uses cross-entropy train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. steps between log outputs log_interval: number of optim. steps between log outputs use_tensorboard: use tensorboard logger @@ -46,7 +51,10 @@ class XVectorAdvTrainer(XVectorTrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. 
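A note on the `p_attack` bookkeeping in the adversarial trainer below: attacks are only attempted on the first batch of each gradient-accumulation cycle, so `__init__` multiplies the requested probability by `grad_acc_steps` to preserve the effective rate, and warns when it saturates at `p_attack > 1 / grad_acc_steps`:

```python
# Requested rate 0.2 with grad_acc_steps=4: attempt with p=0.8 once per
# 4-batch cycle, i.e. 0.2 attacked batches per batch on average.
p_attack, grad_acc_steps = 0.2, 4
p_per_attempt = p_attack * grad_acc_steps        # 0.8, checked once per cycle
effective_rate = p_per_attempt / grad_acc_steps  # back to 0.2 per batch
assert effective_rate == p_attack
```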
""" def __init__( @@ -58,18 +66,21 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, p_attack=0.8, p_val_attack=0, device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", loss=None, - train_mode="train", + train_mode="full", use_amp=False, - log_interval=10, + amp_dtype=AMPDType.FLOAT16, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -78,37 +89,13 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, + input_key="x", + target_key="class_id", ): - - super().__init__( - model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - loss=loss, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) - + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) self.attack = attack self.attack.to(device) self.p_attack = p_attack * self.grad_acc_steps @@ -125,42 +112,35 @@ def __init__( % (p_attack, 1.0 / self.grad_acc_steps) ) - # if data_parallel: - # # change model in attack by the data parallel version - # self.attack.model = self.model - # # make loss function in attack data parallel - # self.attack.make_data_parallel() - def train_epoch(self, data_loader): - + batch_keys = [self.input_key, self.target_key] self.model.update_loss_margin(self.cur_epoch) metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() - self.set_train_mode() + self.model.train() - for batch, (data, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) - - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) if batch % self.grad_acc_steps == 0: if torch.rand(1) < self.p_attack: # generate adversarial attacks - logging.info("generating adv attack for batch=%d" % (batch)) + logging.info("generating adv attack for batch=%d", batch) self.model.eval() - data_adv = self.attack.generate(data, target) + data_adv = self.attack.generate(input_data, target) max_delta = torch.max(torch.abs(data_adv - data)).item() - logging.info("adv attack max perturbation=%f" % (max_delta)) - data = data_adv - self.set_train_mode() + logging.info("adv attack max perturbation=%f", max_delta) + input_data = data_adv + self.model.train() self.optimizer.zero_grad() - with self.amp_autocast(): - output = self.model(data, target) - loss = self.loss(output, target).mean() / self.grad_acc_steps + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): + output = self.model(input_data, target) + loss = self.loss(output.logits, target) / self.grad_acc_steps if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -168,55 +148,57 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * self.grad_acc_steps 
for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() if swa_update_bn: log_tag = "train_" - self.set_train_mode() + self.model.train() else: log_tag = "val_" self.model.eval() - for batch, (data, target) in enumerate(data_loader): - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + for batch, data in enumerate(data_loader): + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) if torch.rand(1) < self.p_val_attack: # generate adversarial attacks self.model.eval() - data = self.attack.generate(data, target) + data = self.attack.generate(input_data, target) if swa_update_bn: - self.set_train_mode() + self.model.train() with torch.no_grad(): - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(data, **self.amp_args) - loss = self.loss(output, target) + loss = self.loss(output.logits, target) - batch_metrics["loss"] = loss.mean().item() + batch_metrics["loss"] = loss.item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) @@ -233,7 +215,7 @@ def filter_args(**kwargs): return args @staticmethod - def add_class_args(parser, prefix=None, skip=[]): + def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index 75c3ece8..01676300 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -2,16 +2,20 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os -from collections import OrderedDict as ODict -import time import logging +import os +import time +from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser -from ..utils import MetricAcc # , TorchDataParallel +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset +from .torch_trainer import AMPDType from .xvector_trainer_from_wav import XVectorTrainerFromWav @@ -39,6 +43,7 @@ class XVectorAdvTrainerFromWav(XVectorTrainerFromWav): loss: if None, it uses cross-entropy train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. 
steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -47,7 +52,10 @@ class XVectorAdvTrainerFromWav(XVectorTrainerFromWav): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. """ def __init__( @@ -60,18 +68,21 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, p_attack=0.8, p_val_attack=0, device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", loss=None, - train_mode="train", + train_mode="full", use_amp=False, - log_interval=10, + amp_dtype=AMPDType.FLOAT16, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -80,38 +91,13 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, + input_key="x", + target_key="class_id", ): - - super().__init__( - model, - feat_extractor, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - loss=loss, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) - + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) self.attack = attack self.attack.to(device) self.p_attack = p_attack * self.grad_acc_steps @@ -124,51 +110,42 @@ def __init__( "first step of the gradient acc. 
loop given that" "adv optimization over-writes the gradients " "stored in the model" - ) - % (p_attack, 1.0 / self.grad_acc_steps) + ), + p_attack, + 1.0 / self.grad_acc_steps, ) - # if data_parallel: - # # change model in attack by the data parallel version - # self.attack.model = TorchDataParallel(self.attack.model) - # # make loss function in attack data parallel - # self.attack.make_data_parallel() - def train_epoch(self, data_loader): - + batch_keys = [self.input_key, self.target_key] self.model.update_loss_margin(self.cur_epoch) metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() - self.set_train_mode() + self.model.train() - for batch, (data, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) - - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) if batch % self.grad_acc_steps == 0: if torch.rand(1) < self.p_attack: # generate adversarial attacks # logging.info('generating adv attack for batch=%d' % (batch)) self.model.eval() - data_adv = self.attack.generate(data, target) + data_adv = self.attack.generate(input_data, target) max_delta = torch.max(torch.abs(data_adv - data)).item() - # z = torch.abs(data_adv-data) > 100 - # logging.info('zz {} {}'.format(data[z], data_adv[z])) - # logging.info('adv attack max perturbation=%f' % (max_delta)) - data = data_adv - self.set_train_mode() + input_data = data_adv + self.model.train() self.optimizer.zero_grad() with torch.no_grad(): - feats = self.feat_extractor(data) + feats = self.feat_extractor(input_data) - with self.amp_autocast(): - output = self.model(feats, target) - loss = self.loss(output, target).mean() / self.grad_acc_steps + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): + output = self.model(feats, y=target) + loss = self.loss(output.logits, target) / self.grad_acc_steps if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -176,56 +153,57 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() if swa_update_bn: log_tag = "train_" - self.set_train_mode() + self.model.train() else: log_tag = "val_" self.model.eval() for batch, (data, target) in enumerate(data_loader): - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] - + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) if torch.rand(1) < self.p_val_attack: # generate adversarial attacks self.model.eval() - 
data = self.attack.generate(data, target) + input_data = self.attack.generate(input_data, target) if swa_update_bn: - self.set_train_mode() + self.model.train() with torch.no_grad(): - feats = self.feat_extractor(data) - with self.amp_autocast(): + feats = self.feat_extractor(input_data) + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(feats) - loss = self.loss(output, target) + loss = self.loss(output.logits, target) - batch_metrics["loss"] = loss.mean().item() + batch_metrics["loss"] = loss.item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) @@ -242,7 +220,7 @@ def filter_args(**kwargs): return args @staticmethod - def add_class_args(parser, prefix=None, skip=[]): + def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") @@ -263,4 +241,3 @@ def add_class_args(parser, prefix=None, skip=[]): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='trainer options') diff --git a/hyperion/torch/trainers/xvector_finetuner.py b/hyperion/torch/trainers/xvector_finetuner.py deleted file mode 100644 index cf833257..00000000 --- a/hyperion/torch/trainers/xvector_finetuner.py +++ /dev/null @@ -1,117 +0,0 @@ -""" - Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import os -from collections import OrderedDict as ODict - -import time -import logging - -import torch -import torch.nn as nn - -from ..utils import MetricAcc -from .xvector_trainer import XVectorTrainer - - -class XVectorFinetuner(XVectorTrainer): - def __init__( - self, - model, - optimizer, - epochs, - exp_path, - cur_epoch=0, - grad_acc_steps=1, - device=None, - metrics=None, - lr_scheduler=None, - loggers=None, - data_parallel=False, - loss=None, - finetune_mode="ft-embed-affine", - ): - - super(XVectorFinetuner, self).__init__( - model, - optimizer, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lr_scheduler=lr_scheduler, - loggers=loggers, - data_parallel=data_parallel, - loss=loss, - ) - - self.finetune_mode = finetune_mode - - def train_epoch(self, data_loader): - # epoch_batches = len(data_loader.dataset) - # total_batches = self.cur_epoch * epoch_batches - - self.model.update_loss_margin(self.cur_epoch) - - metric_acc = MetricAcc() - batch_metrics = ODict() - # self.model.train_mode(self.finetune_mode) - self.model.eval() - for batch, (data, target) in enumerate(data_loader): - self.loggers.on_batch_begin(batch) - - if batch % self.grad_acc_steps == 0: - self.optimizer.zero_grad() - - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] - - output = self.model(data, target) - loss = self.loss(output, target).mean() / self.grad_acc_steps - loss.backward() - - if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None: - self.lr_scheduler.on_opt_step() - self.optimizer.step() - - batch_metrics["loss"] = loss.item() * self.grad_acc_steps - for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) - - # logging.info('batch={} shape={} loss={} acc={}'.format(batch,data.shape, batch_metrics['loss'], batch_metrics['acc'])) - - # if batch > 63: - # logging.info(str(self.model.classif_net.fc_blocks[0].linear.weight)) - # 
logging.info(str(self.model.classif_net.fc_blocks[0].linear.weight.grad)) - # if batch > 63 : - # t=torch.nn.functional.cross_entropy(output, target, reduction='none') - # logging.info(str(t)) - # if batch == 65: - # #torch.set_printoptions(profile="full") - # #logging.info(str(data[1])) - # #logging.info(str(target[1])) - # #logging.info(str(output[1])) - - # #logging.info(str(data[33])) - # #logging.info(str(target[33])) - # logging.info(str(output[33, target[33]])) - # #time.sleep(1000) - # #torch.set_printoptions(profile="default") - - # #logging.info(str(torch.sum(torch.isnan(data)))) - # #logging.info(str(torch.sum(torch.isnan(target)))) - # #logging.info(str(torch.sum(torch.isnan(output)))) - - metric_acc.update(batch_metrics, batch_size) - logs = metric_acc.metrics - logs["lr"] = self._get_lr() - self.loggers.on_batch_end(logs=logs, batch_size=batch_size) - # total_batches +=1 - - logs = metric_acc.metrics - logs["lr"] = self._get_lr() - return logs diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index 190b2a30..e8a91bb0 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -2,16 +2,19 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os -from collections import OrderedDict as ODict import logging +import os +from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn +from torch.distributed.elastic.multiprocessing.errors import record -from ..utils import MetricAcc -from .torch_trainer import TorchTrainer +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset +from .torch_trainer import AMPDType, TorchTrainer class XVectorTrainer(TorchTrainer): @@ -34,6 +37,7 @@ class XVectorTrainer(TorchTrainer): loss: if None, it uses cross-entropy train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -43,7 +47,10 @@ class XVectorTrainer(TorchTrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. 
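Aside: the trainers in this patch move from `(data, target)` tuple batches to dict-style batches addressed by the new `input_key`/`target_key` options. A minimal sketch of that convention using the `tensors_subset` helper added later in this diff (the batch contents and shapes below are illustrative, not Hyperion defaults):

```python
# Sketch only: dict-style batch consumed via tensors_subset.
import torch
from hyperion.torch.utils import tensors_subset

batch = {"x": torch.randn(8, 200, 80), "class_id": torch.randint(0, 100, (8,))}
# pulls the named tensors out of the batch dict and moves them to the device
x, class_id = tensors_subset(batch, ["x", "class_id"], torch.device("cpu"))
```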
""" def __init__( @@ -54,16 +61,19 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", loss=None, - train_mode="train", + train_mode="full", use_amp=False, - log_interval=10, + amp_dtype=AMPDType.FLOAT16, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -72,84 +82,113 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, + input_key="x", + target_key="class_id", ): - if loss is None: loss = nn.CrossEntropyLoss() - super().__init__( - model, - loss, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + @record def train_epoch(self, data_loader): """Training epoch loop Args: data_loader: pytorch data loader returning features and class labels. """ - + # batch_keys = [self.input_key, self.target_key] self.model.update_loss_margin(self.cur_epoch) metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() - self.set_train_mode() - for batch, (data, target) in enumerate(data_loader): + self.model.train() + + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] - - with self.amp_autocast(): - output = self.model(data, target, **self.amp_args) - loss = self.loss(output, target).mean() / self.grad_acc_steps - - if self.use_amp: - self.grad_scaler.scale(loss).backward() - else: - loss.backward() + input_keys = self.get_augs_keys(data, self.input_key) + loss_scale = self.grad_acc_steps * len(input_keys) + loss_acc = 0.0 + for aug_key in input_keys: + batch_keys = [aug_key, self.target_key] + x, target = tensors_subset(data, batch_keys, self.device) + batch_size = x.size(0) + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): + output = self.model(x, y=target) + loss = self.loss(output.logits, target) / loss_scale + loss_acc += loss.item() + + if self.use_amp: + self.grad_scaler.scale(loss).backward() + else: + loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) - batch_metrics["loss"] = loss.item() * self.grad_acc_steps + batch_metrics["loss"] = loss_acc * self.grad_acc_steps for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) + 
logs.update(self._get_wds()) + return logs + + def validation_epoch(self, data_loader, swa_update_bn=False): + """Validation epoch loop + + Args: + data_loader: PyTorch data loader returning input/output pairs. + swa_update_bn: whether or not to update batch-norm layers in SWA. + """ + # batch_keys = [self.input_key, self.target_key] + metric_acc = MetricAcc(self.device) + batch_metrics = ODict() + with torch.no_grad(): + if swa_update_bn: + log_tag = "train_" + self.model.train() + else: + log_tag = "val_" + self.model.eval() + + for batch, data in enumerate(data_loader): + input_keys = self.get_augs_keys(data, self.input_key) + loss_scale = len(input_keys) + loss_acc = 0.0 + for aug_key in input_keys: + batch_keys = [aug_key, self.target_key] + x, target = tensors_subset(data, batch_keys, self.device) + batch_size = x.size(0) + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): + output = self.model(x) + loss = self.loss(output.logits, target) / loss_scale + loss_acc += loss.item() + + batch_metrics["loss"] = loss_acc + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output.logits, target) + + metric_acc.update(batch_metrics, batch_size) + + logs = metric_acc.metrics + logs = ODict((log_tag + k, v) for k, v in logs.items()) return logs diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py index 7b7cb21c..f230372c 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py @@ -2,38 +2,21 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os -from collections import OrderedDict as ODict import logging +import os +from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser -from ..utils import MetricAcc # , TorchDataParallel +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset +from .torch_trainer import AMPDType from .xvector_trainer import XVectorTrainer -# class DFRModelWrapper(nn.Module): -# """Wrapper class for the xvector model, which -# replace the forward method by the forward_hid_feats method - -# This is need because nn.DataParallel only support multi-gpu when colling the -# forward method, but not the other methods in the nn.Module classes. -# """ -# def __init__(self, model): -# super().__init__() -# self.model = model - -# def forward(self, x, y=None, enc_layers=None, classif_layers=None, -# return_output=False, use_amp=False): -# if use_amp: -# with torch.cuda.amp.autocast(): -# return self.model.forward_hid_feats( -# x, y, enc_layers, classif_layers, return_output) - -# return self.model.forward_hid_feats( -# x, y, enc_layers, classif_layers, return_output) - class XVectorTrainerDeepFeatReg(XVectorTrainer): """Trainer to train x-vector style models. @@ -60,6 +43,7 @@ class XVectorTrainerDeepFeatReg(XVectorTrainer): reg_loss: nn.Module loss used for regularization, if None it uses L1 loss. train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim.
steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -69,7 +53,10 @@ class XVectorTrainerDeepFeatReg(XVectorTrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. """ def __init__( @@ -81,6 +68,7 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, reg_layers_enc=None, reg_layers_classif=None, reg_weight_enc=0.1, @@ -88,14 +76,16 @@ def __init__( device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", loss=None, reg_loss=None, - train_mode="train", + train_mode="full", use_amp=False, - log_interval=10, + amp_dtype=AMPDType.FLOAT16, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -104,36 +94,13 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, + input_key="x", + target_key="class_id", ): - - super().__init__( - model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - loss=loss, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) self.prior_model = prior_model if reg_loss is None or reg_loss == "l1": @@ -149,67 +116,47 @@ def __init__( if device is not None: self.prior_model.to(device) - # self.model_wrapper = DFRModelWrapper(self.model) - # self.prior_model_wrapper = DFRModelWrapper(self.prior_model) - - # if device is not None: - # self.model_wrapper.to(device) - # self.prior_model_wrapper.to(device) - # self.reg_loss.to(device) - - # if data_parallel: - # self.model_wrapper = TorchDataParallel(self.model_wrapper) - # self.prior_model_wrapper = TorchDataParallel(self.prior_model_wrapper) - # self.reg_loss = TorchDataParallel(self.reg_loss) - def train_epoch(self, data_loader): """Training epoch loop Args: data_loader: PyTorch data loader return input/output pairs """ + batch_keys = [self.input_key, self.target_key] self.model.update_loss_margin(self.cur_epoch) metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() - self.set_train_mode() + self.model.train() - for batch, (data, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) - if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] - - with self.amp_autocast(): - # h_enc, h_classif, output = self.model_wrapper( - # data, target, self.reg_layers_enc, self.reg_layers_classif, - # return_output=True, **self.amp_args) + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): outputs = self.model( - data, - target, - self.reg_layers_enc, - 
self.reg_layers_classif, + input_data, + y=target, + return_enc_layers=self.reg_layers_enc, + return_classif_layers=self.reg_layers_classif, return_output=True, ) h_enc, h_classif, output = ( outputs["h_enc"], outputs["h_classif"], - outputs["output"], + outputs["logits"], ) - loss = self.loss( - output, target - ).mean() # you need to take the mean here because of the multi-gpu training + loss = self.loss(output, target) batch_metrics["loss-classif"] = loss.item() prior_outputs = self.prior_model( - data, - target, - self.reg_layers_enc, - self.reg_layers_classif, + input_data, + return_enc_layers=self.reg_layers_enc, + return_classif_layers=self.reg_layers_classif, return_output=False, ) prior_h_enc, prior_h_classif = ( @@ -246,9 +193,9 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) @@ -256,12 +203,13 @@ def train_epoch(self, data_loader): metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) - # total_batches +=1 logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs @staticmethod @@ -319,4 +267,3 @@ def add_class_args(parser, prefix=None, skip=[]): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='trainer options') diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py index 29964322..3d1a8ccf 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py @@ -2,16 +2,18 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os -from collections import OrderedDict as ODict import logging +import os +from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn -from ..utils import MetricAcc # , TorchDataParallel -from .torch_trainer import TorchTrainer +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset +from .torch_trainer import AMPDType from .xvector_trainer_deep_feat_reg import XVectorTrainerDeepFeatReg @@ -40,6 +42,7 @@ class XVectorTrainerDeepFeatRegFromWav(XVectorTrainerDeepFeatReg): reg_loss: nn.Module loss used for regularization, if None it uses L1 loss. train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -49,7 +52,10 @@ class XVectorTrainerDeepFeatRegFromWav(XVectorTrainerDeepFeatReg): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp + input_key: dict. key for nnet input. 
+      target_key: dict. key for nnet targets. """ def __init__( @@ -62,6 +68,7 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, reg_layers_enc=None, reg_layers_classif=None, reg_weight_enc=0.1, @@ -69,13 +76,15 @@ def __init__( device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", loss=None, reg_loss=None, - train_mode="train", + train_mode="full", use_amp=False, + amp_dtype=AMPDType.FLOAT16, log_interval=10, use_tensorboard=False, use_wandb=False, @@ -85,104 +94,64 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, + input_key="x", + target_key="class_id", ): - - super().__init__( - model, - prior_model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - reg_layers_enc=reg_layers_enc, - reg_layers_classif=reg_layers_classif, - reg_weight_enc=reg_weight_enc, - reg_weight_classif=reg_weight_classif, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - loss=loss, - reg_loss=reg_loss, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) self.feat_extractor = feat_extractor if device is not None: self.feat_extractor.to(device) - # if data_parallel: - # self.feat_extractor = TorchDataParallel(self.feat_extractor) - def train_epoch(self, data_loader): """Training epoch loop Args: data_loader: PyTorch data loader return input/output pairs """ + batch_keys = [self.input_key, self.target_key] self.model.update_loss_margin(self.cur_epoch) metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() - self.set_train_mode() + self.model.train() - for batch, (data, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) - if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] - + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) with torch.no_grad(): - feats = self.feat_extractor(data) + feats = self.feat_extractor(input_data) - with self.amp_autocast(): - # h_enc, h_classif, output = self.model_wrapper( - # feats, target, self.reg_layers_enc, self.reg_layers_classif, - # return_output=True, **self.amp_args) + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): outputs = self.model( feats, - target, - self.reg_layers_enc, - self.reg_layers_classif, + y=target, + return_enc_layers=self.reg_layers_enc, + return_classif_layers=self.reg_layers_classif, return_output=True, ) h_enc, h_classif, output = ( outputs["h_enc"], outputs["h_classif"], - outputs["output"], + outputs["logits"], ) loss = self.loss( output, target - ).mean() # you need to take the mean here because of the multi-gpu training + ) batch_metrics["loss-classif"] = loss.item() - # prior_h_enc, prior_h_classif = self.prior_model_wrapper( - # feats, target, self.reg_layers_enc, self.reg_layers_classif, - # return_output=False, **self.amp_args) prior_outputs = self.prior_model(
feats, - target, - self.reg_layers_enc, - self.reg_layers_classif, + return_enc_layers=self.reg_layers_enc, + return_classif_layers=self.reg_layers_classif, return_output=False, ) prior_h_enc, prior_h_classif = ( @@ -219,21 +188,23 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -242,28 +213,29 @@ def validation_epoch(self, data_loader, swa_update_bn=False): Args: data_loader: PyTorch data loader return input/output pairs """ + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() with torch.no_grad(): if swa_update_bn: log_tag = "train_" - self.set_train_mode() + self.model.train() else: log_tag = "val_" self.model.eval() - for batch, (data, target) in enumerate(data_loader): - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + for batch, data in enumerate(data_loader): + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) - feats = self.feat_extractor(data) - with self.amp_autocast(): + feats = self.feat_extractor(input_data) + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(feats) - loss = self.loss(output, target) + loss = self.loss(output.logits, target) - batch_metrics["loss"] = loss.mean().item() + batch_metrics["loss"] = loss.item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 06086d32..2f1fd18a 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -2,15 +2,18 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os -from collections import OrderedDict as ODict import logging +import os +from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn -from ..utils import MetricAcc, TorchDDP +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, TorchDDP, tensors_subset +from .torch_trainer import AMPDType from .xvector_trainer import XVectorTrainer @@ -34,6 +37,7 @@ class XVectorTrainerFromWav(XVectorTrainer): loss: if None, it uses cross-entropy train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. 
steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -43,7 +47,10 @@ class XVectorTrainerFromWav(XVectorTrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. """ def __init__( @@ -55,16 +62,19 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", loss=None, - train_mode="train", + train_mode="full", use_amp=False, - log_interval=10, + amp_dtype=AMPDType.FLOAT16, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -73,70 +83,43 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, + input_key="x", + target_key="class_id", ): - - super().__init__( - model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - loss=loss, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) - + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) self.feat_extractor = feat_extractor if device is not None: self.feat_extractor.to(device) - # if ddp: - # self.feat_extractor = TorchDDP(self.feat_extractor) - def train_epoch(self, data_loader): """Training epoch loop Args: data_loader: pytorch data loader returning features and class labels. 
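Aside: the rewritten waveform loops below assume a feature extractor that now returns both the features and their frame counts. A hedged sketch of that contract with a stand-in module (`FBankStub` and all shapes are assumptions for illustration, not Hyperion's real extractor):

```python
# Sketch of the (feats, feats_lengths) contract assumed by the new loops.
import torch

class FBankStub(torch.nn.Module):  # hypothetical stand-in for feat_extractor
    def forward(self, audio):
        feats = torch.randn(audio.size(0), 100, 80)        # (batch, time, feat_dim)
        feats_lengths = torch.full((audio.size(0),), 100)  # valid frames per utterance
        return feats, feats_lengths

with torch.no_grad():  # as in the trainer, no gradients through feature extraction
    feats, feats_lengths = FBankStub()(torch.randn(4, 16000))
```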
""" - + batch_keys = [self.input_key, self.target_key] self.model.update_loss_margin(self.cur_epoch) metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() - self.set_train_mode() - - for batch, (data, target) in enumerate(data_loader): + self.feat_extractor.train() + self.model.train() + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + audio, target = tensors_subset(data, batch_keys, self.device) + batch_size = audio.size(0) with torch.no_grad(): - feats = self.feat_extractor(data) + feats, feats_lengths = self.feat_extractor(audio) - with self.amp_autocast(): - output = self.model(feats, target) - loss = self.loss(output, target).mean() / self.grad_acc_steps + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): + output = self.model(feats, feats_lengths, y=target) + loss = self.loss(output.logits, target) / self.grad_acc_steps if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -144,52 +127,57 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): """Validation epoch loop Args: - data_loader: PyTorch data loader return input/output pairs + data_loader: PyTorch data loader return input/output pairs. + sw_update_bn: wheter or not, update batch-norm layers in SWA. 
""" + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() + self.feat_extractor.eval() with torch.no_grad(): if swa_update_bn: log_tag = "train_" - self.set_train_mode() + self.model.train() else: log_tag = "val_" self.model.eval() - for batch, (data, target) in enumerate(data_loader): - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + for batch, data in enumerate(data_loader): + audio, target = tensors_subset(data, batch_keys, self.device) + batch_size = audio.size(0) - feats = self.feat_extractor(data) - with self.amp_autocast(): - output = self.model(feats, **self.amp_args) - loss = self.loss(output, target) + feats, feats_lengths = self.feat_extractor(audio) + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): + output = self.model(feats, feats_lengths) + loss = self.loss(output.logits, target) batch_metrics["loss"] = loss.mean().item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) diff --git a/hyperion/torch/utils/__init__.py b/hyperion/torch/utils/__init__.py index 6db39ef3..cbfab5ed 100644 --- a/hyperion/torch/utils/__init__.py +++ b/hyperion/torch/utils/__init__.py @@ -3,8 +3,22 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .devices import open_device -from .metric_acc import MetricAcc -from .eval_utils import eval_nnet_by_chunks, eval_nnet_overlap_add +from .collation import ( + collate_seqs_1d, + collate_seqs_2d, + collate_seqs_nd, + list_of_dicts_to_list, +) from .data_parallel import TorchDataParallel -from .ddp import TorchDDP, FairShardedDDP, FairFullyShardedDDP +from .ddp import FairFullyShardedDDP, FairShardedDDP, TorchDDP +from .devices import ( + open_device, + tensors_subset, + tensors_to_cpu, + tensors_to_device, + tensors_to_numpy, +) +from .eval_utils import eval_nnet_by_chunks, eval_nnet_overlap_add +from .masking import make_attn_mask_causal, scale_seq_lengths, seq_lengths_to_mask +from .metric_acc import MetricAcc +from .vad_utils import remove_silence diff --git a/hyperion/torch/utils/collation.py b/hyperion/torch/utils/collation.py new file mode 100644 index 00000000..2b18a87a --- /dev/null +++ b/hyperion/torch/utils/collation.py @@ -0,0 +1,199 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba, Nanxin Chen) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import torch +import torch.nn as nn +from torch.nn.utils.rnn import pad_sequence + + +def list_of_dicts_to_list(list_of_dicts, key): + """Takes a list of dictionaries and a key, + and returns a list of the items corresponding to the key + """ + output = [] + for item in list_of_dicts: + output.append(item[key]) + + return output + + +def collate_seqs_1d(x, pad_value=0): + """Combines a list/tuple of vectors with different lengths + into a single tensor. + + Args: + x: input lits/tuple of vectors. + + Returns: + 2D tensor with shape (num_vectors, max_vector_length). + 1D long tensor containing the vector lengths. 
+    """ + if not isinstance(x[0], torch.Tensor): + x = [torch.from_numpy(x_i) for x_i in x] + + assert x[0].dim() == 1 + x_lengths = [] + for x_i in x: + x_lengths.append(x_i.size(0)) + + x_lengths = torch.as_tensor(x_lengths) + x = pad_sequence(x, batch_first=True, padding_value=pad_value) + return x, x_lengths + + +def collate_seqs_2d(x, pad_value=0, pad_dim=0): + """Combines a list/tuple of matrices with different sizes in one of + the dimensions into a single 3d tensor. + Performs padding on the dimension which is not constant. + + Args: + x: input list/tuple of matrices. + pad_dim: padding dimension. + + Returns: + 3D tensor with shape (num_vectors, max_length, feat_dim) or (num_vectors, feat_dim, length). + 1D long tensor containing the dimensions lengths. + """ + if not isinstance(x[0], torch.Tensor): + x = [torch.from_numpy(x_i) for x_i in x] + assert x[0].dim() == 2 + if pad_dim < 0: + pad_dim = 2 + pad_dim + + if pad_dim != 0: + x = [x_i.transpose(pad_dim, 0) for x_i in x] + + x_lengths = [] + for x_i in x: + x_lengths.append(x_i.size(0)) + + x_lengths = torch.as_tensor(x_lengths) + x = pad_sequence(x, batch_first=True, padding_value=pad_value) + if pad_dim != 0: + x = x.transpose(1, pad_dim + 1) + + return x, x_lengths + + +def collate_seqs_nd(x, pad_value=0, pad_dim=0): + """Combines a list/tuple of N-d tensors with different sizes in one of + the dimensions into a single (N+1)-d tensor. + Performs padding on the dimension which is not constant. + + Args: + x: input list/tuple of tensors. + pad_dim: padding dimension. + + Returns: + (N+1)-d combined tensor. + 1D long tensor containing the dimensions lengths. + """ + if not isinstance(x[0], torch.Tensor): + x = [torch.from_numpy(x_i) for x_i in x] + + if x[0].dim() == 1: + return collate_seqs_1d(x, pad_value=pad_value) + + if pad_dim < 0: + pad_dim = x[0].dim() + pad_dim + + if pad_dim != 0: + x = [x_i.transpose(pad_dim, 0) for x_i in x] + + x_lengths = [] + for x_i in x: + x_lengths.append(x_i.size(0)) + + x_lengths = torch.as_tensor(x_lengths) + x = pad_sequence(x, batch_first=True, padding_value=pad_value) + if pad_dim != 0: + x = x.transpose(1, pad_dim + 1) + + return x, x_lengths + + +# def collate_seq_1d(x, pad_value=0): +# """Combines a list/tuple of vectors with different lengths +# into a single tensor. + +# Args: +# x: input list/tuple of vectors. + +# Returns: +# 2D tensor with shape (num_vectors, max_vector_length). +# 1D long tensor containing the vector lengths. +# """ +# max_length = max([x_i.size(0) for x_i in x]) +# y = pad_value * torch.ones(len(x), max_length, dtype=x[0].dtype, device=x[0].device) +# y_lengths = torch.empty(len(x), dtype=torch.long, device=x[0].device) +# for i, x_i in enumerate(x): +# y[i, : x_i.size(0)] = x_i +# y_lengths[i] = x_i.size(0) + +# return y, y_lengths + + +# def collate_seq_2d(x, pad_value=0, pad_dim=-1): +# """Combines a list/tuple of matrices with different sizes in one of +# the dimensions into a single 3d tensor. +# Performs padding on the dimension which is not constant. + +# Args: +# x: input list/tuple of matrices. +# pad_dim: padding dimension. + +# Returns: +# 3D tensor with shape (num_vectors, max_length, feat_dim) or (num_vectors, feat_dim, length). +# 1D long tensor containing the dimensions lengths.
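Aside: a quick usage check for the collation helpers above (tensor contents are illustrative; behavior follows directly from the definitions):

```python
import torch
from hyperion.torch.utils import collate_seqs_1d, collate_seqs_nd

# vectors of different lengths are zero-padded into one 2D tensor
vecs = [torch.arange(3.0), torch.arange(5.0)]
y, y_lengths = collate_seqs_1d(vecs)           # y.shape == (2, 5)
assert y_lengths.tolist() == [3, 5]

# 2D feature matrices padded over time (dim 0)
feats = [torch.randn(100, 80), torch.randn(75, 80)]
y, y_lengths = collate_seqs_nd(feats, pad_dim=0)  # y.shape == (2, 100, 80)
assert y_lengths.tolist() == [100, 75]
```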
+#    """ +# max_length = max([x_i.size(pad_dim) for x_i in x]) +# y_size = list(x[0].size()) +# y_size[pad_dim] = max_length +# y = pad_value * torch.ones(*y_size, dtype=x[0].dtype, device=x[0].device) +# y_lengths = torch.empty(len(x), dtype=torch.long, device=x[0].device) +# if pad_dim == -1 or pad_dim == 1: +# for i, x_i in enumerate(x): +# y[i, :, : x_i.size(pad_dim)] = x_i +# y_lengths[i] = x_i.size(pad_dim) +# else: +# for i, x_i in enumerate(x): +# y[i, : x_i.size(pad_dim)] = x_i +# y_lengths[i] = x_i.size(pad_dim) + +# return y, y_lengths + + +# def collate_seq_nd(x, pad_value=0, pad_dim=-1): +# """Combines a list/tuple of N-d tensors with different sizes in one of +# the dimensions into a single (N+1)-d tensor. +# Performs padding on the dimension which is not constant. + +# Args: +# x: input list/tuple of tensors. +# pad_dim: padding dimension. + +# Returns: +# (N+1)-D combined tensor. +# 1D long tensor containing the dimensions lengths. +# """ +# if x[0].dim() == 1: +# return collate_seq_1d(x) + +# if x[0].dim() == 2: +# return collate_seq_2d(x) + +# # here the general case +# max_length = max([x_i.size(pad_dim) for x_i in x]) +# y_trans_size = list(x[0].transpose(0, pad_dim).size()) +# y = pad_value * torch.ones(*y_trans_size, dtype=x[0].dtype, device=x[0].device) +# y_lengths = torch.empty(len(x), dtype=torch.long, device=x[0].device) +# for i, x_i in enumerate(x): +# y[i, : x_i.size(pad_dim)] = x_i.transpose(0, pad_dim) +# y_lengths[i] = x_i.size(pad_dim) + +# if pad_dim > 0: +# pad_dim = pad_dim + 1 +# y = y.transpose(1, pad_dim).contiguous() +# return y, y_lengths diff --git a/hyperion/torch/utils/ddp.py b/hyperion/torch/utils/ddp.py index 48a8bcfe..aa5efe37 100644 --- a/hyperion/torch/utils/ddp.py +++ b/hyperion/torch/utils/ddp.py @@ -2,20 +2,20 @@ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os +import datetime import logging +import os import torch -import torch.nn as nn import torch.distributed as dist -from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP +import torch.nn as nn from fairscale.nn.data_parallel import FullyShardedDataParallel as FullyShardedDDP +from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP from .devices import open_device def add_ddp_args(parser): - parser.add_argument( "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" ) @@ -47,7 +47,6 @@ def filter_ddp_args(**kwargs): def ddp_init( gpu_id, num_gpus, node_id=0, num_nodes=1, master_addr="localhost", master_port=None ): - rank = node_id * num_gpus + gpu_id world_size = num_nodes * num_gpus @@ -55,15 +54,22 @@ def ddp_init( device = open_device(num_gpus) return device, 0, 1 - os.environ["MASTER_ADDR"] = master_addr - os.environ["MASTER_PORT"] = master_port + os.environ["MASTER_ADDR"] = str(master_addr) + os.environ["MASTER_PORT"] = str(master_port) logging.info( - f"init ddp rank={rank} world_size={world_size} master={master_addr}:{master_port}" + f"init ddp rank={rank} world_size={world_size} master={master_addr}:{master_port} gpu_id={gpu_id}" ) - dist.init_process_group("nccl", rank=rank, world_size=world_size) + dist.init_process_group( + "nccl", + rank=rank, + world_size=world_size, + ) + torch.cuda.set_device(gpu_id) torch.tensor([0]).to(gpu_id) - return gpu_id, rank, world_size + device = torch.device("cuda", gpu_id) + return device, rank, world_size + # return gpu_id, rank, world_size def ddp_cleanup(): @@ -73,6 +79,23 @@ def
ddp_cleanup(): pass +def ddp_wait_for_all_procs(): + if dist.is_initialized(): + dist.barrier() + + +def ddp_get_rank_world_size(): + if dist.is_initialized(): + return dist.get_rank(), dist.get_world_size() + return 0, 1 + + +def ddp_get_rank(): + if dist.is_initialized(): + return dist.get_rank() + return 0 + + class TorchDDP(nn.parallel.DistributedDataParallel): def __getattr__(self, name): try: diff --git a/hyperion/torch/utils/devices.py b/hyperion/torch/utils/devices.py index 16c61a48..c0736f2f 100644 --- a/hyperion/torch/utils/devices.py +++ b/hyperion/torch/utils/devices.py @@ -2,9 +2,9 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os import subprocess -import logging import torch @@ -42,3 +42,66 @@ def find_free_gpus(num_gpus): except: gpu_ids = "0" return gpu_ids + + +def tensors_to_device(data, device): + if isinstance(data, dict): + for k in data: + data[k] = data[k].to(device) + elif isinstance(data, list): + for i, value in enumerate(data): + data[i] = value.to(device) + elif isinstance(data, tuple): + data = tuple(value.to(device) for value in data) + elif isinstance(data, torch.Tensor): + data = data.to(device) + else: + raise Exception(f"Unknown data type for {data}") + + return data + + +def tensors_to_cpu(data): + if isinstance(data, dict): + for k in data: + data[k] = data[k].cpu() + elif isinstance(data, list): + for i, value in enumerate(data): + data[i] = value.cpu() + elif isinstance(data, tuple): + data = tuple(value.cpu() for value in data) + elif isinstance(data, torch.Tensor): + data = data.cpu() + else: + raise Exception(f"Unknown data type for {data}") + + return data + + +def tensors_to_numpy(data): + if isinstance(data, dict): + for k in data: + data[k] = data[k].cpu().numpy() + elif isinstance(data, list): + for i, value in enumerate(data): + data[i] = value.cpu().numpy() + elif isinstance(data, tuple): + data = tuple(value.cpu().numpy() for value in data) + elif isinstance(data, torch.Tensor): + data = data.cpu().numpy() + else: + raise Exception(f"Unknown data type for {data}") + + return data + + +def tensors_subset(data, keys, device=None, return_dict=False): + if return_dict: + data = {k: data[k] for k in keys} + else: + data = tuple(data[k] for k in keys) + + if device is not None: + data = tensors_to_device(data, device) + + return data diff --git a/hyperion/torch/utils/dummy_k2.py b/hyperion/torch/utils/dummy_k2.py new file mode 100644 index 00000000..27d387de --- /dev/null +++ b/hyperion/torch/utils/dummy_k2.py @@ -0,0 +1,10 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +version = 0.0 + + +class RaggedTensor: + pass diff --git a/hyperion/torch/utils/eval_utils.py b/hyperion/torch/utils/eval_utils.py index e8fa9c86..d6a9063a 100644 --- a/hyperion/torch/utils/eval_utils.py +++ b/hyperion/torch/utils/eval_utils.py @@ -4,22 +4,20 @@ """ import math + import torch def eval_nnet_by_chunks(x, nnet, chunk_length=0, detach_chunks=True, time_dim=-1): - # model_device = next(nnet.parameters()).device - # print(device, model_device, x.device) - # assume time is the last dimension - device = None if nnet.device == x.device else nnet.device - T = x.shape[time_dim] if T <= chunk_length or chunk_length == 0: if device is not None: x = x.to(device) y = nnet(x) + if isinstance(y, tuple): + y = y[0] if detach_chunks: y = y.detach() return y @@ -53,6 +51,8 @@ def 
eval_nnet_by_chunks(x, nnet, chunk_length=0, detach_chunks=True, time_dim=-1 x_i = x_i.to(device) y_i = nnet(x_i) + if isinstance(y_i, tuple): + y_i = y_i[0] if detach_chunks: y_i = y_i.detach() @@ -102,7 +102,6 @@ def eval_nnet_by_chunks(x, nnet, chunk_length=0, detach_chunks=True, time_dim=-1 def eval_nnet_overlap_add( x, nnet, chunk_length=0, chunk_overlap=None, detach_chunks=True, time_dim=-1 ): - device = None if nnet.device == x.device else nnet.device # assume time is the last dimension @@ -183,170 +182,3 @@ def eval_nnet_overlap_add( y = y.transpose(0, time_dim) / count return y - - -# """ -# Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -# """ - -# import math -# import torch - -# def eval_nnet_by_chunks(x, nnet, chunk_length=0, device=None, time_dim=-1): -# # model_device = next(nnet.parameters()).device -# # print(device, model_device, x.device) -# #assume time is the last dimension -# T = x.shape[time_dim] -# if T <= chunk_length or chunk_length == 0: -# if device is not None: -# x = x.to(device) -# return nnet(x) #.detach() - -# try: -# left_context, right_context = nnet.in_context() -# except: -# left_context = right_context = 0 - -# in_shape = x.shape -# chunk_shift_in = chunk_length - left_context - right_context - -# try: -# out_shape = nnet.out_shape(in_shape) -# T_out = out_shape[time_dim] -# r = float(T_out)/T -# except: -# out_shape = None - - -# num_chunks = int(math.ceil((T-chunk_length)/chunk_shift_in+1)) -# #move time dimension to dim 0 -# x = x.transpose(0, time_dim) -# y = None -# tbeg_in = 0 -# tbeg_out = 0 -# for i in range(num_chunks): -# tend_in = min(tbeg_in + chunk_length, x.shape[0]) -# #get slice and move back time dimension to last dim -# x_i = x[tbeg_in:tend_in].transpose(0, time_dim) -# if device is not None: -# x_i = x_i.to(device) - -# y_i = nnet(x_i).detach() -# chunk_length_out = y_i.shape[time_dim] -# if out_shape is None: -# # infer chunk_shift in the output -# r = float(chunk_length_out)/chunk_length - -# # infer total output length -# T_out = int(r * T) -# out_shape = list(y_i.shape) -# out_shape[time_dim] = T_out - -# if y is None: -# right_context_out = int(math.floor(r*right_context)) -# left_context_out = int(math.floor(r*left_context)) -# chunk_shift_out = chunk_length_out - right_context_out - left_context_out -# # create output tensor -# y = torch.zeros(out_shape) -# #move time dimension to dim 0 -# y = y.transpose(0, time_dim) - -# y_i = y_i.transpose(0, time_dim) - -# if i == 0: -# tend_out = min(tbeg_out + chunk_length_out, T_out) -# y[tbeg_out:tend_out] = y_i -# tbeg_out =+ (chunk_length_out - right_context_out) -# else: -# tend_out = min(int(round(tbeg_out)) + chunk_length_out - left_context_out, T_out) -# dt = tend_out - tbeg_out -# if dt > 0: -# #print('eu', tbeg_out, tend_out, left_context_out,left_context_out+dt, T_out, chunk_length, chunk_length_out, tbeg_in, tend_in) -# y[tbeg_out:tend_out] = y_i[left_context_out:left_context_out+dt] -# tbeg_out += chunk_shift_out - -# tbeg_in += chunk_shift_in - -# # put time dimension back in his place -# y = y.transpose(0, time_dim) - -# return y - - -# def eval_nnet_overlap_add(x, nnet, chunk_length=0, chunk_overlap=None, device=None, time_dim=-1): - -# #assume time is the last dimension -# T = x.shape[time_dim] -# if T <= chunk_length or chunk_length == 0: -# if device is not None: -# x = x.to(device) -# return nnet(x).detach() - -# if chunk_overlap is None: -# #infer chunk overlap from network input context -# 
try: -# left_context, right_context = nnet.in_context() -# except: -# left_context = right_context = 0 - -# chunk_overlap = left_context + right_context - - -# in_shape = x.shape -# chunk_shift_in = chunk_length - chunk_overlap - -# try: -# out_shape = nnet.out_shape(in_shape) -# T_out = out_shape[time_dim] -# r = float(T_out)/T -# except: -# out_shape = None - - -# num_chunks = int(math.ceil((T-chunk_length)/chunk_shift_in+1)) -# #move time dimension to dim 0 -# x = x.transpose(0, time_dim) -# y = None -# N = None -# tbeg_in = 0 -# tbeg_out = 0 -# for i in range(num_chunks): -# tend_in = min(tbeg_in + chunk_length, x.shape[0]) -# #get slice and move back time dimension to last dim -# x_i = x[tbeg_in:tend_in].transpose(0, time_dim) -# if device is not None: -# x_i = x_i.to(device) - -# y_i = nnet(x_i).detach() -# chunk_length_out = y_i.shape[time_dim] -# if out_shape is None: -# # infer chunk_shift in the output -# r = float(chunk_length_out)/chunk_length - -# # infer total output length -# T_out = int(r * T) -# out_shape = list(y_i.shape) -# out_shape[time_dim] = T_out - -# if y is None: -# chunk_shift_out = r*chunk_shift_in -# # create output tensor -# y = torch.zeros(out_shape) -# #move time dimension to dim 0 -# y = y.transpose(0, time_dim) -# count = torch.zeros(T_out) - -# y_i = y_i.transpose(0, time_dim) - -# tend_out = min(int(round(tbeg_out)) + chunk_length_out, T_out) -# dt = tend_out - tbeg_out -# y[tbeg_out:tend_out] += y_i[:dt] -# count[tbeg_out:tend_out] += 1 -# tbeg_out += chunk_shift_out -# tbeg_in += chunk_shift_in - -# # put time dimension back in his place and normalize -# y = y.transpose(0, time_dim)/count - -# return y diff --git a/hyperion/torch/utils/masking.py b/hyperion/torch/utils/masking.py new file mode 100644 index 00000000..1a240976 --- /dev/null +++ b/hyperion/torch/utils/masking.py @@ -0,0 +1,86 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba, Nanxin Chen) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import torch +import torch.nn as nn + + +def scale_seq_lengths(lengths, max_out_length, max_in_length=None): + if lengths is None: + return None + + if max_in_length is None: + max_in_length = lengths.max() + + if max_in_length == max_out_length: + return lengths + + return torch.div(lengths * max_out_length, max_in_length, rounding_mode="floor") + + +def seq_lengths_to_mask( + lengths, max_length=None, dtype=None, time_dim=1, ndim=None, none_if_all_max=False +): + """Creates a binary mask indicating the valid values in a sequence. + + Args: + lengths: sequence lengths with shape=(batch,). If None, it returns None + max_length: maximum length of the sequence. + dtype: dtype for the mask. + time_dim: dimension > 0 corresponding to time in the mask. This will + return a view of the mask which will adapt to the shape + of the tensor where we want to apply the mask. + This has to be a positive integer. + ndim: number of dimensions in the mask tensor, if None, it is equal to time_dim + 1. + none_if_all_max: if True and all lengths are equal to max. length, it returns None + + Returns: + Binary mask with shape=(batch,...,max_length,...)
or None + """ + if lengths is None: + return None + + assert time_dim > 0 + assert lengths.dim() == 1 + + if max_length is None: + max_length = lengths.max() + + if none_if_all_max and torch.all(lengths == max_length): + return None + + idx = torch.arange(max_length, dtype=lengths.dtype, device=lengths.device) + + # compute mask shape=(batch, max_length) + mask = idx.unsqueeze(0) < lengths.unsqueeze(1) + + if ndim is None: + ndim = time_dim + 1 + + # view to match the tensor where we want to apply the mask + if ndim > 1: + shape = [1] * ndim + shape[0] = lengths.size(0) + shape[time_dim] = -1 + mask = mask.view(*shape) + + # change dtype if needed + if dtype is not None: + mask = mask.to(dtype) + + return mask + + +def make_attn_mask_causal(mask: torch.Tensor): + """Make causal mask for decoder self-attention.""" + size = mask.size(-1) + causal_mask = torch.ones(size, size, device=mask.device, dtype=torch.bool) + torch.tril(causal_mask, out=causal_mask) + return mask & causal_mask + + +def make_dec_causal_att_mask(y: torch.Tensor, padding_idx: int): + mask = (y != padding_idx).unsqueeze(-2) + return make_attn_mask_causal(mask) diff --git a/hyperion/torch/utils/metric_acc.py b/hyperion/torch/utils/metric_acc.py index d635310b..a82c174a 100644 --- a/hyperion/torch/utils/metric_acc.py +++ b/hyperion/torch/utils/metric_acc.py @@ -4,6 +4,7 @@ """ import logging from collections import OrderedDict as ODict + import numpy as np import torch diff --git a/hyperion/torch/utils/misc.py b/hyperion/torch/utils/misc.py index 2b4f6034..46c09080 100644 --- a/hyperion/torch/utils/misc.py +++ b/hyperion/torch/utils/misc.py @@ -1,26 +1,69 @@ """ - Copyright 2020 Johns Hopkins University (Author: Jesus Villalba, Nanxin Chen) + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba, Nanxin Chen) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import torch +import torch.nn as nn import torch.cuda.amp as amp -def l2_norm(x, axis=-1): +def l2_norm(x, dim=1, axis=None): + """Applies length normalization to vectors. + + Args: + x: input tensor. + dim: dimension along which to normalize the vectors. + axis: same as dim (deprecated). + + Returns: + Normalized tensor. + """ + if axis is not None: + dim = axis + with amp.autocast(enabled=False): - norm = torch.norm(x.float(), 2, axis, True) + 1e-10 + norm = torch.norm(x.float(), 2, dim, True) + 1e-10 y = torch.div(x, norm) return y -def compute_snr(x, n, axis=-1): - P_x = 10 * torch.log10(torch.mean(x ** 2, dim=axis)) - P_n = 10 * torch.log10(torch.mean(n ** 2, dim=axis)) +def compute_snr(x, n, dim=1, axis=None): + """Computes SNR (dB) + + Args: + x: tensor with clean signal. + n: tensor with the noise. + dim: dimension along which to compute the power. + axis: same as dim (deprecated). + + Returns: + Tensor with SNR(dB) + """ + if axis is not None: + dim = axis + P_x = 10 * torch.log10(torch.mean(x ** 2, dim=dim)) + P_n = 10 * torch.log10(torch.mean(n ** 2, dim=dim)) return P_x - P_n def compute_stats_adv_attack(x, x_adv): + """Computes statistics of adversarial attack samples. + + Args: + x: benign signal tensor. + x_adv: adversarial signal tensor. + + Returns: + SNR (dB). + Power of x. + Power of n. + L2 norm of x. + Linf norm of x. + L0 norm of n. + L2 norm of n. + Linf norm of n. + """ if x.dim() > 2: x = torch.flatten(x, start_dim=1) @@ -42,6 +85,17 @@ def compute_stats_adv_attack(x, x_adv): def get_selfsim_tarnon(y, return_mask=False): + """Computes ground truth self-similarity matrix given + integer class labels.
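Aside: for reference, the behavior of `seq_lengths_to_mask` from masking.py above on small inputs (values illustrative, checked against the definition):

```python
import torch
from hyperion.torch.utils import seq_lengths_to_mask

lengths = torch.tensor([2, 4])
mask = seq_lengths_to_mask(lengths, max_length=4)
# tensor([[ True,  True, False, False],
#         [ True,  True,  True,  True]])

# with time_dim=2 the mask is viewed as (batch, 1, time), so it broadcasts
# against (batch, feat_dim, time) tensors
mask3d = seq_lengths_to_mask(lengths, max_length=4, time_dim=2)  # shape (2, 1, 4)
```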
+ +    Args: +      y: integer tensor with class labels of shape (batch,). +      return_mask: If True, it returns upper triangular mask with zero diagonal. + +    Returns: +      Self-similarity binary matrix with shape=(batch, batch). +      Upper triangular mask. +    """ y_bin = y.unsqueeze(-1) - y.unsqueeze(0) + 1 y_bin[y_bin != 1] = 0 y_bin = y_bin.float() @@ -50,3 +104,5 @@ def get_selfsim_tarnon(y, return_mask=False): mask = torch.triu(torch.ones_like(y_bin, dtype=torch.bool), diagonal=1) return y_bin, mask + + diff --git a/hyperion/torch/utils/vad_utils.py b/hyperion/torch/utils/vad_utils.py new file mode 100644 index 00000000..4dc11ff7 --- /dev/null +++ b/hyperion/torch/utils/vad_utils.py @@ -0,0 +1,59 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba, Nanxin Chen) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import torch +import torch.nn as nn + +from .collation import collate_seqs_nd + + +def remove_silence(x, vad, x_lengths=None, time_dim=1, tol=0): + """Remove silence samples/frames. + + Args: + x: input signal/spectrogram of shape=(batch,...,time,...). + vad: binary voice activity detection mask of shape=(batch, time). + x_lengths: lengths of each sequence in x. + time_dim: which dimension in x is time. + tol: tolerance for the difference between time dimensions in x and vad. + + Returns: + x without silence samples/frames. + """ + + # we make x and vad time dimensions of the same size. + assert x.size(0) == vad.size(0), "batch-size is different for x and vad" + x_max_length = x.size(time_dim) + vad_max_length = vad.size(-1) + length_err = x_max_length - vad_max_length + assert abs(length_err) <= tol, ( + f"Difference between x_length({x_max_length}) and " + f"vad_length({vad_max_length}) > tol ({tol})" + ) + if length_err > 0: + vad = nn.functional.pad(vad, (0, length_err), mode="constant", value=0) + elif length_err < 0: + vad = vad[:, :x_max_length] + + # if x_lengths is passed, we make sure that vad is 0 for time steps larger + # than x_length + if x_lengths is not None: + for i in range(x.size(0)): + vad[i, x_lengths[i] :] = 0 + + trans = False + if time_dim != 1 and time_dim != 1 - x.dim(): + x = x.transpose(1, time_dim) + trans = True + + y = [] + for i in range(x.size(0)): + y.append(x[i, vad[i]]) + + y, y_lengths = collate_seqs_nd(y, pad_dim=0) + if trans: + y = y.transpose(1, time_dim).contiguous() + + return y, y_lengths diff --git a/hyperion/torch/wd_schedulers/__init__.py b/hyperion/torch/wd_schedulers/__init__.py new file mode 100644 index 00000000..d8440b12 --- /dev/null +++ b/hyperion/torch/wd_schedulers/__init__.py @@ -0,0 +1,9 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + + +from .cos_wd import CosineWD +from .factory import WDSchedulerFactory +from .wd_scheduler import WDScheduler diff --git a/hyperion/torch/wd_schedulers/cos_wd.py b/hyperion/torch/wd_schedulers/cos_wd.py new file mode 100644 index 00000000..563e4353 --- /dev/null +++ b/hyperion/torch/wd_schedulers/cos_wd.py @@ -0,0 +1,50 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + + +import logging +import math + +import torch + +from .wd_scheduler import WDScheduler + + +class CosineWD(WDScheduler): + r"""Sets the weight decay of each parameter group using a cosine schedule. + + Attributes: + optimizer: Pytorch optimizer object. + initial_wd: initial value of the weight decay.
diff --git a/hyperion/torch/wd_schedulers/cos_wd.py b/hyperion/torch/wd_schedulers/cos_wd.py
new file mode 100644
index 00000000..563e4353
--- /dev/null
+++ b/hyperion/torch/wd_schedulers/cos_wd.py
@@ -0,0 +1,50 @@
+"""
+ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+
+import logging
+import math
+
+import torch
+
+from .wd_scheduler import WDScheduler
+
+
+class CosineWD(WDScheduler):
+    r"""Sets the weight decay of each parameter group following a cosine
+    annealing schedule during warm-up; after warm-up, it stays at its final value.
+
+    Attributes:
+      optimizer: PyTorch optimizer object.
+      initial_wd: initial value of the weight decay.
+      warmup_steps: number of warm-up steps to get the weight decay to its final value.
+      epoch: initial training epoch, this is needed to restart the model
+        training.
+      step: initial training step, this is needed to restart the model training.
+      update_wd_on_opt_step: if True, updates the weight decay each time we update the model,
+        otherwise after each epoch.
+    """
+
+    def __init__(
+        self,
+        optimizer,
+        initial_wd=0,
+        warmup_steps=0,
+        epoch=0,
+        step=0,
+        update_wd_on_opt_step=False,
+    ):
+        super().__init__(
+            optimizer, initial_wd, warmup_steps, epoch, step, update_wd_on_opt_step
+        )
+
+    def get_wd(self, step):
+        if step >= self.warmup_steps:
+            return self.final_wds
+
+        r = math.pi / self.warmup_steps
+        return [
+            final_wd + (init_wd - final_wd) * (1 + math.cos(r * step)) / 2
+            for init_wd, final_wd in zip(self.initial_wds, self.final_wds)
+        ]
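A quick worked example of the interpolation in `get_wd`, as a standalone sketch with hypothetical settings (ramping the weight decay from 0.0 to 0.05 over 1000 steps, for a single parameter group):

```python
import math

initial_wd, final_wd, warmup_steps = 0.0, 0.05, 1000

def cos_wd(step):
    # same cosine interpolation as CosineWD.get_wd, for one param group
    if step >= warmup_steps:
        return final_wd
    r = math.pi / warmup_steps
    return final_wd + (initial_wd - final_wd) * (1 + math.cos(r * step)) / 2

print([round(cos_wd(s), 4) for s in (0, 250, 500, 750, 1000)])
# -> [0.0, 0.0073, 0.025, 0.0427, 0.05]
```

The schedule starts flat at `initial_wd`, passes through the midpoint of the two values at half the warm-up, and lands smoothly on `final_wd`.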
diff --git a/hyperion/torch/wd_schedulers/factory.py b/hyperion/torch/wd_schedulers/factory.py
new file mode 100644
index 00000000..dc72bd2c
--- /dev/null
+++ b/hyperion/torch/wd_schedulers/factory.py
@@ -0,0 +1,89 @@
+"""
+ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import torch
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+
+from ...utils.misc import filter_func_args
+from .cos_wd import CosineWD
+
+
+class WDSchedulerFactory:
+    @staticmethod
+    def create(
+        optimizer,
+        wdsch_type,
+        initial_wd=None,
+        warmup_steps=0,
+        update_wd_on_opt_step=False,
+    ):
+        """Creates a weight decay scheduler object.
+
+        Args:
+          optimizer: PyTorch optimizer object.
+          wdsch_type: type of scheduler in ["none", "cos_wd"].
+          initial_wd: initial value of the weight decay.
+          warmup_steps: steps until reaching the final weight decay.
+          update_wd_on_opt_step: if True, updates the wd each time we update the model,
+            otherwise after each epoch.
+        """
+
+        if wdsch_type == "none":
+            return None
+
+        if wdsch_type == "cos_wd":
+            if initial_wd is None:
+                # the base scheduler expects a number, default to 0
+                initial_wd = 0.0
+            return CosineWD(
+                optimizer,
+                initial_wd=initial_wd,
+                warmup_steps=warmup_steps,
+                update_wd_on_opt_step=update_wd_on_opt_step,
+            )
+
+        raise ValueError(f"invalid wdsch_type={wdsch_type}")
+
+    @staticmethod
+    def filter_args(**kwargs):
+        return filter_func_args(WDSchedulerFactory.create, kwargs)
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        parser.add_argument(
+            "--wdsch-type",
+            type=str.lower,
+            default="none",
+            choices=[
+                "none",
+                "cos_wd",
+            ],
+            help="weight decay schedulers: none or cosine annealing.",
+        )
+
+        parser.add_argument(
+            "--initial-wd",
+            default=None,
+            type=float,
+            help=(
+                "Initial value of the weight decay; it is expected to be "
+                "lower than the final value."
+            ),
+        )
+
+        parser.add_argument(
+            "--warmup-steps",
+            default=0,
+            type=int,
+            help="Number of steps to reach the final value of the weight decay.",
+        )
+
+        parser.add_argument(
+            "--update-wd-on-opt-step",
+            default=False,
+            action=ActionYesNo,
+            help="Update weight decay based on batch number instead of epoch number.",
+        )
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
diff --git a/hyperion/torch/wd_schedulers/wd_scheduler.py b/hyperion/torch/wd_schedulers/wd_scheduler.py
new file mode 100644
index 00000000..3a092c3d
--- /dev/null
+++ b/hyperion/torch/wd_schedulers/wd_scheduler.py
@@ -0,0 +1,124 @@
+"""
+ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import torch
+import torch.optim as optim
+
+
+class WDScheduler:
+    """Base class for weight decay schedulers.
+
+    Attributes:
+      optimizer: PyTorch optimizer object.
+      initial_wd: initial value of the weight decay.
+      warmup_steps: number of warm-up steps to get the weight decay to its final value.
+      epoch: initial training epoch, this is needed to restart the model
+        training.
+      step: initial training step, this is needed to restart the model training.
+      update_wd_on_opt_step: if True, updates the weight decay each time we update the model,
+        otherwise after each epoch.
+    """
+
+    def __init__(
+        self,
+        optimizer,
+        initial_wd=0,
+        warmup_steps=0,
+        epoch=0,
+        step=0,
+        update_wd_on_opt_step=False,
+    ):
+        if not isinstance(optimizer, optim.Optimizer):
+            raise TypeError(f"{type(optimizer).__name__} is not an Optimizer")
+        self.optimizer = optimizer
+
+        if epoch == 0:
+            for group in optimizer.param_groups:
+                group.setdefault("final_wd", group["weight_decay"])
+        else:
+            for i, group in enumerate(optimizer.param_groups):
+                if "final_wd" not in group:
+                    raise KeyError(
+                        "param 'final_wd' is not specified "
+                        "in param_groups[{}] when resuming an optimizer".format(i)
+                    )
+
+        self.final_wds = [group["final_wd"] for group in optimizer.param_groups]
+
+        if isinstance(initial_wd, (list, tuple)):
+            if len(initial_wd) != len(optimizer.param_groups):
+                raise ValueError(
+                    "expected {} initial_wds, got {}".format(
+                        len(optimizer.param_groups), len(initial_wd)
+                    )
+                )
+            self.initial_wds = list(initial_wd)
+        else:
+            max_wd = max([group["final_wd"] for group in optimizer.param_groups])
+            self.initial_wds = [
+                initial_wd * group["final_wd"] / max_wd
+                for group in optimizer.param_groups
+            ]
+
+        if epoch == 0:
+            for group, wd in zip(optimizer.param_groups, self.initial_wds):
+                group["weight_decay"] = wd
+
+        self.warmup_steps = warmup_steps
+        self.epoch = epoch
+        self.step = step
+        self.update_wd_on_opt_step = update_wd_on_opt_step
+
+    @property
+    def in_warmup(self):
+        return self.step < self.warmup_steps
+
+    def state_dict(self):
+        """Returns the state of the scheduler as a :class:`dict`.
+
+        It contains an entry for every variable in self.__dict__ which
+        is not the optimizer.
+        """
+        return {
+            key: value for key, value in self.__dict__.items() if key != "optimizer"
+        }
+
+    def load_state_dict(self, state_dict):
+        """Loads the scheduler state.
+
+        Args:
+          state_dict (dict): scheduler state. Should be an object returned
+            from a call to :meth:`state_dict`.
+ """ + self.__dict__.update(state_dict) + + def get_wd(self): + raise NotImplementedError + + def on_epoch_begin(self, epoch=None, **kwargs): + if epoch is not None: + self.epoch = epoch + + if self.update_wd_on_opt_step: + return + + for param_group, wd in zip( + self.optimizer.param_groups, self.get_wd(self.epoch) + ): + param_group["weight_decay"] = wd + + def on_epoch_end(self, metrics=None): + self.epoch += 1 + + def on_opt_step(self): + if self.update_wd_on_opt_step: + for param_group, wd in zip( + self.optimizer.param_groups, self.get_wd(self.step) + ): + param_group["weight_decay"] = wd + + self.step += 1 diff --git a/hyperion/transforms/__init__.py b/hyperion/transforms/__init__.py deleted file mode 100644 index 3f6c5f45..00000000 --- a/hyperion/transforms/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -from .cent_whiten import CentWhiten -from .lnorm import LNorm -from .sb_sw import SbSw -from .pca import PCA -from .lda import LDA -from .nda import NDA -from .nap import NAP -from .mvn import MVN -from .coral import CORAL -from .gaussianizer import Gaussianizer -from .skl_tsne import SklTSNE -from .transform_list import TransformList - -from .cent_whiten_up import CentWhitenUP -from .lnorm_up import LNormUP diff --git a/hyperion/transforms/cent_whiten.py b/hyperion/transforms/cent_whiten.py deleted file mode 100644 index 00a83cca..00000000 --- a/hyperion/transforms/cent_whiten.py +++ /dev/null @@ -1,119 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np -import h5py - -import scipy.linalg as la - -from ..hyp_model import HypModel -from ..pdfs import Normal - - -class CentWhiten(HypModel): - """Class to do centering and whitening of i-vectors.""" - - def __init__(self, mu=None, T=None, update_mu=True, update_T=True, **kwargs): - super().__init__(**kwargs) - self.mu = mu - self.T = T - self.update_mu = update_mu - self.update_T = update_T - - def predict(self, x): - if self.mu is not None: - x = x - self.mu - if self.T is not None: - if self.T.ndim == 1: - x = x * T - else: - x = np.dot(x, self.T) - return x - - def fit(self, x=None, sample_weight=None, mu=None, S=None): - - if x is not None: - if x.shape[0] > x.shape[1]: - gauss = Normal(x_dim=x.shape[1]) - gauss.fit(x=x, sample_weight=sample_weight) - mu = gauss.mu - S = gauss.Sigma - else: - mu = np.mean(x, axis=0) - S = np.eye(x.shape[1]) - - if self.update_mu: - self.mu = mu - - if self.update_T: - d, V = la.eigh(S) - V *= np.sqrt(1 / d) - V = np.fliplr(V) - - p = V[0, :] < 0 - V[:, p] *= -1 - - nonzero = d > 0 - if not np.all(nonzero): - V = V[:, nonzero[::-1]] - - self.T = V - - def get_config(self): - config = {"update_mu": self.update_mu, "update_t": self.update_T} - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - params = {"mu": self.mu, "T": self.T} - self._save_params_from_dict(f, params) - - @classmethod - def load_params(cls, f, config): - param_list = ["mu", "T"] - params = cls._load_params_to_dict(f, config["name"], param_list) - return cls(mu=params["mu"], T=params["T"], name=config["name"]) - - @classmethod - def load_mat(cls, file_path): - with h5py.File(file_path, "r") as f: - mu = np.asarray(f["mu"], dtype="float32") - T = np.asarray(f["T"], dtype="float32") - return cls(mu, T) - - def 
save_mat(self, file_path): - with h5py.File(file_path, "w") as f: - f.create_dataset("mu", data=self.mu) - f.create_dataset("T", data=self.T) - - @staticmethod - def filter_args(**kwargs): - valid_args = ("update_mu", "update_T", "name") - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - @staticmethod - def add_class_args(parser, prefix=None): - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." - - parser.add_argument( - p1 + "update-mu", - default=True, - type=bool, - help=("updates centering parameter"), - ) - - parser.add_argument( - p1 + "update-T", - default=True, - type=bool, - help=("updates whitening parameter"), - ) - - parser.add_argument(p1 + "name", default="lnorm") - - add_argparse_args = add_class_args diff --git a/hyperion/transforms/cent_whiten_up.py b/hyperion/transforms/cent_whiten_up.py deleted file mode 100644 index f22488f4..00000000 --- a/hyperion/transforms/cent_whiten_up.py +++ /dev/null @@ -1,33 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np -import h5py - -import scipy.linalg as la - -from ..hyp_model import HypModel -from ..pdfs import Normal -from .cent_whiten import CentWhiten - - -class CentWhitenUP(CentWhiten): - """Class to do centering and whitening with uncertainty propagation.""" - - def __init__(self, mu=None, T=None, update_mu=True, update_T=True, **kwargs): - super(CentWhitenUP, self).__init__(mu, T, update_mu, update_T, **kwargs) - - def predict(self, x): - x_dim = int(x.shape[-1] / 2) - m_x = x[:, :x_dim] - s2_x = x[:, x_dim:] - m_x = super(CentWhitenUP, self).predict(m_x) - for i in range(x.shape[0]): - s2_x[i] = np.diag(np.dot(self.T.T * s2_x[i], self.T)) - return np.hstack((m_x, s2_x)) - - def fit(self, x, sample_weight=None): - x = x[:, : int(x.shape[-1] / 2)] - super(CentWhitenUP, self).fit(x, sample_weight=sample_weight) diff --git a/hyperion/transforms/coral.py b/hyperion/transforms/coral.py deleted file mode 100644 index 0c9dea85..00000000 --- a/hyperion/transforms/coral.py +++ /dev/null @@ -1,108 +0,0 @@ -""" - Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np -import h5py - -import scipy.linalg as la - -from ..hyp_model import HypModel - - -class CORAL(HypModel): - """Class to do CORAL""" - - def __init__( - self, - mu=None, - T_col=None, - T_white=None, - update_mu=True, - update_T=True, - alpha_mu=1, - alpha_T=1, - **kwargs - ): - super(CORAL, self).__init__(**kwargs) - self.mu = mu - self.T_col = T_col - self.T_white = T_white - self.T = None - self.update_mu = update_mu - self.update_T = update_T - self.alpha_mu = alpha_mu - self.alpha_T = alpha_T - - def get_config(self): - config = { - "update_mu": self.update_mu, - "update_t": self.update_T, - "pca_dim": self.pca_dim, - } - base_config = super(CORAL, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def _compute_T(self): - if self.T_col is not None and self.T_white is not None: - self.T = np.dot(self.T_white, self.T_col) - - def predict(self, x): - if self.T is None: - self._compute_T() - if self.mu is not None: - x = x - self.mu - - if self.T is not None: - x = np.dot(x, self.T) - - return x - - def fit(self, x, sample_weight=None, x_out=None, sample_weight_out=None): - - if x_out is None: - assert self.T_white is not None - else: - mu_out = np.mean(x_out, axis=0) - if self.update_T: - delta = x_out 
- mu_out - S_out = np.dot(delta.T, delta) / x_out.shape[0] - # zero-phase component analysis (ZCA) - d, V = la.eigh(S_out) - self.T_white = np.dot(V * (1 / np.sqrt(d)), V.T) - - mu_in = np.mean(x, axis=0) - if self.update_T: - delta = x - mu_in - S_in = np.dot(delta.T, delta) / x.shape[0] - if self.alpha_T < 1: - S_in = self.alpha_T * S_in + (1 - self.alpha_T) * S_out - # zero-phase component analysis (ZCA) - d, V = la.eigh(S_in) - d[d < 0] = 0 - self.T_col = np.dot(V * np.sqrt(d), V.T) - - if self.update_mu: - self.mu = self.alpha_mu * (mu_out - mu_in) - - @classmethod - def load_params(cls, f, config): - param_list = ["mu", "T_col", "T_white"] - params = cls._load_params_to_dict(f, config["name"], param_list) - return cls( - mu=params["mu"], - T_col=params["T_col"], - T_white=params["T_white"], - name=config["name"], - ) - - def save_params(self, f): - params = { - "mu": self.mu, - "T_col": self.T_col, - "T_white": self.T_white, - "alpha_mu": self.alpha_mu, - "alpha_T": self.alpha_T, - } - self._save_params_from_dict(f, params) diff --git a/hyperion/transforms/gaussianizer.py b/hyperion/transforms/gaussianizer.py deleted file mode 100644 index ea512ade..00000000 --- a/hyperion/transforms/gaussianizer.py +++ /dev/null @@ -1,101 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import logging -import numpy as np -import h5py - -import scipy.linalg as la -from scipy.special import erfinv - -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel - - -class Gaussianizer(HypModel): - """Class to make i-vector distribution standard Normal.""" - - def __init__(self, max_vectors=None, r=None, **kwargs): - super(Gaussianizer, self).__init__(**kwargs) - self.max_vectors = max_vectors - self.r = r - - def predict(self, x): - px_cum = np.linspace(0, 1, self.r.shape[0] + 2)[1:-1] - y_map = erfinv(2 * px_cum - 1) * np.sqrt(2) - - r = self.r[1:] - y = np.zeros_like(x) - for i in range(x.shape[1]): - y_index = np.searchsorted(r[:, i], x[:, i]) - logging.debug(y_index) - y[:, i] = y_map[y_index] - - return y - - def fit(self, x): - - r = np.sort(x, axis=0, kind="heapsort") - - x = np.zeros((1, x.shape[-1]), dtype=float_cpu()) - - if r.shape[0] > self.max_vectors: - index = np.round( - np.linspace(0, r.shape[0] - 1, self.max_vectors, dtype=float) - ).astype(int) - r = r[index, :] - - self.r = np.vstack((x, r)) - - def get_config(self): - config = {"max_vectors": self.max_vectors} - - base_config = super(Gaussianizer, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - params = {"r": self.r} - self._save_params_from_dict(f, params) - - @classmethod - def load_params(cls, f, config): - param_list = ["r"] - params = cls._load_params_to_dict(f, config["name"], param_list) - return cls( - r=params["r"], max_vectors=config["max_vectors"], name=config["name"] - ) - - @classmethod - def load_mat(cls, file_path): - with h5py.File(file_path, "r") as f: - r = np.asarray(f["r"], dtype="float32") - return cls(r=r) - - def save_mat(self, file_path): - with h5py.File(file_path, "w") as f: - f.create_dataset("r", data=self.r) - - @staticmethod - def filter_args(**kwargs): - valid_args = ("max_vectors", "name") - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - @staticmethod - def add_class_args(parser, prefix=None): - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
- - parser.add_argument( - p1 + "max-vectors", - default=None, - type=int, - help=("maximum number of background vectors"), - ) - - parser.add_argument(p1 + "name", default="gauss") - - add_arparse_args = add_class_args diff --git a/hyperion/transforms/lda.py b/hyperion/transforms/lda.py deleted file mode 100644 index 142ed2bd..00000000 --- a/hyperion/transforms/lda.py +++ /dev/null @@ -1,106 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np -import h5py - -import scipy.linalg as la - -from ..hyp_model import HypModel -from .sb_sw import SbSw - - -class LDA(HypModel): - """Class to do linear discriminant analysis.""" - - def __init__( - self, mu=None, T=None, lda_dim=None, update_mu=True, update_T=True, **kwargs - ): - super(LDA, self).__init__(**kwargs) - self.mu = mu - self.T = T - if T is None: - self.lda_dim = lda_dim - else: - self.lda_dim = T.shape[1] - self.update_mu = update_mu - self.update_T = update_T - - def predict(self, x): - if self.mu is not None: - x = x - self.mu - return np.dot(x, self.T) - - def fit(self, x, y, mu=None, Sb=None, Sw=None): - - if mu is None or Sb is None or Sw is None: - sbsw = SbSw() - sbsw.fit(x, y) - mu = sbsw.mu - Sb = sbsw.Sb - Sw = sbsw.Sw - - if self.update_mu: - self.mu = mu - - if not self.update_T: - return - - assert Sb.shape == Sw.shape - - try: - d, V = la.eigh(Sb, Sw) - except: - alpha = 1e-2 * np.max(np.diag(Sw)) - d, V = la.eigh(Sb, alpha * np.eye(Sw.shape[0]) + Sw) - V = np.fliplr(V) - - p = V[0, :] < 0 - V[:, p] *= -1 - - if self.lda_dim is not None: - assert self.lda_dim <= V.shape[1] - V = V[:, : self.lda_dim] - - self.T = V - - def get_config(self): - config = { - "lda_dim": self.lda_dim, - "update_mu": self.update_mu, - "update_t": self.update_T, - } - base_config = super(LDA, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - params = {"mu": self.mu, "T": self.T} - self._save_params_from_dict(f, params) - - @classmethod - def load_params(cls, f, config): - param_list = ["mu", "T"] - params = cls._load_params_to_dict(f, config["name"], param_list) - return cls(mu=params["mu"], T=params["T"], name=config["name"]) - - # @classmethod - # def load(cls, file_path): - # with h5py.File(file_path, 'r') as f: - # config = self.load_config_from_json(f['config']) - # param_list = ['mu', 'T'] - # params = self._load_params_to_dict(f, config['name'], param_list) - # return cls(mu=params['mu'], T=params['T'], name=config['name']) - - @classmethod - def load_mat(cls, file_path): - with h5py.File(file_path, "r") as f: - mu = np.asarray(f["mu"], dtype="float32") - T = np.asarray(f["T"], dtype="float32") - return cls(mu, T) - - def save_mat(self, file_path): - with h5py.File(file_path, "w") as f: - f.create_dataset("mu", data=self.mu) - f.create_dataset("T", data=self.T) diff --git a/hyperion/transforms/lnorm.py b/hyperion/transforms/lnorm.py deleted file mode 100644 index 088748b2..00000000 --- a/hyperion/transforms/lnorm.py +++ /dev/null @@ -1,17 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import numpy as np -import h5py - -from .cent_whiten import CentWhiten - - -class LNorm(CentWhiten): - """Class to do length normalization.""" - - def predict(self, x): - x = super().predict(x) - mx = np.sqrt(np.sum(x ** 2, axis=1, keepdims=True)) + 1e-10 - return np.sqrt(x.shape[1]) * x / 
mx diff --git a/hyperion/transforms/lnorm_up.py b/hyperion/transforms/lnorm_up.py deleted file mode 100644 index ab7b1ec9..00000000 --- a/hyperion/transforms/lnorm_up.py +++ /dev/null @@ -1,26 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - - -import numpy as np -import h5py - -from .cent_whiten_up import CentWhitenUP - - -class LNormUP(CentWhitenUP): - """Class to do Lenght Normalization with uncertainty propagation""" - - def predict(self, x): - x = super(LNormUP, self).predict(x) - x_dim = int(x.shape[-1] / 2) - m_x = x[:, :x_dim] - s2_x = x[:, x_dim:] - - mx2 = np.sum(m_x ** 2, axis=1, keepdims=True) + 1e-10 - m_x /= np.sqrt(mx2) - s2_x /= mx2 - - return np.hstack((m_x, s2_x)) diff --git a/hyperion/transforms/mvn.py b/hyperion/transforms/mvn.py deleted file mode 100644 index a3b77582..00000000 --- a/hyperion/transforms/mvn.py +++ /dev/null @@ -1,41 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np -import h5py - -import scipy.linalg as la - -from ..hyp_model import HypModel - - -class MVN(HypModel): - """Class to do global mean and variance normalization.""" - - def __init__(self, mu=None, s=None, **kwargs): - super(MVN, self).__init__(**kwargs) - self.mu = mu - self.s = s - - def predict(self, x): - if self.mu is not None: - x = x - self.mu - if self.s is not None: - x = x / self.s - return x - - def fit(self, x): - self.mu = np.mean(x, axis=0) - self.s = np.std(x, axis=0) - - def save_params(self, f): - params = {"mu": self.mu, "s": self.s} - self._save_params_from_dict(f, params) - - @classmethod - def load_params(cls, f, config): - param_list = ["mu", "s"] - params = cls._load_params_to_dict(f, config["name"], param_list) - return cls(mu=params["mu"], s=params["s"], name=config["name"]) diff --git a/hyperion/transforms/nap.py b/hyperion/transforms/nap.py deleted file mode 100644 index 6917c6b4..00000000 --- a/hyperion/transforms/nap.py +++ /dev/null @@ -1,64 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np -import h5py - -import scipy.linalg as la - -from ..hyp_model import HypModel - - -class NAP(HypModel): - """Class to do nussance attribute projection.""" - - def __init__(self, U=None, **kwargs): - super(NAP, self).__init__(**kwargs) - self.U = U - - def predict(self, x): - return x - np.dot(np.dot(x, self.U.T), self.U) - - def fit(self, x, U_dim, class_ids): - x_hat = np.zeros_like(x) - u_ids = np.uniqe(class_ids) - M = np.sqrt(len(u_ids)) - for i in u_ids: - idx = np.nonzero(i == class_ids) - N = np.sqrt(len(idx)) - mu_i = np.mean(x[idx, :], axis=0) - xx[idx, :] = (x[idx, :] - mu_i) / N - xx /= M - _, s, Vt = np.svd(xx, full_matrices=False, overwrite_a=True) - idx = (np.argsort(s)[::-1])[:U_dim] - self.U = Vt[idx, :] - - def save_params(self, f): - params = {"U": self.U} - self._save_params_from_dict(f, params) - - @classmethod - def load_params(cls, f, config): - param_list = ["U"] - params = cls._load_params_to_dict(f, config["name"], param_list) - return cls(U=params["U"], name=config["name"]) - - # @classmethod - # def load(cls, file_path): - # with h5py.File(file_path, 'r') as f: - # config = self.load_config_from_json(f['config']) - # param_list = ['U'] - # params = self._load_params_to_dict(f, config['name'], param_list) - # return 
cls(U=params['U'], name=config['name']) - - @classmethod - def load_mat(cls, file_path): - with h5py.File(file_path, "r") as f: - U = np.asarray(f["U"], dtype="float32") - return cls(U) - - def save_mat(self, file_path): - with h5py.File(file_path, "w") as f: - f.create_dataset("U", data=self.U) diff --git a/hyperion/transforms/nda.py b/hyperion/transforms/nda.py deleted file mode 100644 index 4f9772fc..00000000 --- a/hyperion/transforms/nda.py +++ /dev/null @@ -1,65 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np -import h5py - -import scipy.linalg as la - -from ..hyp_model import HypModel -from ..hyp_defs import float_cpu - - -class NDA(HypModel): - """Class to do nearest-neighbors discriminant analysis""" - - def __init__(self, mu=None, T=None, **kwargs): - super().__init__(**kwargs) - self.mu = mu - self.T = T - - def predict(self, x): - if self.mu is not None: - x = x - self.mu - return np.dot(x, self.T) - - def fit(self, mu, Sb, Sw, nda_dim=None): - self.mu = mu - - assert Sb.shape == Sw.shape - - d, V = la.eigh(Sb, Sw) - V = np.fliplr(V) - - p = V[0, :] < 0 - V[:, p] *= -1 - - if nda_dim is not None: - assert nda_dim <= V.shape[1] - V = V[:, :nda_dim] - - self.T = V - - def save_params(self, f): - params = {"mu": self.mu, "T": self.T} - self._save_params_from_dict(f, params) - - @classmethod - def load_params(cls, f, config): - param_list = ["mu", "T"] - params = cls._load_params_to_dict(f, config["name"], param_list) - return cls(mu=params["mu"], T=params["T"], name=config["name"]) - - @classmethod - def load_mat(cls, file_path): - with h5py.File(file_path, "r") as f: - mu = np.asarray(f["mu"], dtype="float32") - T = np.asarray(f["T"], dtype="float32") - return cls(mu, T) - - def save_mat(self, file_path): - with h5py.File(file_path, "w") as f: - f.create_dataset("mu", data=self.mu) - f.create_dataset("T", data=self.T) diff --git a/hyperion/transforms/pca.py b/hyperion/transforms/pca.py deleted file mode 100644 index cd8d6973..00000000 --- a/hyperion/transforms/pca.py +++ /dev/null @@ -1,181 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import numpy as np -import h5py - -from numpy.linalg import matrix_rank -import scipy.linalg as la - -from ..hyp_model import HypModel - - -class PCA(HypModel): - """Class to do principal component analysis""" - - def __init__( - self, - mu=None, - T=None, - update_mu=True, - update_T=True, - pca_dim=None, - pca_var_r=None, - pca_min_dim=2, - whiten=False, - **kwargs - ): - super().__init__(**kwargs) - self.mu = mu - self.T = T - self.update_mu = update_mu - self.update_T = update_T - self.pca_dim = pca_dim - self.pca_var_r = pca_var_r - self.pca_min_dim = pca_min_dim - self.whiten = whiten - - def predict(self, x): - if self.mu is not None: - x = x - self.mu - return np.dot(x, self.T) - - @staticmethod - def get_pca_dim_for_var_ratio(x, var_r=1, min_dim=2): - if var_r == 1: - rank = matrix_rank(x) - if rank <= min_dim: - # it may have failed, let's try the cov - rank = matrix_rank(np.dot(x.T, x)) - else: - sv = la.svd(x, compute_uv=False) - Ecc = np.cumsum(sv ** 2) - Ecc = Ecc / Ecc[-1] - rank = np.where(Ecc > var_r)[0][0] - - rank = max(min_dim, rank) - return rank - - def fit(self, x=None, sample_weight=None, mu=None, S=None): - - if x is not None: - mu = np.mean(x, axis=0) - delta = x - mu - S = np.dot(delta.T, delta) / x.shape[0] - - 
if self.update_mu: - self.mu = mu - - if self.update_T: - d, V = la.eigh(S) - d = np.flip(d) - V = np.fliplr(V) - - # This makes the Transform unique - p = V[0, :] < 0 - V[:, p] *= -1 - - if self.pca_var_r is not None: - var_acc = np.cumsum(d) - var_r = var_acc / var_acc[-1] - self.pca_dim = max( - np.where(var_r > self.pca_var_r)[0][0], self.pca_min_dim - ) - - if self.whiten: - # the projected features will be whitened - # do not whithen dimension with eigenvalue eq. to 0. - is_zero = d <= 0 - if np.any(is_zero): - max_dim = np.where(is_zero)[0][0] - V = V[:, :max_dim] * 1 / np.sqrt(d[:max_dim]) - if self.pca_dim is None: - self.pca_dim = max_dim - else: - self.pca_dim = min(max_dim, self.pca_dim) - else: - V = V * 1 / np.sqrt(d) - - if self.pca_dim is not None: - assert self.pca_dim <= V.shape[1] - V = V[:, : self.pca_dim] - - self.T = V - - def get_config(self): - config = { - "update_mu": self.update_mu, - "update_t": self.update_T, - "pca_dim": self.pca_dim, - "pca_var_r": self.pca_var_r, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - params = {"mu": self.mu, "T": self.T} - self._save_params_from_dict(f, params) - - @classmethod - def load_params(cls, f, config): - param_list = ["mu", "T"] - params = cls._load_params_to_dict(f, config["name"], param_list) - return cls( - mu=params["mu"], - T=params["T"], - pca_dim=config["pca_dim"], - name=config["name"], - ) - - @classmethod - def load_mat(cls, file_path): - with h5py.File(file_path, "r") as f: - mu = np.asarray(f["mu"], dtype="float32") - T = np.asarray(f["T"], dtype="float32") - return cls(mu, T) - - def save_mat(self, file_path): - with h5py.File(file_path, "w") as f: - f.create_dataset("mu", data=self.mu) - f.create_dataset("T", data=self.T) - - @staticmethod - def filter_args(**kwargs): - valid_args = ("update_mu", "update_T", "name", "pca_dim", "pca_var_r") - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - @staticmethod - def add_class_args(parser, prefix=None): - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
- - parser.add_argument( - p1 + "update-mu", - default=True, - type=bool, - help=("updates centering parameter"), - ) - parser.add_argument( - p1 + "update-T", - default=True, - type=bool, - help=("updates whitening parameter"), - ) - - parser.add_argument( - p1 + "pca-dim", default=None, type=int, help=("output dimension of PCA") - ) - - parser.add_argument( - p1 + "pca-var-r", - default=None, - type=int, - help=("proportion of variance to keep when choosing the PCA dimension"), - ) - - parser.add_argument("--name", dest="name", default="pca") - - add_argparse_args = add_class_args diff --git a/hyperion/transforms/transform_list.py b/hyperion/transforms/transform_list.py deleted file mode 100644 index 3e89966a..00000000 --- a/hyperion/transforms/transform_list.py +++ /dev/null @@ -1,73 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import logging - -import numpy as np -import h5py - -from ..hyp_model import HypModel - -from .cent_whiten import CentWhiten -from .cent_whiten_up import CentWhitenUP -from .lnorm import LNorm -from .lnorm_up import LNormUP -from .pca import PCA -from .lda import LDA -from .nda import NDA -from .nap import NAP -from .mvn import MVN -from .gaussianizer import Gaussianizer - - -class TransformList(HypModel): - """Class to perform a list of transformations""" - - def __init__(self, transforms, **kwargs): - super(TransformList, self).__init__(**kwargs) - if not isinstance(transforms, list): - transforms = [transforms] - self.transforms = transforms - if transforms is not None: - self.update_names() - - def append(self, t): - self.transforms.append(t) - if self.name is not None: - t.name = self.name + "/" + t.name - - def predict(self, x): - for t in self.transforms: - x = t.predict(x) - return x - - def update_names(self): - if self.name is not None: - for t in self.transforms: - t.name = self.name + "/" + t.name - - def get_config(self): - config = super(TransformList, self).get_config() - config_t = {} - for i in range(len(self.transforms)): - config_t[i] = self.transforms[i].get_config() - config["transforms"] = config_t - return config - - def save_params(self, f): - for t in self.transforms: - t.save_params(f) - - @classmethod - def load_params(cls, f, config): - config_ts = config["transforms"] - transforms = [] - for i in range(len(config_ts)): - config_t = config_ts[str(i)] - logging.debug(config_t) - class_t = globals()[config_t["class_name"]] - t = class_t.load_params(f, config_t) - transforms.append(t) - return cls(transforms, name=config["name"]) diff --git a/hyperion/utils/__init__.py b/hyperion/utils/__init__.py index bfd81028..9bc51181 100644 --- a/hyperion/utils/__init__.py +++ b/hyperion/utils/__init__.py @@ -3,14 +3,24 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .trial_ndx import TrialNdx -from .trial_key import TrialKey -from .trial_scores import TrialScores +from .class_info import ClassInfo +from .enrollment_map import EnrollmentMap +from .feature_set import FeatureSet +from .hyp_dataclass import HypDataClass +from .hyp_dataset import HypDataset +from .info_table import InfoTable +from .kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix +from .misc import PathLike +from .recording_set import RecordingSet +from .rttm import RTTM +from .scp_list import SCPList + +# from .ext_segment_list import ExtSegmentList +from .segment_list import SegmentList +from .segment_set import SegmentSet from .sparse_trial_key import 
SparseTrialKey
 from .sparse_trial_scores import SparseTrialScores
-from .scp_list import SCPList
+from .trial_key import TrialKey
+from .trial_ndx import TrialNdx
+from .trial_scores import TrialScores
 from .utt2info import Utt2Info
-from .ext_segment_list import ExtSegmentList
-from .segment_list import SegmentList
-from .kaldi_matrix import KaldiMatrix, KaldiCompressedMatrix
-from .rttm import RTTM
diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py
new file mode 100644
index 00000000..b3a08178
--- /dev/null
+++ b/hyperion/utils/class_info.py
@@ -0,0 +1,126 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import logging
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+from .info_table import InfoTable
+
+
+class ClassInfo(InfoTable):
+    def __init__(self, df):
+        super().__init__(df)
+        if "class_idx" not in self.df:
+            self.add_class_idx()
+
+        if "weights" not in self.df:
+            self.set_uniform_weights()
+        else:
+            self.df["weights"] /= self.df["weights"].sum()
+
+    def add_class_idx(self):
+        self.sort()
+        self.df["class_idx"] = np.arange(len(self.df))
+
+    def set_uniform_weights(self):
+        self.df["weights"] = 1 / len(self.df)
+
+    def set_weights(self, weights):
+        self.df["weights"] = weights / weights.sum()
+
+    def renorm_weights(self):
+        weights = self.df["weights"]
+        self.df["weights"] = weights / weights.sum()
+
+    def exp_weights(self, x):
+        weights = self.df["weights"] ** x
+        self.set_weights(weights)
+
+    def set_zero_weight(self, ids):
+        self.df.loc[ids, "weights"] = 0
+        self.df["weights"] /= self.df["weights"].sum()
+
+    def weights(self, ids):
+        # takes an argument, so it cannot be a property
+        return self.df.loc[ids, "weights"]
+
+    @property
+    def num_classes(self):
+        return self.df["class_idx"].values.max() + 1
+
+    def sort_by_idx(self, ascending=True):
+        self.sort("class_idx", ascending)
+
+    @classmethod
+    def load(cls, file_path, sep=None):
+        """Loads ClassInfo from text file.
+
+        Args:
+          file_path: File to read the list.
+          sep: Separator between the key and file_path in the text file.
+
+        Returns:
+          ClassInfo object.
+        """
+        file_path = Path(file_path)
+        ext = file_path.suffix
+        if ext == "":
+            # if no extension we load as kaldi utt2spk file
+            df = pd.read_csv(
+                file_path,
+                sep=" ",
+                header=None,
+                names=["id"],
+                dtype={"id": str},
+            )
+            return cls(df)
+
+        return super().load(file_path, sep)
+
+    @classmethod
+    def cat(cls, tables):
+        """Concatenates several tables.
+
+        Args:
+          tables: list of ClassInfo objects.
+
+        Returns:
+          ClassInfo object concatenating the tables.
+        """
+        df_list = [table.df for table in tables]
+        df = pd.concat(df_list)
+        if not df["id"].is_unique:
+            logging.warning(
+                """there are duplicated ids in the original tables,
+                removing duplicated rows"""
+            )
+            df.drop_duplicates(subset="id", keep="first", inplace=True)
+
+        if not df["class_idx"].is_unique:
+            logging.warning(
+                """class_idx in concatenated tables are not unique,
+                we will assign new class_idx"""
+            )
+            df.drop(columns=["class_idx"], inplace=True)
+        return cls(df)
+
+    def filter(
+        self,
+        predicate=None,
+        items=None,
+        iindex=None,
+        columns=None,
+        by="id",
+        keep=True,
+        rebuild_idx=False,
+    ):
+        new_class_info = super().filter(predicate, items, iindex, columns, by, keep)
+        if rebuild_idx:
+            new_class_info.add_class_idx()
+
+        return new_class_info
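A small sketch of how `ClassInfo` behaves (the speaker ids and weights are invented, and it assumes `InfoTable.sort` defaults to sorting by id):

```python
import pandas as pd

from hyperion.utils import ClassInfo

# three speaker classes with unnormalized sampling weights
df = pd.DataFrame({"id": ["spk1", "spk2", "spk3"], "weights": [2.0, 1.0, 1.0]})
info = ClassInfo(df)  # adds class_idx and renormalizes weights

print(info.num_classes)          # 3
print(info.df["weights"].sum())  # ~1.0, weights are renormalized on creation
info.exp_weights(0.5)            # flatten the sampling distribution
```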
+ """ + df_list = [table.df for table in tables] + df = pd.concat(df_list) + if not df["id"].is_unique: + logging.warning( + """there are duplicated ids in original tables, + removing duplicated rows""" + ) + df.drop_duplicates(subset="id", keep="first", inplace=True) + + if not df["class_idx"].is_unique: + logging.warning( + """class_idx in concat tables are not unique, + we will assign new class_idx""" + ) + df.drop(columns=["class_idx"], inplace=True) + return cls(df) + + def filter( + self, + predicate=None, + items=None, + iindex=None, + columns=None, + by="id", + keep=True, + rebuild_idx=False, + ): + new_class_info = super().filter(predicate, items, iindex, columns, by, keep) + if rebuild_idx: + new_class_info.add_class_idx() + + return new_class_info diff --git a/hyperion/utils/enrollment_map.py b/hyperion/utils/enrollment_map.py new file mode 100644 index 00000000..4af69144 --- /dev/null +++ b/hyperion/utils/enrollment_map.py @@ -0,0 +1,101 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import re +from collections import OrderedDict +from copy import deepcopy +from pathlib import Path + +import numpy as np +import pandas as pd + +from .list_utils import split_list, split_list_group_by_key +from .info_table import InfoTable + + +class EnrollmentMap(InfoTable): + """Class to store the mapping between enrollment id + and segmentids + """ + + def __init__(self, df): + if "modelid" in df: + df.rename(columns={"modelid": "id"}, inplace=True) + assert "segmentid" in df + super().__init__(df) + + def split(self, idx, num_parts): + """Splits the mapping into num_parts and return part idx. + + Args: + idx: Part to return from 1 to num_parts. + num_parts: Number of parts to split the list. + group_by: All the lines with the same value in column + groub_by_field go to the same part + + Returns: + Sub InfoTable object + """ + _, idx1 = split_list_group_by_key(self.df["id"], idx, num_parts) + + df = self.df.iloc[idx1] + return EnrollmentMap(df) + + def save(self, file_path, sep=None, nist_compatible=True): + if nist_compatible: + # For compatibility with NIST SRE files the index column "id" + # is saved as modelid + self.df.rename(columns={"id": "modelid"}, inplace=True) + + super().save(file_path, sep) + if nist_compatible: + self.df.rename(columns={"modelid": "id"}, inplace=True) + + @classmethod + def load(cls, file_path, sep=None): + """Loads EnrollmentMap from file. + + Args: + file_path: File to read the list. + sep: Separator between the key and file_path in the text file. + dtype: Dictionary with the dtypes of each column. + name: name for the data to be loaded + Returns: + EnrollmentMap object + """ + file_path = Path(file_path) + ext = file_path.suffix + if ext in ["", ".scp"]: + # if no extension we load as kaldi utt2spk file + df = pd.read_csv( + file_path, + sep=" ", + header=None, + names=["segmentid", "modelid"], + dtype={"segmentid": np.str, "modelid": np.str}, + ) + df = df[["modelid", "segmentid"]] + else: + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + + return cls(df) + + @classmethod + def cat(cls, tables): + """Concatenates several tables. + + Args: + info_lists: List of InfoTables + + Returns: + InfoTable object concatenation the info_lists. 
+ """ + df_list = [table.df for table in tables] + df = pd.concat(df_list) + return cls(df) diff --git a/hyperion/utils/ext_segment_list.py b/hyperion/utils/ext_segment_list.py index 38a4a1b4..132cf7ff 100644 --- a/hyperion/utils/ext_segment_list.py +++ b/hyperion/utils/ext_segment_list.py @@ -3,10 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os.path as path import logging -from copy import deepcopy +import os.path as path from collections import OrderedDict +from copy import deepcopy import numpy as np import pandas as pd @@ -78,11 +78,11 @@ def create( ) if series_id is None: - u_file_id = self.segments["file_id"].unique() + u_file_id = segments.segments["file_id"].unique() files = pd.DataFrame({"file_id": u_file_id, "series_id": u_file_id}) else: - file_id = [f for f in v for k, v in series_id.items()] - series_id = [k for f in v for k, v in series_id.items()] + file_id = [f for k, v in series_id.items() for f in v] + series_id = [k for k, v in series_id.items() for f in v] files = pd.DataFrame({"file_id": file_id, "series_id": series_id}) if isinstance(name, str): @@ -128,8 +128,8 @@ def create_from_segment_list( u_file_id = segments["file_id"].unique() files = pd.DataFrame({"file_id": u_file_id, "series_id": u_file_id}) else: - file_id = [f for f in v for k, v in series_id.items()] - series_id = [k for f in v for k, v in series_id.items()] + file_id = [f for k, v in series_id.items() for f in v] + series_id = [k for k, v in series_id.items() for f in v] files = pd.DataFrame({"file_id": file_id, "series_id": series_id}) return cls(segments, ext_segments, files, index_column) diff --git a/hyperion/utils/feature_set.py b/hyperion/utils/feature_set.py new file mode 100644 index 00000000..7e40dfd6 --- /dev/null +++ b/hyperion/utils/feature_set.py @@ -0,0 +1,80 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from pathlib import Path + +import numpy as np +import pandas as pd + +from .info_table import InfoTable +from .misc import PathLike + + +class FeatureSet(InfoTable): + def __init__(self, df): + super().__init__(df) + assert "storage_path" in df + + def add_prefix_to_storage_path(self, prefix: PathLike): + self.df["storge_path"] = self.df["storage_path"].apply(lambda x: f"{prefix}{x}") + + def save(self, file_path, sep=None): + """Saves info table to file + + Args: + file_path: File to write the list. + sep: Separator between the key and file_path in the text file. + """ + file_path = Path(file_path) + file_path.parent.mkdir(parents=True, exist_ok=True) + ext = file_path.suffix + if ext == ".scp": + # if no extension we save as kaldi feats.scp file + from .scp_list import SCPList + + offset = self.df["storage_byte"] if "storage_byte" in self.df else None + range_spec = None + if "start" and "num_frames" in self.df: + range_spec = [ + np.array([s, n], dtype=np.int64) + for s, n in self.df[["start", "num_frames"]] + ] + scp = SCPList( + self.df["id"].values, self.df["storage_path"].values, offset, range_spec + ) + scp.save(file_path) + return + + super().save(file_path, sep) + + @classmethod + def load(cls, file_path, sep=None): + """Loads utt2info list from text file. + + Args: + file_path: File to read the list. + sep: Separator between the key and file_path in the text file. 
diff --git a/hyperion/utils/fold_list.py b/hyperion/utils/fold_list.py
index d5731f10..80b818d6 100644
--- a/hyperion/utils/fold_list.py
+++ b/hyperion/utils/fold_list.py
@@ -5,8 +5,8 @@
 Class to make/read/write k-fold x-validation lists
 """
-import os.path as path
 import logging
+import os.path as path
 from collections import OrderedDict
 from copy import deepcopy
@@ -176,7 +176,7 @@ def create(
           FoldList object.
         """
         if shuffle:
-            rng = np.random.RandomState(seed=seed)
+            rng = np.random.default_rng(seed=seed)
 
         if group_by_key is None:
             group_by_key = segment_key
diff --git a/hyperion/utils/hyp_dataclass.py b/hyperion/utils/hyp_dataclass.py
new file mode 100644
index 00000000..f1e86d2c
--- /dev/null
+++ b/hyperion/utils/hyp_dataclass.py
@@ -0,0 +1,31 @@
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+from dataclasses import dataclass
+
+
+@dataclass
+class HypDataClass:
+    """Dataclass that can imitate a dict."""
+
+    def __getitem__(self, key):
+        return getattr(self, key)
+
+    def __setitem__(self, key, val):
+        return setattr(self, key, val)
+
+    def keys(self):
+        return self.__dict__.keys()
+        # return self.__annotations__.keys()
+
+    def items(self):
+        return self.__dict__.items()
+        # for k in self.keys():
+        #     yield k, getattr(self, k)
+
+    @classmethod
+    def from_parent(cls, parent, **kwargs):
+        args = parent.__dict__
+        args.update(kwargs)
+        return cls(**args)
diff --git a/hyperion/utils/hyp_dataset.py b/hyperion/utils/hyp_dataset.py
new file mode 100644
index 00000000..dda4231e
--- /dev/null
+++ b/hyperion/utils/hyp_dataset.py
@@ -0,0 +1,1502 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import logging
+import math
+from copy import deepcopy
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+import lhotse
+import numpy as np
+import pandas as pd
+import yaml
+
+from .class_info import ClassInfo
+from .enrollment_map import EnrollmentMap
+from .feature_set import FeatureSet
+from .info_table import InfoTable
+from .misc import PathLike
+from .recording_set import RecordingSet
+from .segment_set import SegmentSet
+from .sparse_trial_key import SparseTrialKey
+from .trial_key import TrialKey
+from .trial_ndx import TrialNdx
+
+
+class HypDataset:
+    """Class that contains all the objects
+    (segments, recordings, features, class_infos) that
+    make up a dataset.
+
+    Attributes:
+      segments: SegmentSet object or path to it.
+      classes: Dictionary of ClassInfo objects or paths to them.
+      recordings: RecordingSet object or path to it.
+      features: Dictionary of FeatureSet objects or paths to them.
+      enrollments: Dictionary of EnrollmentMap objects or paths to them.
+      trials: Dictionary of TrialKey/TrialNdx/SparseTrialKey objects
+        or paths to them.
+      sparse_trials: load trial keys using the SparseTrialKey class instead
+        of TrialKey class.
+ table_sep: Column separator when reading/writting tables + + """ + + def __init__( + self, + segments: Union[SegmentSet, PathLike], + classes: Optional[Dict[str, Union[ClassInfo, PathLike]]] = None, + recordings: Optional[Union[RecordingSet, PathLike]] = None, + features: Optional[Dict[str, Union[FeatureSet, PathLike]]] = None, + enrollments: Optional[Dict[str, Union[EnrollmentMap, PathLike]]] = None, + trials: Optional[ + Dict[str, Union[TrialKey, TrialNdx, SparseTrialKey, PathLike]] + ] = None, + sparse_trials: bool = False, + table_sep: Optional[str] = None, + ): + if isinstance(segments, SegmentSet): + self._segments = segments + self._segments_path = None + else: + assert isinstance(segments, (str, Path)) + self._segments = None + self._segments_path = Path(segments) + + self._classes, self._classes_paths = self._parse_dict_args(classes, ClassInfo) + if recordings is not None: + if isinstance(recordings, RecordingSet): + self._recordings = recordings + self._recordings_path = None + else: + assert isinstance(recordings, (str, Path)) + self._recordings = None + self._recordings_path = Path(recordings) + + # self._recordings, self._recordings_paths = self._parse_dict_args( + # recordings, RecordingSet + # ) + + self._features, self._features_paths = self._parse_dict_args( + features, FeatureSet + ) + self._enrollments, self._enrollments_paths = self._parse_dict_args( + enrollments, + EnrollmentMap, + ) + self._trials, self._trials_paths = self._parse_dict_args( + trials, + (TrialKey, TrialNdx, SparseTrialKey), + ) + + self.sparse_trials = sparse_trials + self.table_sep = table_sep + self._files_to_delete = [] + self.fix_segments_dtypes() + + def fix_segments_dtypes(self): + if self._segments is not None: + self._fix_segments_dtypes(self._segments) + + def _fix_segments_dtypes(self, segments): + # ids in class_infos should be strings in segment set columns + for k in self.classes_keys(): + segments.convert_col_to_str(k) + + def get_dataset_files(self): + file_paths = [] + for file_path in [self._segments_path, self._recordings_path]: + if file_path is not None: + file_paths.append(file_path) + + for path_dict in [ + self._features_paths, + self._enrollments_paths, + self._trials_paths, + ]: + if path_dict is None: + continue + for k, v in path_dict.items(): + file_paths.append(v) + + return file_paths + + def _delete_files(self, dataset_dir): + if not self._files_to_delete: + return + + dataset_files = self.get_dataset_files() + for file_path in self._files_to_delete: + file_path = Path(file_path) + # if the file has been added again we don't delete + if file_path in dataset_files: + continue + + # if we are saving the dataset to another location + # we don't delete the one in the original + if file_path.parent == dataset_dir and file_path.is_file(): + file_path.unlink() + + def _parse_dict_args(self, data, types): + if data is None: + return None, None + + assert isinstance(data, dict) + objects = {k: (v if isinstance(v, types) else None) for k, v in data.items()} + paths = { + k: (v if isinstance(v, (str, Path)) else None) for k, v in data.items() + } + + return objects, paths + + def clone(self): + return deepcopy(self) + + def segments(self, keep_loaded: bool = True): + if self._segments is None: + assert self._segments_path is not None + segments = SegmentSet.load(self._segments_path, sep=self.table_sep) + self._fix_segments_dtypes(segments) + if keep_loaded: + self._segments = segments + return segments + + return self._segments + + def __len__(self): + return 
len(self.segments())
+
+    def recordings(self, keep_loaded: bool = True):
+        if self._recordings is None:
+            assert self._recordings_path is not None
+            recordings = RecordingSet.load(self._recordings_path, sep=self.table_sep)
+            if keep_loaded:
+                self._recordings = recordings
+            return recordings
+
+        return self._recordings
+
+    # def recordings_value(self, key: str, keep_loaded: bool = True):
+    #     if self._recordings[key] is None:
+    #         assert self._recordings_paths[key] is not None
+    #         recordings = RecordingSet.load(
+    #             self._recordings_paths[key], sep=self.table_sep
+    #         )
+    #         if keep_loaded:
+    #             self._recordings[key] = recordings
+    #         return recordings
+
+    #     return self._recordings[key]
+
+    def features_keys(self):
+        if self._features is not None:
+            return self._features.keys()
+        elif self._features_paths is not None:
+            return self._features_paths.keys()
+        else:
+            return {}
+
+    def features_value(self, key: str, keep_loaded: bool = True):
+        if self._features[key] is None:
+            assert self._features_paths[key] is not None
+            features = FeatureSet.load(self._features_paths[key], sep=self.table_sep)
+            if keep_loaded:
+                self._features[key] = features
+            return features
+
+        return self._features[key]
+
+    def classes_keys(self):
+        if self._classes is not None:
+            return self._classes.keys()
+        elif self._classes_paths is not None:
+            return self._classes_paths.keys()
+        else:
+            return {}
+
+    def classes_value(self, key: str, keep_loaded: bool = True):
+        if self._classes[key] is None:
+            assert self._classes_paths[key] is not None
+            classes = ClassInfo.load(self._classes_paths[key], self.table_sep)
+            if keep_loaded:
+                self._classes[key] = classes
+            return classes
+
+        return self._classes[key]
+
+    def enrollments_value(self, key: str, keep_loaded: bool = True):
+        if self._enrollments[key] is None:
+            assert self._enrollments_paths[key] is not None
+            enrollments = EnrollmentMap.load(
+                self._enrollments_paths[key], sep=self.table_sep
+            )
+            if keep_loaded:
+                self._enrollments[key] = enrollments
+            return enrollments
+
+        return self._enrollments[key]
+
+    def trials_value(self, key: str, keep_loaded: bool = True):
+        if self._trials[key] is None:
+            assert self._trials_paths[key] is not None
+            try:
+                if self.sparse_trials:
+                    trials = SparseTrialKey.load(self._trials_paths[key])
+                else:
+                    trials = TrialKey.load(self._trials_paths[key])
+            except Exception:
+                # fall back to a trial ndx if the file is not a trial key
+                trials = TrialNdx.load(self._trials_paths[key])
+
+            if keep_loaded:
+                self._trials[key] = trials
+            return trials
+
+        return self._trials[key]
+
+    # def recordings(self, keep_loaded: bool = True):
+    #     if self._recordings is None:
+    #         yield from ()
+    #     else:
+    #         for key in self._recordings.keys():
+    #             yield key, self.recordings_value(key, keep_loaded)
+
+    def features(self, keep_loaded: bool = True):
+        if self._features is None:
+            yield from ()
+        else:
+            for key in self._features.keys():
+                yield key, self.features_value(key, keep_loaded)
+
+    def classes(self, keep_loaded: bool = True):
+        if self._classes is None:
+            yield from ()
+        else:
+            for key in self._classes.keys():
+                yield key, self.classes_value(key, keep_loaded)
+
+    def enrollments(self, keep_loaded: bool = True):
+        if self._enrollments is None:
+            yield from ()
+        else:
+            for key in self._enrollments.keys():
+                yield key, self.enrollments_value(key, keep_loaded)
+
+    def trials(self, keep_loaded: bool = True):
+        if self._trials is None:
+            yield from ()
+        else:
+            for key in self._trials.keys():
+                yield key, self.trials_value(key, keep_loaded)
+
+    # def add_recordings(self, recordings: Dict[str, Union[RecordingSet, PathLike]]):
# recordings, recordings_paths = self._parse_dict_args(recordings, RecordingSet) + # if self._recordings is None: + # self._recordings = self._recordings_paths = {} + # self._recordings.update(recordings) + # self._recordings_paths.update(recordings_paths) + + # def add_features(self, features: Dict[str, Union[FeatureSet, PathLike]]): + # features, features_paths = self._parse_dict_args(features, FeatureSet) + # if self._features is None: + # self._features = self._features_paths = {} + # self._features.update(features) + # self._features_paths.update(features_paths) + + # def add_classes(self, classes: Dict[str, Union[ClassInfo, PathLike]]): + # classes, classes_paths = self._parse_dict_args(classes, ClassInfo) + # if self._classes is None: + # self._classes = self._classes_paths = {} + # self._classes.update(classes) + # self._classes_paths.update(classes_paths) + + # def add_enrollments(self, enrollments: Dict[str, Union[EnrollmentMap, PathLike]]): + # enrollments, enrollments_paths = self._parse_dict_args( + # enrollments, + # EnrollmentMap, + # ) + # if self._enrollments is None: + # self._enrollments = self._enrollments_paths = {} + # self._enrollments.update(enrollments) + # self._enrollments_paths.update(enrollments_paths) + + # def add_trials( + # self, trials: Dict[str, Union[TrialKey, TrialNdx, SparseTrialKey, PathLike]] + # ): + # trials, trials_paths = self._parse_dict_args( + # trials, + # (TrialKey, TrialNdx, SparseTrialKey), + # ) + # if self._trials is None: + # self._trials = self._trials_paths = {} + # self._trials.update(trials) + # self._trials_paths.update(trials_paths) + + @staticmethod + def resolve_dataset_path(dataset_path): + dataset_path = Path(dataset_path) + ext = dataset_path.suffix + if ext in [".yaml", "yml"]: + dataset_file = dataset_path + dataset_dir = dataset_path.parent + else: + dataset_file = dataset_path / "dataset.yaml" + dataset_dir = dataset_path + + return dataset_dir, dataset_file + + @staticmethod + def resolve_file_path(dataset_dir, file_path): + dataset_dir = Path(dataset_dir) + file_path = Path(file_path) + if file_path.is_file(): + return file_path + + return dataset_dir / file_path + + def save( + self, + dataset_path: PathLike, + update_paths: bool = True, + table_sep: Optional[str] = None, + force_save_all: bool = False, + ): + """Saves the dataset to disk. + + Args: + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save + the dataset info. + update_paths: whether to update the file_paths in the + data structures in the DataSet object + force_save_all: forces saving all tables even if they haven't changed, + otherwise, it only saves tables loaded in memory + and those that are not in the datadirectory + """ + if force_save_all: + self.save_all(dataset_path, update_paths, table_sep) + else: + self.save_changed(dataset_path, update_paths, table_sep) + + def save_changed( + self, + dataset_path: PathLike, + update_paths: bool = True, + table_sep: Optional[str] = None, + ): + """Saves the tables that change in disk or tables + that are not in the ouput directory. + + Args: + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save + the dataset info. 
+ update_paths: whether to update the file_paths in the + data structures in the DataSet object + """ + table_sep = self.table_sep if table_sep is None else table_sep + if update_paths: + self.table_sep = table_sep + + table_ext = ".tsv" if table_sep == "\t" else ".csv" + dataset_dir, dataset_file = HypDataset.resolve_dataset_path(dataset_path) + dataset = {} + file_name = f"segments{table_ext}" + dataset["segments"] = file_name + file_path = dataset_dir / file_name + if ( + self._segments is not None + or file_path != self._segments_path + or not file_path.exists() + ): + self.segments(keep_loaded=False).save(file_path, sep=table_sep) + if update_paths: + self._segments_path = file_path + + file_name = f"recordings{table_ext}" + dataset["recordings"] = file_name + file_path = dataset_dir / file_name + if ( + self._recordings is not None + or file_path != self._recordings_path + or not file_path.exists() + ): + self.recordings(keep_loaded=False).save(file_path, sep=table_sep) + if update_paths: + self._recordings_path = file_path + + # if self._recordings is not None: + # file_names = {} + # for k in self._recordings.keys(): + # file_name = k + table_ext + # file_names[k] = file_name + # file_path = dataset_dir / file_name + # if ( + # self._recordings[k] is not None + # or file_path != self._recordings_paths[k] + # or not file_path.exists() + # ): + # v = self.recordings_value(k, keep_loaded=False) + # v.save(file_path, sep=table_sep) + # if update_paths: + # self._recordings_paths[k] = file_path + + # if file_names: + # dataset["recordings"] = file_names + + if self._features is not None: + file_names = {} + for k in self._features.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._features[k] is not None + or file_path != self._features_paths[k] + or not file_path.exists() + ): + v = self.features_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._features_paths[k] = file_path + + if file_names: + dataset["features"] = file_names + + if self._classes is not None: + file_names = {} + for k in self._classes.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._classes[k] is not None + or file_path != self._classes_paths[k] + or not file_path.exists() + ): + v = self.classes_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._classes_paths[k] = file_path + + if file_names: + dataset["classes"] = file_names + + if self._enrollments is not None: + file_names = {} + for k in self._enrollments.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._enrollments[k] is not None + or file_path != self._enrollments_paths[k] + or not file_path.exists() + ): + v = self.enrollments_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._enrollments_paths[k] = file_path + + if file_names: + dataset["enrollments"] = file_names + + if self._trials is not None: + file_names = {} + for k in self._trials.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._trials[k] is not None + or file_path != self._trials_paths[k] + or not file_path.exists() + ): + v = self.trials_value(k, keep_loaded=False) + v.save(file_path) + if update_paths: + self._trials_paths[k] = file_path + + if file_names: + dataset["trials"] = file_names + + with open(dataset_file, 
"w") as f: + yaml.dump(dataset, f) + + self._delete_files(dataset_dir) + + def save_all( + self, + dataset_path: PathLike, + update_paths: bool = True, + table_sep: Optional[str] = None, + ): + """Saves all the dataset objects. + + Args: + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save + the dataset info. + update_paths: whether to update the file_paths in the + data structures in the DataSet object + """ + table_sep = self.table_sep if table_sep is None else table_sep + if update_paths: + self.table_sep = table_sep + + table_ext = ".tsv" if table_sep == "\t" else ".csv" + dataset_dir, dataset_file = HypDataset.resolve_dataset_path(dataset_path) + dataset = {} + file_name = f"segments{table_ext}" + dataset["segments"] = file_name + file_path = dataset_dir / file_name + self.segments(keep_loaded=False).save(file_path, sep=table_sep) + if update_paths: + self._segments_path = file_path + + file_name = f"recordings{table_ext}" + dataset["recordings"] = file_name + file_path = dataset_dir / file_name + self.recordings(keep_loaded=False).save(file_path, sep=table_sep) + if update_paths: + self._recordings_path = file_path + + # file_names = {} + # for k, v in self.recordings(keep_loaded=False): + # file_name = k + table_ext + # file_names[k] = file_name + # file_path = dataset_dir / file_name + # v.save(file_path, sep=table_sep) + # if update_paths: + # self._recordings_paths[k] = file_path + + # if file_names: + # dataset["recordings"] = file_names + + file_names = {} + for k, v in self.features(keep_loaded=False): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._features_paths[k] = file_path + + if file_names: + dataset["features"] = file_names + + file_names = {} + for k, v in self.classes(keep_loaded=False): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._classes_paths[k] = file_path + + if file_names: + dataset["classes"] = file_names + + file_names = {} + for k, v in self.enrollments(keep_loaded=False): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._enrollments_paths[k] = file_path + + if file_names: + dataset["enrollments"] = file_names + + file_names = {} + for k, v in self.trials(keep_loaded=False): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path) + if update_paths: + self._trials_paths[k] = file_path + + if file_names: + dataset["trials"] = file_names + + with open(dataset_file, "w") as f: + yaml.dump(dataset, f) + + self._delete_files(dataset_dir) + + def update_from_disk(self): + self.segments() + self.recordings() + + for k, v in self.features(): + pass + + for k, v in self.classes(): + pass + + for k, v in self.enrollments(): + pass + + for k, v in self.trials(): + pass + + @classmethod + def load( + cls, dataset_path: PathLike, lazy: bool = True, sparse_trials: bool = False + ): + """Loads all the dataset objects. + + Args: + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save + the dataset info. + lazy: load data structures lazily when they are needed. 
+ sparse_trials: load trial keys using the SparseTrialKey class instead of TrialKey class + + """ + dataset_dir, dataset_file = HypDataset.resolve_dataset_path(dataset_path) + with open(dataset_file, "r") as f: + dataset = yaml.safe_load(f) + + assert "segments" in dataset + segments = HypDataset.resolve_file_path(dataset_dir, dataset["segments"]) + classes = None + recordings = None + features = None + enrollments = None + trials = None + if "classes" in dataset: + classes = {} + for k, v in dataset["classes"].items(): + classes[k] = HypDataset.resolve_file_path(dataset_dir, v) + + if "recordings" in dataset: + recordings = HypDataset.resolve_file_path( + dataset_dir, dataset["recordings"] + ) + # recordings = {} + # for k, v in dataset["recordings"].items(): + # recordings[k] = HypDataset.resolve_file_path(dataset_dir, v) + + if "features" in dataset: + features = {} + for k, v in dataset["features"].items(): + features[k] = HypDataset.resolve_file_path(dataset_dir, v) + + if "enrollments" in dataset: + enrollments = {} + for k, v in dataset["enrollments"].items(): + enrollments[k] = HypDataset.resolve_file_path(dataset_dir, v) + + if "trials" in dataset: + trials = {} + for k, v in dataset["trials"].items(): + trials[k] = HypDataset.resolve_file_path(dataset_dir, v) + + dataset = cls( + segments, + classes, + recordings, + features, + enrollments, + trials, + sparse_trials=sparse_trials, + ) + if not lazy: + dataset.update_from_disk() + + return dataset + + def add_features(self, features_name: str, features: Union[PathLike, FeatureSet]): + if self._features is None: + self._features = {} + self._features_paths = {} + + if isinstance(features, (str, Path)): + self._features[features_name] = None + self._features_paths[features_name] = features + elif isinstance(features, FeatureSet): + self._features[features_name] = features + self._features_paths[features_name] = None + else: + raise ValueError() + + def set_segments( + self, + segments: Union[PathLike, SegmentSet], + ): + if isinstance(segments, (str, Path)): + self._segments = None + self._segments_path = segments + elif isinstance(segments, SegmentSet): + self._segments = segments + self._segments_path = None + else: + raise ValueError() + + def set_recordings( + self, + recordings: Union[PathLike, RecordingSet], + update_seg_durs: bool = False, + ): + if isinstance(recordings, (str, Path)): + self._recordings = None + self._recordings_path = Path(recordings) + elif isinstance(recordings, RecordingSet): + self._recordings = recordings + self._recordings_path = None + else: + raise ValueError() + + if update_seg_durs: + rec_ids = self.segments(keep_loaded=True).recordings() + self.segments()["duration"] = self.recordings().loc[rec_ids, "duration"] + + def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]): + if self._classes is None: + self._classes = {} + self._classes_paths = {} + + if isinstance(classes, (str, Path)): + self._classes[classes_name] = None + self._classes_paths[classes_name] = Path(classes) + elif isinstance(classes, ClassInfo): + self._classes[classes_name] = classes + self._classes_paths[classes_name] = None + else: + raise ValueError() + + def add_enrollments( + self, + enrollments_name: str, + enrollments: Union[PathLike, EnrollmentMap], + ): + if self._enrollments is None: + self._enrollments = {} + self._enrollments_paths = {} + + if isinstance(enrollments, (str, Path)): + self._enrollments[enrollments_name] = None + self._enrollments_paths[enrollments_name] = Path(enrollments) + elif 
isinstance(enrollments, EnrollmentMap):
+            self._enrollments[enrollments_name] = enrollments
+            self._enrollments_paths[enrollments_name] = None
+        else:
+            raise ValueError(f"invalid enrollments type={type(enrollments)}")
+
+    def add_trials(
+        self,
+        trials_name: str,
+        trials: Union[PathLike, TrialKey, TrialNdx, SparseTrialKey],
+    ):
+        if self._trials is None:
+            self._trials = {}
+            self._trials_paths = {}
+
+        if isinstance(trials, (str, Path)):
+            self._trials[trials_name] = None
+            self._trials_paths[trials_name] = Path(trials)
+        elif isinstance(trials, (TrialKey, TrialNdx, SparseTrialKey)):
+            self._trials[trials_name] = trials
+            self._trials_paths[trials_name] = None
+        else:
+            raise ValueError(f"invalid trials type={type(trials)}")
+
+    def remove_features(self, features_name: str):
+        if self._features_paths[features_name] is not None:
+            self._files_to_delete.append(self._features_paths[features_name])
+
+        del self._features[features_name]
+        del self._features_paths[features_name]
+
+    def remove_recordings(
+        self,
+    ):
+        if self._recordings_path is not None:
+            self._files_to_delete.append(self._recordings_path)
+
+        self._recordings = None
+        self._recordings_path = None
+
+    def remove_classes(self, classes_name: str):
+        if self._classes_paths[classes_name] is not None:
+            self._files_to_delete.append(self._classes_paths[classes_name])
+
+        del self._classes[classes_name]
+        del self._classes_paths[classes_name]
+
+    def remove_enrollments(
+        self,
+        enrollments_name: str,
+    ):
+        if self._enrollments_paths[enrollments_name] is not None:
+            self._files_to_delete.append(self._enrollments_paths[enrollments_name])
+
+        del self._enrollments[enrollments_name]
+        del self._enrollments_paths[enrollments_name]
+
+    def remove_trials(
+        self,
+        trials_name: str,
+    ):
+        if self._trials_paths[trials_name] is not None:
+            self._files_to_delete.append(self._trials_paths[trials_name])
+
+        del self._trials[trials_name]
+        del self._trials_paths[trials_name]
+
+    def add_cols_to_segments(
+        self,
+        right_table: Union[InfoTable, pd.DataFrame, PathLike],
+        column_names: Union[None, str, List[str], np.ndarray] = None,
+        on: Union[str, List[str], np.ndarray] = "id",
+        right_on: Union[None, str, List[str], np.ndarray] = None,
+        remove_missing: bool = False,
+        create_class_info: bool = False,
+    ):
+        if isinstance(right_table, (str, Path)):
+            file_path = Path(right_table)
+            if file_path.is_file():
+                right_table = InfoTable.load(file_path)
+            else:
+                if right_table == "recordings":
+                    right_table = self.recordings()
+                elif right_table in self.features_keys():
+                    right_table = self.features_value(right_table)
+                elif right_table in self.classes_keys():
+                    right_table = self.classes_value(right_table)
+                else:
+                    raise ValueError(f"{right_table} not found")
+
+        segments = self.segments(keep_loaded=True)
+        num_segs_0 = len(segments)
+        segments.add_columns(
+            right_table,
+            column_names,
+            on=on,
+            right_on=right_on,
+            remove_missing=remove_missing,
+        )
+        if remove_missing and len(segments) < num_segs_0:
+            self.clean()
+
+        if create_class_info and column_names is not None:
+            self.create_class_info_from_col(column_names)
+
+    def create_class_info_from_col(
+        self,
+        column_names: Union[str, List[str], np.ndarray],
+    ):
+        if isinstance(column_names, str):
+            column_names = [column_names]
+
+        for col in column_names:
+            if col not in self._classes:
+                df = pd.DataFrame(
+                    {"id": np.unique(self.segments(keep_loaded=True)[col])}
+                )
+                class_info = ClassInfo(df)
+                self.add_classes(col, class_info)
+
+    def clean(self, rebuild_class_idx=False):
+        rec_ids = self.segments().recordings()
+        self._recordings = self.recordings().filter(
+            lambda df: df["id"].isin(rec_ids)
+        )
+
+        ids = self.segments()["id"].values
+        for k, table in self.features():
+            self._features[k] = table.filter(lambda df: df["id"].isin(ids))
+
+        for k, table in self.classes():
+            class_ids = self.segments()[k].unique()
+            self._classes[k] = table.filter(lambda df: df["id"].isin(class_ids))
+
+        remove_keys = []
+        for k, table in self.enrollments():
+            table = table.filter(lambda df: df["segmentid"].isin(ids))
+            if len(table) > 0:
+                self._enrollments[k] = table
+            else:
+                remove_keys.append(k)
+
+        for k in remove_keys:
+            self.remove_enrollments(k)
+
+        remove_keys = []
+        for k, key in self.trials():
+            keep_ids = [cur_id for cur_id in key.seg_set if cur_id in ids]
+            if keep_ids:
+                key = key.filter(key.model_set, keep_ids, keep=True)
+                self._trials[k] = key
+            else:
+                remove_keys.append(k)
+
+        for k in remove_keys:
+            self.remove_trials(k)
+
+    def _split_into_trials_and_cohort(
+        self,
+        segments: SegmentSet,
+        num_tar_trials: int,
+        num_trial_speakers: int,
+        seed: int,
+    ):
+        # select test speakers
+        rng = np.random.default_rng(seed=seed)
+
+        spks = segments["speaker"].unique()
+        trial_spks = rng.choice(spks, size=(num_trial_speakers,), replace=False)
+        snorm_segments = SegmentSet(segments[~segments["speaker"].isin(trial_spks)])
+
+        trial_segments = segments[segments["speaker"].isin(trial_spks)]
+        # solution of the 2nd-degree equation:
+        #   num_spks * n * (n - 1) / 2 = num_trials
+        num_segs_per_spk = int(
+            math.ceil((1 + math.sqrt(1 + 8 * num_tar_trials // num_trial_speakers)) / 2)
+        )
+
+        n = num_trial_speakers * num_segs_per_spk
+        seg_ids = rng.choice(trial_segments["id"], size=(n,), replace=False)
+        trial_segments = SegmentSet(segments[segments["id"].isin(seg_ids)])
+        seg_ids = trial_segments["id"].values
+        class_ids = trial_segments["speaker"].values
+        tar = np.zeros((n - 1, n), dtype=bool)
+        non = np.zeros((n - 1, n), dtype=bool)
+
+        for i in range(n - 1):
+            for j in range(i + 1, n):
+                if class_ids[i] == class_ids[j]:
+                    tar[i, j] = True
+                else:
+                    non[i, j] = True
+
+        logging.info("Got ntar=%d and nnon=%d", tar.sum(), non.sum())
+        trials = TrialKey(seg_ids[:-1], seg_ids, tar, non)
+        df_enr = pd.DataFrame({"id": seg_ids[:-1], "segmentid": seg_ids[:-1]})
+        enrollments = EnrollmentMap(df_enr)
+        return trials, enrollments, snorm_segments
+
+    def split_into_trials_and_cohort(
+        self,
+        num_1k_tar_trials: int,
+        num_trial_speakers: int,
+        intra_gender: bool = True,
+        trials_name="trials_qmf",
+        seed=1123,
+    ):
+        """Used when training quality measure fusion (QMF) in, e.g., the VoxCeleb recipe.
+        We split the data into 2 parts:
+          1) used to calculate SV scores to train the fusion
+          2) cohort used to calculate the S-Norm parameters used in the QMF.
+
+        The trial list is stored in the current dataset.
+        A new dataset is created with only the cohort speakers.
+
+        Args:
+          num_1k_tar_trials: number of target trials, in thousands.
+          num_trial_speakers: number of speakers used to create trials.
+          intra_gender: if True, no cross-gender trials are made.
+
+        Returns:
+          HypDataset used for trials with trial list.
+          HypDataset used for cohort.
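+
+        Example (a sketch, with hypothetical argument values):
+            trials_ds, cohort_ds = dataset.split_into_trials_and_cohort(
+                num_1k_tar_trials=1000, num_trial_speakers=200)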
+ """ + num_tar_trials = num_1k_tar_trials * 1000 + if intra_gender: + num_tar_trials = num_tar_trials // 2 + num_trial_speakers = num_trial_speakers // 2 + segments = self.segments() + segments_male = SegmentSet(segments[segments["gender"] == "m"]) + segments_female = SegmentSet(segments[segments["gender"] == "f"]) + trials_male, enroll_male, cohort_male = self._split_into_trials_and_cohort( + segments_male, + num_tar_trials, + num_trial_speakers, + seed, + ) + ( + trials_female, + enroll_female, + cohort_female, + ) = self._split_into_trials_and_cohort( + segments_female, + num_tar_trials, + num_trial_speakers, + seed, + ) + trials = TrialKey.merge([trials_male, trials_female]) + enroll = EnrollmentMap.cat([enroll_male, enroll_female]) + cohort = SegmentSet.cat([cohort_male, cohort_female]) + else: + segments = self.segments() + trials, enroll, cohort = self._split_into_trials_and_cohort( + segments, + num_tar_trials, + num_trial_speakers, + seed, + ) + + dataset_trials = self.clone() + segments = self.segments() + trials_segments = SegmentSet(segments.loc[segments["id"].isin(trials.seg_set)]) + dataset_trials.set_segments(trials_segments) + dataset_trials.add_trials("trials", trials) + dataset_trials.add_enrollments("enrollments", enroll) + dataset_trials.clean() + + dataset_cohort = self.clone() + dataset_cohort.set_segments(cohort) + dataset_cohort.clean() + + return dataset_trials, dataset_cohort + + def remove_short_segments(self, min_length: float, length_name: str = "duration"): + segments = self.segments() + self._segments = segments.filter(lambda df: df[length_name] >= min_length) + self.clean() + + def remove_classes_few_segments( + self, + class_name: str, + min_segs: int, + rebuild_idx: bool = False, + ): + segments = self.segments() + classes, counts = np.unique(segments[class_name], return_counts=True) + keep_classes = classes[counts >= min_segs] + self._segments = segments.filter(lambda df: df[class_name].isin(keep_classes)) + self.clean() + if rebuild_idx: + class_info = self.classes_value(class_name) + class_info.add_class_idx() + + def remove_classes_few_toomany_segments( + self, + class_name: str, + min_segs: int, + max_segs: int, + rebuild_idx: bool = False, + ): + segments = self.segments() + classes, counts = np.unique(segments[class_name], return_counts=True) + if max_segs is None: + keep_classes = classes[counts >= min_segs] + else: + keep_classes = classes[ + np.logical_and(counts >= min_segs, counts <= max_segs) + ] + self._segments = segments.filter(lambda df: df[class_name].isin(keep_classes)) + self.clean() + if rebuild_idx: + class_info = self.classes_value(class_name) + class_info.add_class_idx() + + def rebuild_class_idx(self, class_name: str): + class_info = self.classes_value(class_name) + class_info.add_class_idx() + + def _segments_split(self, val_prob: float, rng: np.random.Generator): + segments = self.segments() + p = rng.permutation(len(segments)) + num_train = int(round((1 - val_prob) * len(p))) + + train_idx = p[:num_train] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_idx = p[num_train:] + val_segs = segments.filter(iindex=val_idx) + val_segs.sort() + + return train_segs, val_segs + + def _segments_split_joint_classes( + self, + val_prob: float, + joint_classes: List[str], + min_train_samples: int, + rng: np.random.Generator, + ): + segments = self.segments() + classes = segments[joint_classes].apply("-".join, axis=1) + u_classes, class_ids = np.unique(classes, return_inverse=True) + train_mask = 
np.zeros(len(segments), dtype=bool)
+        for c_id in range(len(u_classes)):
+            idx = (class_ids == c_id).nonzero()[0]
+            count = len(idx)
+            p = rng.permutation(count)
+            num_train = max(
+                int(round((1 - val_prob) * count)), min(min_train_samples, count)
+            )
+            train_idx = idx[p[:num_train]]
+            train_mask[train_idx] = True
+
+        train_idx = train_mask.nonzero()[0]
+        train_segs = segments.filter(iindex=train_idx)
+        train_segs.sort()
+
+        val_segs = segments.filter(iindex=train_idx, keep=False)
+        val_segs.sort()
+
+        return train_segs, val_segs
+
+    def _segments_split_disjoint_classes(
+        self,
+        val_prob: float,
+        disjoint_classes: List[str],
+        rng: np.random.Generator,
+    ):
+        segments = self.segments()
+        classes = segments[disjoint_classes].apply("-".join, axis=1)
+        u_classes, class_ids = np.unique(classes, return_inverse=True)
+        p = rng.permutation(len(u_classes))
+        class_ids = p[class_ids]
+        num_train = int(round((1 - val_prob) * len(segments)))
+        train_mask = np.zeros(len(segments), dtype=bool)
+        count_acc = 0
+        for c_id in range(len(u_classes)):
+            idx = (class_ids == c_id).nonzero()[0]
+            train_mask[idx] = True
+            count = len(idx)
+            count_acc += count
+            if count_acc >= num_train:
+                break
+
+        train_idx = train_mask.nonzero()[0]
+        train_segs = segments.filter(iindex=train_idx)
+        train_segs.sort()
+
+        val_segs = segments.filter(iindex=train_idx, keep=False)
+        val_segs.sort()
+
+        return train_segs, val_segs
+
+    def _segments_split_joint_and_disjoint_classes(
+        self,
+        val_prob: float,
+        joint_classes: List[str],
+        disjoint_classes: List[str],
+        min_train_samples: int,
+        rng: np.random.Generator,
+    ):
+        raise NotImplementedError(
+            "train/val split with joint and disjoint classes is not implemented yet"
+        )
+
+    def split_train_val(
+        self,
+        val_prob: float,
+        joint_classes: Optional[List[str]] = None,
+        disjoint_classes: Optional[List[str]] = None,
+        min_train_samples: int = 1,
+        seed: int = 11235813,
+    ):
+        rng = np.random.default_rng(seed)
+        if joint_classes is None and disjoint_classes is None:
+            train_segs, val_segs = self._segments_split(val_prob, rng)
+        elif joint_classes is not None and disjoint_classes is None:
+            train_segs, val_segs = self._segments_split_joint_classes(
+                val_prob,
+                joint_classes,
+                min_train_samples,
+                rng,
+            )
+        elif joint_classes is None and disjoint_classes is not None:
+            train_segs, val_segs = self._segments_split_disjoint_classes(
+                val_prob,
disjoint_classes, + rng, + ) + else: + train_segs, val_segs = self._segments_split_joint_and_disjoint_classes( + val_prob, + joint_classes, + disjoint_classes, + min_train_samples, + rng, + ) + + train_ds = self.clone() + train_ds.set_segments(train_segs) + train_ds.clean() + + val_ds = self.clone() + val_ds.set_segments(val_segs) + val_ds.clean() + + return train_ds, val_ds + + @classmethod + def merge(cls, datasets): + segments = [] + for dset in datasets: + segs_dset = dset.segments(keep_loaded=False) + if segs_dset is not None: + segments.append(segs_dset) + + segments = SegmentSet.cat(segments) + dataset = cls(segments) + + classes_keys = [] + for dset in datasets: + classes_dset = list(dset.classes_keys()) + classes_keys.extend(classes_dset) + + classes_keys = list(set(classes_keys)) + for key in classes_keys: + classes = [] + for dset in datasets: + if key in dset.classes_keys(): + classes_key = dset.classes_value(key, keep_loaded=False) + classes.append(classes_key) + + classes = ClassInfo.cat(classes) + dataset.add_classes(classes_name=key, classes=classes) + + recordings = [] + for dset in datasets: + recs_i = dset.recordings(keep_loaded=False) + if recs_i is not None: + recordings.append(recs_i) + + if recordings: + recordings = RecordingSet.cat(recordings) + dataset.set_recordings(recordings) + + features_keys = [] + for dset in datasets: + features_dset = list(dset.features_keys()) + features_keys.extend(features_dset) + + features_keys = list(set(features_keys)) + for key in features_keys: + features = [] + for dset in datasets: + if key in dset.features_keys(): + features_key = dset.features_value(key, keep_loaded=False) + features.append(features_key) + + features = FeatureSet.cat(features) + dataset.add_features(features_name=key, features=features) + + # TODO: merge enrollments and trials + # Usually you don't need that + return dataset + + @classmethod + def from_lhotse( + cls, + cuts: Optional[Union[lhotse.CutSet, PathLike]] = None, + recordings: Optional[Union[lhotse.RecordingSet, PathLike]] = None, + supervisions: Optional[Union[lhotse.SupervisionSet, PathLike]] = None, + ): + """Creates a Hyperion Dataset from a lhotse CutSet or + from a lhotse RecordingSet + SupervisionSet + + Args: + cuts: lhotse CutSet manifest or file + recordings: lhotse RecordingSet manifest or file + supervisions: lhotse SupervisionSet manifest or file. 
+
+        Returns:
+          HypDataset object
+        """
+        assert cuts is not None or supervisions is not None
+        if cuts is not None:
+            if isinstance(cuts, (str, Path)):
+                cuts = lhotse.CutSet.from_file(cuts)
+        else:
+            if isinstance(supervisions, (str, Path)):
+                supervisions = lhotse.SupervisionSet.from_file(supervisions)
+
+            if recordings is not None and isinstance(recordings, (str, Path)):
+                recordings = lhotse.RecordingSet.from_file(recordings)
+
+            cuts = lhotse.CutSet.from_manifests(
+                recordings=recordings, supervisions=supervisions
+            )
+
+        supervision_keys = [
+            "speaker",
+            "gender",
+            "language",
+            "emotion",
+            "text",
+            "duration",
+        ]
+        recs_df = []
+        segs_df = []
+        for cut in cuts:
+            supervision = cut.supervisions[0]
+            recording = cut.recording
+            seg_dict = {"id": cut.id}
+            if recording is not None:
+                # if recording.id != cut.id:
+                #     seg_dict["recording_id"] = recording.id
+
+                rec_dict = {
+                    "id": cut.id,
+                    "sampling_rate": recording.sampling_rate,
+                    "duration": recording.duration,
+                }
+                assert len(recording.sources) == 1
+                source = recording.sources[0]
+                assert source.type in ["file", "command"]
+                rec_dict["storage_path"] = source.source
+                assert recording.transforms is None, f"{recording.transforms}"
+                recs_df.append(rec_dict)
+
+            for key in supervision_keys:
+                if hasattr(supervision, key):
+                    val = getattr(supervision, key)
+                    if val is not None:
+                        seg_dict[key] = val
+
+            if supervision.custom is not None:
+                for key, val in supervision.custom.items():
+                    if val is not None:
+                        seg_dict[key] = val
+
+            segs_df.append(seg_dict)
+
+        recs_df = pd.DataFrame(recs_df)
+        segs_df = pd.DataFrame(segs_df)
+        recordings = RecordingSet(recs_df)
+        segments = SegmentSet(segs_df)
+        class_names = ["speaker", "language", "emotion", "gender"]
+        classes = {}
+        for key in class_names:
+            if key in segments:
+                uniq_classes = np.unique(segments[key])
+                classes[key] = ClassInfo(pd.DataFrame({"id": uniq_classes}))
+
+        if not classes:
+            classes = None
+
+        dataset = cls(segments=segments, classes=classes, recordings=recordings)
+        return dataset
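+
+    # Usage sketch (the manifest path is hypothetical): build a dataset
+    # from a lhotse CutSet file and save it in Hyperion format.
+    #
+    #   dataset = HypDataset.from_lhotse(cuts="data/cuts.jsonl.gz")
+    #   dataset.save("data/my_dataset")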
+
+    @classmethod
+    def from_kaldi(
+        cls,
+        kaldi_data_dir: PathLike,
+    ):
+        """Creates a Hyperion Dataset from a Kaldi data dir
+
+        Args:
+          kaldi_data_dir: Kaldi data directory
+
+        Returns:
+          HypDataset object
+        """
+        kaldi_data_dir = Path(kaldi_data_dir)
+
+        kaldi_files = ["utt2lang", "utt2dur", "utt2text"]
+        attributes = ["language", "duration", "text"]
+
+        k_file = kaldi_data_dir / "utt2spk"
+        from .utt2info import Utt2Info
+
+        utt2spk = Utt2Info.load(k_file)
+        df_segs = pd.DataFrame({"id": utt2spk.key, "speaker": utt2spk.info})
+        segments = SegmentSet(df_segs)
+        del utt2spk
+
+        for k_file, att in zip(kaldi_files, attributes):
+            k_file = kaldi_data_dir / k_file
+            if k_file.is_file():
+                u2i = Utt2Info.load(k_file)
+                segments.loc[u2i.key, att] = u2i.info
+
+        k_file = kaldi_data_dir / "spk2gender"
+        if k_file.is_file():
+            segments["gender"] = "N/A"
+            s2g = Utt2Info.load(k_file)
+            for spk in s2g.key:
+                g = s2g[spk]
+                segments.loc[segments["speaker"] == spk, "gender"] = g
+
+        kaldi_files = ["feats.scp", "vad.scp"]
+        attributes = ["feats", "vad"]
+        features = None
+        from .scp_list import SCPList
+
+        for k_file, att in zip(kaldi_files, attributes):
+            k_file = kaldi_data_dir / k_file
+            if k_file.is_file():
+                scp = SCPList.load(k_file)
+                feats_dict = {"id": scp.key, "storage_path": scp.file_path}
+                if scp.offset is not None:
+                    feats_dict["storage_byte"] = scp.offset
+                df_feats = pd.DataFrame(feats_dict)
+                if features is None:
+                    features = {}
+                features[att] = FeatureSet(df_feats)
+
+        recordings = None
+        k_file = kaldi_data_dir / "wav.scp"
+        if k_file.is_file():
+            scp = SCPList.load(k_file)
+            wav_dict = {"id": scp.key, "storage_path": scp.file_path}
+            df_recs = pd.DataFrame(wav_dict)
+            recordings = RecordingSet(df_recs)
+            recordings.get_durations()
+            if "duration" not in segments:
+                segments["duration"] = recordings.loc[segments["id"], "duration"]
+
+        class_names = ["speaker", "language", "emotion", "gender"]
+        classes = {}
+        for key in class_names:
+            if key in segments:
+                uniq_classes = np.unique(segments[key])
+                classes[key] = ClassInfo(pd.DataFrame({"id": uniq_classes}))
+
+        if not classes:
+            classes = None
+
+        dataset = cls(
+            segments=segments, classes=classes, recordings=recordings, features=features
+        )
+        return dataset
diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py
new file mode 100644
index 00000000..70ec49a0
--- /dev/null
+++ b/hyperion/utils/info_table.py
@@ -0,0 +1,550 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import logging
+import re
+from collections import OrderedDict
+from copy import deepcopy
+from pathlib import Path
+from typing import List, Optional, Union
+
+import numpy as np
+import pandas as pd
+from pandas.api.types import infer_dtype
+
+from .list_utils import split_list, split_list_group_by_key
+
+
+class InfoTable:
+    """This is a base class to store information about recordings, segments,
+    features, etc.
+
+    Attributes:
+      df: pandas dataframe.
+    """
+
+    def __init__(self, df):
+        assert "id" in df, f"info_table={df}"
+        self.df = df
+        self.fix_dtypes()
+        self.df.set_index("id", drop=False, inplace=True)
+
+    def fix_dtypes(self):
+        if infer_dtype(self.df.id) != "string":
+            self.df.loc[:, "id"] = self.df["id"].apply(str)
+
+    def convert_col_to_str(self, column):
+        if infer_dtype(self.df[column]) != "string":
+            self.df.loc[:, column] = self.df[column].apply(str)
+
+    def copy(self):
+        """Makes a copy of the object."""
+        return deepcopy(self)
+
+    def clone(self):
+        """Makes a copy of the object."""
+        return deepcopy(self)
+
+    @property
+    def __len__(self):
+        return self.df.__len__
+
+    @property
+    def __str__(self):
+        return self.df.__str__
+
+    @property
+    def __repr__(self):
+        return self.df.__repr__
+
+    @property
+    def iat(self):
+        return self.df.iat
+
+    @property
+    def at(self):
+        return self.df.at
+
+    @property
+    def iloc(self):
+        return self.df.iloc
+
+    @property
+    def loc(self):
+        return self.df.loc
+
+    @property
+    def __getitem__(self):
+        return self.df.__getitem__
+
+    @property
+    def __setitem__(self):
+        return self.df.__setitem__
+
+    @property
+    def __contains__(self):
+        return self.df.__contains__
+
+    @property
+    def index(self):
+        return self.df.index
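+
+    # The properties above delegate indexing to the underlying pandas
+    # DataFrame, so an InfoTable can be queried like a DataFrame.
+    # A small sketch (the column values are illustrative):
+    #
+    #   table = InfoTable(pd.DataFrame({"id": ["s1", "s2"], "duration": [1.5, 2.0]}))
+    #   durs = table["duration"]   # pandas Series
+    #   first = table.iloc[0]      # pandas row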
+ """ + file_path = Path(file_path) + file_path.parent.mkdir(parents=True, exist_ok=True) + ext = file_path.suffix + if ext in ["", ".scp"] or re.match(r"\.[0-9]+$", ext): + # if no extension we save as kaldi utt2spk file + assert len(self.df.columns) == 2 + self.df.to_csv(file_path, sep=" ", header=False, index=False) + return + + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + self.df.to_csv(file_path, sep=sep, index=False) + + @classmethod + def from_lists(cls, ids, column_names, column_data): + df_dict = {"id": ids} + assert len(column_names) == len(column_data) + for name, data in zip(column_names, column_data): + assert len(ids) == len(data) + df_dict[name] = data + df = pd.DataFrame(df_dict) + return cls(df) + + @classmethod + def from_dict(cls, df_dict): + assert "id" in df_dict + df = pd.DataFrame(df_dict) + return cls(df) + + @classmethod + def load(cls, file_path, sep=None, name="class_id"): + """Loads table from file. + + Args: + file_path: File to read the list. + sep: Separator between the key and file_path in the text file. + dtype: Dictionary with the dtypes of each column. + name: name for the data to be loaded + Returns: + InfoTable object + """ + file_path = Path(file_path) + ext = file_path.suffix + if ext in ["", ".scp"]: + # if no extension we load as kaldi utt2spk file + df = pd.read_csv( + file_path, + sep=" ", + header=None, + names=["id", name], + dtype={"id": str, name: str}, + ) + else: + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + fixed_dtypes = { + "id": str, + "speaker": str, + "language": str, + "gender": str, + "duration": float, + "storage_path": str, + "storage_byte": int, + "num_frames": int, + "video_ids": str, + "language_est": str, + } + df = pd.read_csv(file_path, sep=sep, dtype=fixed_dtypes) + + return cls(df) + + def sort(self, column="id", ascending=True): + """Sorts the table by column""" + if column == "id": + self.df.sort_index(inplace=True, ascending=ascending) + else: + self.df.sort_values(by=column, inplace=True, ascending=ascending) + + def split(self, idx, num_parts, group_by=None): + """Splits the table into num_parts and return part idx. + + Args: + idx: Part to return from 1 to num_parts. + num_parts: Number of parts to split the list. + group_by: All the lines with the same value in column + groub_by_field go to the same part + + Returns: + Sub InfoTable object + """ + if group_by is None or group_by == "id": + _, idx1 = split_list(self.df["id"], idx, num_parts) + else: + _, idx1 = split_list_group_by_key(self.df[group_by], idx, num_parts) + + df = self.df.iloc[idx1] + return self.__class__(df) + + @classmethod + def cat(cls, tables): + """Concatenates several tables. + + Args: + info_lists: List of InfoTables + + Returns: + InfoTable object concatenation the info_lists. + """ + df_list = [table.df for table in tables] + df = pd.concat(df_list) + assert df[ + "id" + ].is_unique, """there are duplicated ids in the tables we are concatenating""" + return cls(df) + + def filter( + self, + predicate=None, + items=None, + iindex=None, + columns=None, + by="id", + keep=True, + raise_if_missing=True, + ): + """Filters the table and produce a new table with the elements to keep + + Args: + predicate: callable function that defines the filtering criterion e.g.: + lambda df: df["duration"] > 1.0. 
+
+    def filter(
+        self,
+        predicate=None,
+        items=None,
+        iindex=None,
+        columns=None,
+        by="id",
+        keep=True,
+        raise_if_missing=True,
+    ):
+        """Filters the table and produces a new table with the rows to keep.
+
+        Args:
+          predicate: callable function that defines the filtering criterion, e.g.:
+            lambda df: df["duration"] > 1.0.
+          items: filters the table based on column values, with pandas command:
+            df.loc[items, by]; used only if predicate is None.
+          iindex: filters the table based on integer index, with pandas command:
+            df.iloc[iindex]; used if predicate and items are None.
+          columns: columns to keep or remove.
+          by: column id to use with the items criterion.
+          keep: if True, the criterion is used to keep rows; if False, it is used
+            to remove rows.
+
+        Returns:
+          InfoTable of the same class as the input.
+        """
+        assert (
+            predicate is not None
+            or items is not None
+            or iindex is not None
+            or columns is not None
+        ), "predicate, items, iindex and columns cannot all be None at the same time"
+        df = self.df
+
+        if predicate is not None:
+            mask = predicate(self.df)
+
+        if not keep:
+            if predicate is not None:
+                mask = np.logical_not(mask)
+            elif items is not None:
+                items = np.setdiff1d(df[by], items)
+            elif iindex is not None:
+                iindex = np.setdiff1d(np.arange(len(df)), iindex)
+
+            if columns is not None:
+                columns = np.setdiff1d(df.columns, columns)
+
+        if predicate is not None:
+            if columns is None:
+                df = df.loc[mask]
+            else:
+                df = df.loc[mask, columns]
+        elif items is not None:
+            if by != "id":
+                missing = [False if v in df[by] else True for v in items]
+                if any(missing) and raise_if_missing:
+                    raise Exception(f"{items[missing]} not found in table")
+                items = [True if v in items else False for v in df[by]]
+            elif not raise_if_missing:
+                items = [item for item in items if item in df.index]
+
+            if columns is None:
+                df = df.loc[items]
+            else:
+                df = df.loc[items, columns]
+        else:
+            if not raise_if_missing:
+                iindex = iindex[iindex < len(df)]
+
+            if iindex is not None:
+                df = self.df.iloc[iindex]
+
+            if columns is not None:
+                df = df[columns]
+
+        return self.__class__(df.copy())
+
+    def __eq__(self, other):
+        """Equal operator."""
+        if self.df.shape[0] == 0 and other.df.shape[0] == 0:
+            return True
+        eq = self.df.equals(other.df)
+        return eq
+
+    def __ne__(self, other):
+        """Non-equal operator."""
+        return not self.__eq__(other)
+
+    def __cmp__(self, other):
+        """Comparison operator."""
+        if self.__eq__(other):
+            return 0
+        return 1
+
+    def shuffle(self, seed=1024, rng=None):
+        """Shuffles the rows of the table.
+
+        Args:
+          seed: Seed for the random number generator.
+          rng: numpy random number generator object.
+
+        Returns:
+          Index used to shuffle the table.
+        """
+        if rng is None:
+            rng = np.random.default_rng(seed=seed)
+        index = np.arange(len(self.df))
+        rng.shuffle(index)
+        self.df = self.df.iloc[index]
+        return index
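+
+    # filter() sketch (the column values are illustrative): keep segments
+    # longer than 1 s, or drop two ids by key:
+    #
+    #   long_segs = table.filter(lambda df: df["duration"] > 1.0)
+    #   rest = table.filter(items=["seg1", "seg2"], keep=False)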
+ """ + if rng is None: + rng = np.random.default_rng(seed=seed) + index = np.arange(len(self.df)) + rng.shuffle(index) + self.df = self.df.iloc[index] + return index + + def set_index(self, keys, inplace=True): + if inplace: + self.df.set_index(keys, drop=False, inplace=True) + return + + df = self.df.set_index(keys, drop=False, inplace=False) + return type(self)(df) + + def reset_index(self): + self.df.set_index("id", drop=False, inplace=True) + + def get_loc(self, keys): + if isinstance(keys, (list, np.ndarray)): + return self.df.index.get_indexer(keys) + + loc = self.df.index.get_loc(keys) + if isinstance(loc, int): + return loc + + if isinstance(loc, np.ndarray) and loc.dtype == bool: + return np.nonzero(loc)[0] + + return list(range(loc.start, loc.stop, loc.step)) + + def get_col_idx(self, keys): + return self.df.columns.get_loc(keys) + + def add_columns( + self, + right_table, + column_names: Union[None, str, List[str], np.ndarray] = None, + on: Union[str, List[str], np.ndarray] = "id", + right_on: Union[None, str, List[str], np.ndarray] = None, + remove_missing: bool = False, + ): + if isinstance(right_table, InfoTable): + right_table = right_table.df + + if column_names is not None: + right_table = right_table[column_names] + + if right_on is None: + right_on = on + + how = "inner" if remove_missing else "left" + left_index = False + right_index = False + if on == "id" or on == ["id"]: + on = None + left_index = True + + if (right_on == "id" or right_on == ["id"]) and "id" in right_table: + right_on = None + right_index = True + + self.df = self.df.merge( + right_table, + how=how, + left_on=on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + ) + + # def __len__(self): + + # """Returns the number of elements in the list.""" + # return len(self.df) + + # def _create_dict(self): + # """Creates dictionary that returns the position of + # a segment in the list. + # """ + # self.key_to_index = OrderedDict( + # (k, i) for i, k in enumerate(self.utt_info.index) + # ) + + # def get_index(self, key): + # """Returns the position of key in the list.""" + # if self.key_to_index is None: + # self._create_dict() + # return self.key_to_index[key] + + # def __contains__(self, id): + # """Returns True if the list contains the key""" + # return id in self.df.index + + # def __getitem__(self, id): + # """It allows to acces the data in the list by key or index like in + # a ditionary, e.g.: + # If input is a string key: + # utt2spk = Utt2Info(info) + # spk_id = utt2spk['data1'] + # If input is an index: + # key, spk_id = utt2spk[0] + + # Args: + # key: String key or integer index. + # Returns: + # If key is a string: + # info corresponding to key + # If key is the index in the key list: + # key, info given index + # """ + # if isinstance(id, str): + # row = np.array(self.utt_info.loc[key])[1:] + # if len(row) == 1: + # return row[0] + # else: + # return row + # else: + # row = np.array(self.utt_info.iloc[key]) + # if len(row) == 2: + # return row[0], row[1] + # else: + # return row[0], row[1:] + + # def sort(self, field=0): + # """Sorts the list by key""" + # if field == 0: + # self.utt_info.sort_index(ascending=True, inplace=True) + # else: + # idx = np.argsort(self.utt_info[field]) + # self.utt_info = self.utt_info.iloc[idx] + # self.key_to_index = None + + # @classmethod + # def load(cls, file_path, sep=" ", dtype={0: np.str, 1: np.str}): + # """Loads utt2info list from text file. + + # Args: + # file_path: File to read the list. 
+ # sep: Separator between the key and file_path in the text file. + # dtype: Dictionary with the dtypes of each column. + # Returns: + # Utt2Info object + # """ + # df = pd.read_csv(file_path, sep=sep, header=None, dtype=dtype) + # df = df.rename(index=str, columns={0: "key"}) + # return cls(df) + + # def split(self, idx, num_parts, group_by_field=0): + # """Splits SCPList into num_parts and return part idx. + + # Args: + # idx: Part to return from 1 to num_parts. + # num_parts: Number of parts to split the list. + # group_by_field: All the lines with the same value in column + # groub_by_field go to the same part + + # Returns: + # Sub Utt2Info object + # """ + # if group_by_field == 0: + # key, idx1 = split_list(self.utt_info["key"], idx, num_parts) + # else: + # key, idx1 = split_list_group_by_key( + # self.utt_info[group_by_field], idx, num_parts + # ) + + # utt_info = self.utt_info.iloc[idx1] + # return Utt2Info(utt_info) + + # def filter(self, filter_key, keep=True): + # """Removes elements from Utt2Info object by key + + # Args: + # filter_key: List with the keys of the elements to keep or remove. + # keep: If True, we keep the elements in filter_key; + # if False, we remove the elements in filter_key; + + # Returns: + # Utt2Info object. + # """ + # if not keep: + # filter_key = np.setdiff1d(self.utt_info["key"], filter_key) + # utt_info = self.utt_info.loc[filter_key] + # return Utt2Info(utt_info) + + # def filter_info(self, filter_key, field=1, keep=True): + # """Removes elements of Utt2Info by info value + + # Args: + # filter_key: List with the file_path of the elements to keep or remove. + # field: Field number corresponding to the info to filter + # keep: If True, we keep the elements in filter_key; + # if False, we remove the elements in filter_key; + + # Returns: + # Utt2Info object. + # """ + # if not keep: + # filter_key = np.setdiff1d(self.utt_info[field], filter_key) + # f, _ = ismember(filter_key, self.utt_info[field]) + # if not np.all(f): + # for k in filter_key[f == False]: + # logging.error("info %s not found in field %d" % (k, field)) + # raise Exception("not all keys were found in field %d" % (field)) + + # f, _ = ismember(self.utt_info[field], filter_key) + # utt_info = self.utt_info.iloc[f] + # return Utt2Info(utt_info) + + # def filter_index(self, index, keep=True): + # """Removes elements of Utt2Info by index + + # Args: + # filter_key: List with the index of the elements to keep or remove. + # keep: If True, we keep the elements in filter_key; + # if False, we remove the elements in filter_key; + + # Returns: + # Utt2Info object. + # """ + + # if not keep: + # index = np.setdiff1d(np.arange(len(self.key), dtype=np.int64), index) + + # utt_info = self.utt_info.iloc[index] + # return Utt2Info(utt_info) diff --git a/hyperion/utils/kaldi_matrix.py b/hyperion/utils/kaldi_matrix.py index 11726cc7..c8e26cbb 100644 --- a/hyperion/utils/kaldi_matrix.py +++ b/hyperion/utils/kaldi_matrix.py @@ -6,6 +6,7 @@ """ import struct + import numpy as np from ..hyp_defs import float_cpu diff --git a/hyperion/utils/lexicon.py b/hyperion/utils/lexicon.py new file mode 100644 index 00000000..6128c0ff --- /dev/null +++ b/hyperion/utils/lexicon.py @@ -0,0 +1,278 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
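+
+    # add_columns() sketch: join another table's columns into this one
+    # on "id" (the table names are illustrative):
+    #
+    #   segs = InfoTable.load("segments.csv")
+    #   vad = InfoTable.load("vad.csv")
+    #   segs.add_columns(vad)  # adds vad's columns for matching ids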
diff --git a/hyperion/utils/kaldi_matrix.py b/hyperion/utils/kaldi_matrix.py
index 11726cc7..c8e26cbb 100644
--- a/hyperion/utils/kaldi_matrix.py
+++ b/hyperion/utils/kaldi_matrix.py
@@ -6,6 +6,7 @@
 """
 
 import struct
+
 import numpy as np
 
 from ..hyp_defs import float_cpu
diff --git a/hyperion/utils/lexicon.py b/hyperion/utils/lexicon.py
new file mode 100644
index 00000000..6128c0ff
--- /dev/null
+++ b/hyperion/utils/lexicon.py
@@ -0,0 +1,278 @@
+# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+import re
+import sys
+from pathlib import Path
+from typing import List, Tuple
+
+import k2
+
+import torch
+
+
+def read_lexicon(filename: str) -> List[Tuple[str, List[str]]]:
+    """Read a lexicon from `filename`.
+
+    Each line in the lexicon contains "word p1 p2 p3 ...".
+    That is, the first field is a word and the remaining
+    fields are tokens. Fields are separated by space(s).
+
+    Args:
+      filename:
+        Path to the lexicon.txt
+
+    Returns:
+      A list of tuples, e.g., [('w', ['p1', 'p2']), ('w1', ['p3', 'p4'])]
+    """
+    ans = []
+
+    with open(filename, "r", encoding="utf-8") as f:
+        whitespace = re.compile("[ \t]+")
+        for line in f:
+            a = whitespace.split(line.strip(" \t\r\n"))
+            if not a or a == [""]:
+                continue
+
+            if len(a) < 2:
+                logging.info(f"Found bad line {line} in lexicon file {filename}")
+                logging.info("Every line is expected to contain at least 2 fields")
+                sys.exit(1)
+            word = a[0]
+            if word == "<eps>":
+                logging.info(f"Found bad line {line} in lexicon file {filename}")
+                logging.info("<eps> should not be a valid word")
+                sys.exit(1)
+
+            tokens = a[1:]
+            ans.append((word, tokens))
+
+    return ans
+
+
+def write_lexicon(filename: str, lexicon: List[Tuple[str, List[str]]]) -> None:
+    """Write a lexicon to a file.
+
+    Args:
+      filename:
+        Path to the lexicon file to be generated.
+      lexicon:
+        It can be the return value of :func:`read_lexicon`.
+    """
+    with open(filename, "w", encoding="utf-8") as f:
+        for word, tokens in lexicon:
+            f.write(f"{word} {' '.join(tokens)}\n")
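+
+
+# Round-trip sketch (the file path is hypothetical):
+#
+#   lexicon = read_lexicon("data/lang/lexicon.txt")
+#   # -> [("HELLO", ["HH", "AH", "L", "OW"]), ...]
+#   write_lexicon("data/lang/lexicon_copy.txt", lexicon)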
+
+
+def convert_lexicon_to_ragged(
+    filename: str, word_table: k2.SymbolTable, token_table: k2.SymbolTable
+) -> k2.RaggedTensor:
+    """Read a lexicon and convert it to a ragged tensor.
+
+    The ragged tensor has two axes: [word][token].
+
+    Caution:
+      We assume that each word has a unique pronunciation.
+
+    Args:
+      filename:
+        Filename of the lexicon. It has a format that can be read
+        by :func:`read_lexicon`.
+      word_table:
+        The word symbol table.
+      token_table:
+        The token symbol table.
+    Returns:
+      A k2 ragged tensor with two axes [word][token].
+    """
+    disambig_id = word_table["#0"]
+    # We reuse the same words.txt from the phone based lexicon
+    # so that we can share the same G.fst. Here, we have to
+    # exclude some words present only in the phone based lexicon.
+    excluded_words = ["<eps>", "!SIL", "<UNK>"]
+
+    # epsilon is not a word, but it occupies a position
+    #
+    row_splits = [0]
+    token_ids_list = []
+
+    lexicon_tmp = read_lexicon(filename)
+    lexicon = dict(lexicon_tmp)
+    if len(lexicon_tmp) != len(lexicon):
+        raise RuntimeError("It's assumed that each word has a unique pronunciation")
+
+    for i in range(disambig_id):
+        w = word_table[i]
+        if w in excluded_words:
+            row_splits.append(row_splits[-1])
+            continue
+        tokens = lexicon[w]
+        token_ids = [token_table[k] for k in tokens]
+
+        row_splits.append(row_splits[-1] + len(token_ids))
+        token_ids_list.extend(token_ids)
+
+    cached_tot_size = row_splits[-1]
+    row_splits = torch.tensor(row_splits, dtype=torch.int32)
+
+    shape = k2.ragged.create_ragged_shape2(
+        row_splits,
+        None,
+        cached_tot_size,
+    )
+    values = torch.tensor(token_ids_list, dtype=torch.int32)
+
+    return k2.RaggedTensor(shape, values)
+
+
+class Lexicon(object):
+    """Phone based lexicon."""
+
+    def __init__(
+        self,
+        lang_dir: Path,
+        disambig_pattern: re.Pattern = re.compile(r"^#\d+$"),
+    ):
+        """
+        Args:
+          lang_dir:
+            Path to the lang directory. It is expected to contain the following
+            files:
+              - tokens.txt
+              - words.txt
+              - L.pt
+            The above files are produced by the script `prepare.sh`. You
+            should have run that before running the training code.
+          disambig_pattern:
+            It contains the pattern for disambiguation symbols.
+        """
+        lang_dir = Path(lang_dir)
+        self.token_table = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
+        self.word_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
+
+        if (lang_dir / "Linv.pt").exists():
+            logging.info(f"Loading pre-compiled {lang_dir}/Linv.pt")
+            L_inv = k2.Fsa.from_dict(torch.load(lang_dir / "Linv.pt"))
+        else:
+            logging.info("Converting L.pt to Linv.pt")
+            L = k2.Fsa.from_dict(torch.load(lang_dir / "L.pt"))
+            L_inv = k2.arc_sort(L.invert())
+            torch.save(L_inv.as_dict(), lang_dir / "Linv.pt")
+
+        # We save L_inv instead of L because it will be used to intersect with
+        # transcript FSAs, both of whose labels are word IDs.
+        self.L_inv = L_inv
+        self.disambig_pattern = disambig_pattern
+
+    @property
+    def tokens(self) -> List[int]:
+        """Return a list of token IDs excluding those from
+        disambiguation symbols.
+
+        Caution:
+          0 is not a token ID so it is excluded from the return value.
+        """
+        symbols = self.token_table.symbols
+        ans = []
+        for s in symbols:
+            if not self.disambig_pattern.match(s):
+                ans.append(self.token_table[s])
+        if 0 in ans:
+            ans.remove(0)
+        ans.sort()
+        return ans
+
+
+class UniqLexicon(Lexicon):
+    def __init__(
+        self,
+        lang_dir: Path,
+        uniq_filename: str = "uniq_lexicon.txt",
+        disambig_pattern: re.Pattern = re.compile(r"^#\d+$"),
+    ):
+        """
+        Refer to the help information in Lexicon.__init__.
+
+        uniq_filename: It is assumed to be inside the given `lang_dir`.
+
+        Each word in the lexicon is assumed to have a unique pronunciation.
+        """
+        lang_dir = Path(lang_dir)
+        super().__init__(lang_dir=lang_dir, disambig_pattern=disambig_pattern)
+
+        self.ragged_lexicon = convert_lexicon_to_ragged(
+            filename=lang_dir / uniq_filename,
+            word_table=self.word_table,
+            token_table=self.token_table,
+        )
+        # TODO: should we move it to a certain device ?
+
+    def texts_to_token_ids(
+        self, texts: List[str], oov: str = "<UNK>"
+    ) -> k2.RaggedTensor:
+        """
+        Args:
+          texts:
+            A list of transcripts. Each transcript contains space(s)
+            separated words. An example texts is::
+
+                ['HELLO k2', 'HELLO icefall']
+          oov:
+            The OOV word. If a word in `texts` is not in the lexicon, it is
+            replaced with `oov`.
+ Returns: + Return a ragged int tensor with 2 axes [utterance][token_id] + """ + oov_id = self.word_table[oov] + + word_ids_list = [] + for text in texts: + word_ids = [] + for word in text.split(): + if word in self.word_table: + word_ids.append(self.word_table[word]) + else: + word_ids.append(oov_id) + word_ids_list.append(word_ids) + ragged_indexes = k2.RaggedTensor(word_ids_list, dtype=torch.int32) + ans = self.ragged_lexicon.index(ragged_indexes) + ans = ans.remove_axis(ans.num_axes - 2) + return ans + + def words_to_token_ids(self, words: List[str]) -> k2.RaggedTensor: + """Convert a list of words to a ragged tensor containing token IDs. + + We assume there are no OOVs in "words". + """ + word_ids = [self.word_table[w] for w in words] + word_ids = torch.tensor(word_ids, dtype=torch.int32) + + ragged, _ = self.ragged_lexicon.index( + indexes=word_ids, + axis=0, + need_value_indexes=False, + ) + return ragged diff --git a/hyperion/utils/list_utils.py b/hyperion/utils/list_utils.py index 6e805a25..4375183d 100644 --- a/hyperion/utils/list_utils.py +++ b/hyperion/utils/list_utils.py @@ -5,9 +5,10 @@ Utilities for lists. """ -import numpy as np -from operator import itemgetter from itertools import groupby +from operator import itemgetter + +import numpy as np def list2ndarray(a, dtype=None): diff --git a/hyperion/utils/math.py b/hyperion/utils/math.py deleted file mode 100644 index 84596f7d..00000000 --- a/hyperion/utils/math.py +++ /dev/null @@ -1,356 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - - Some math functions. -""" - -import numpy as np -import scipy.linalg as la - -from ..hyp_defs import float_cpu - - -def logdet_pdmat(A): - """Log determinant of positive definite matrix.""" - assert A.shape[0] == A.shape[1] - R = la.cholesky(A) - return 2 * np.sum(np.log(np.diag(R))) - - -def invert_pdmat(A, right_inv=False, return_logdet=False, return_inv=False): - """Inversion of positive definite matrices. - Returns lambda function f that multiplies the inverse of A times a vector. - - Args: - A: Positive definite matrix - right_inv: If False, f(v)=A^{-1}v; if True f(v)=v' A^{-1} - return_logdet: If True, it also returns the log determinant of A. - return_inv: If True, it also returns A^{-1} - - Returns: - Lambda function that multiplies A^{-1} times vector. - Cholesky transform of A upper triangular - Log determinant of A - A^{-1} - """ - assert A.shape[0] == A.shape[1] - R = la.cholesky(A, lower=False) - - if right_inv: - fh = lambda x: la.cho_solve((R, False), x.T).T - else: - fh = lambda x: la.cho_solve((R, False), x) - # fh=lambda x: la.solve_triangular(R, la.solve_triangular(R.T, x, lower=True), lower=False) - - r = [fh, R] - - logdet = None - invA = None - - if return_logdet: - logdet = 2 * np.sum(np.log(np.diag(R))) - r.append(logdet) - - if return_inv: - invA = fh(np.eye(A.shape[0])) - r.append(invA) - - return r - - -def invert_trimat( - A, lower=False, right_inv=False, return_logdet=False, return_inv=False -): - """Inversion of triangular matrices. - Returns lambda function f that multiplies the inverse of A times a vector. - - Args: - A: Triangular matrix. - lower: if True A is lower triangular, else A is upper triangular. - right_inv: If False, f(v)=A^{-1}v; if True f(v)=v' A^{-1} - return_logdet: If True, it also returns the log determinant of A. - return_inv: If True, it also returns A^{-1} - - Returns: - Lambda function that multiplies A^{-1} times vector. 
- Log determinant of A - A^{-1} - """ - - if right_inv: - fh = lambda x: la.solve_triangular(A.T, x.T, lower=not (lower)).T - else: - fh = lambda x: la.solve_triangular(A, x, lower=lower) - - if return_logdet or return_inv: - r = [fh] - else: - r = fh - - if return_logdet: - logdet = np.sum(np.log(np.diag(A))) - r.append(logdet) - - if return_inv: - invA = fh(np.eye(A.shape[0])) - r.append(invA) - - return r - - -def softmax(r, axis=-1): - """ - Returns: - y = \exp(r)/\sum(\exp(r)) - """ - max_r = np.max(r, axis=axis, keepdims=True) - r = np.exp(r - max_r) - r /= np.sum(r, axis=axis, keepdims=True) - return r - - -def logsumexp(r, axis=-1): - """ - Returns: - y = \log \sum(\exp(r)) - """ - max_r = np.max(r, axis=axis, keepdims=True) - r = np.exp(r - max_r) - return np.log(np.sum(r, axis=axis) + 1e-20) + np.squeeze(max_r, axis=axis) - - -def logsigmoid(x): - """ - Returns: - y = \log(sigmoid(x)) - """ - e = np.exp(-x) - f = x < -100 - log_p = -np.log(1 + np.exp(-x)) - log_p[f] = x[f] - return log_p - - -def neglogsigmoid(x): - """ - Returns: - y = -\log(sigmoid(x)) - """ - e = np.exp(-x) - f = x < -100 - log_p = np.log(1 + np.exp(-x)) - log_p[f] = -x[f] - return log_p - - -def sigmoid(x): - """ - Returns: - y = sigmoid(x) - """ - e = np.exp(-x) - f = x < -100 - p = 1 / (1 + np.exp(-x)) - p[f] = 0 - return p - - -def fisher_ratio(mu1, Sigma1, mu2, Sigma2): - """Computes the Fisher ratio between two classes - from the class means and covariances. - """ - S = Sigma1 + Sigma2 - L = invert_pdmat(S)[0] - delta = mu1 - mu2 - return np.inner(delta, L(delta)) - - -def fisher_ratio_with_precs(mu1, Lambda1, mu2, Lambda2): - """Computes the Fisher ratio between two classes - from the class means precisions. - """ - - Sigma1 = invert_pdmat(Lambda1, return_inv=True)[-1] - Sigma2 = invert_pdmat(Lambda2, return_inv=True)[-1] - return fisher_ratio(mu1, Sigma1, mu2, Sigma2) - - -def symmat2vec(A, lower=False, diag_factor=None): - """Puts a symmetric matrix into a vector. - - Args: - A: Symmetric matrix. - lower: If True, it uses the lower triangular part of the matrix. - If False, it uses the upper triangular part of the matrix. - diag_factor: It multiplies the diagonal of A by diag_factor. - - Returns: - Vector with the upper or lower triangular part of A. - """ - if diag_factor is not None: - A = np.copy(A) - A[np.diag_indices(A.shape[0])] *= diag_factor - if lower: - return A[np.tril_indices(A.shape[0])] - return A[np.triu_indices(A.shape[0])] - - -def vec2symmat(v, lower=False, diag_factor=None): - """Puts a vector back into a symmetric matrix. - - Args: - v: Vector with the upper or lower triangular part of A. - lower: If True, v contains the lower triangular part of the matrix. - If False, v contains the upper triangular part of the matrix. - diag_factor: It multiplies the diagonal of A by diag_factor. - - Returns: - Symmetric matrix. - """ - - dim = int((-1 + np.sqrt(1 + 8 * v.shape[0])) / 2) - idx_u = np.triu_indices(dim) - idx_l = np.tril_indices(dim) - A = np.zeros((dim, dim), dtype=float_cpu()) - if lower: - A[idx_l] = v - A[idx_u] = A.T[idx_u] - else: - A[idx_u] = v - A[idx_l] = A.T[idx_l] - if diag_factor is not None: - A[np.diag_indices(A.shape[0])] *= diag_factor - return A - - -def trimat2vec(A, lower=False): - """Puts a triangular matrix into a vector. - - Args: - A: Triangular matrix. - lower: If True, it uses the lower triangular part of the matrix. - If False, it uses the upper triangular part of the matrix. - - Returns: - Vector with the upper or lower triangular part of A. 
- """ - - return symmat2vec(A, lower) - - -def vec2trimat(v, lower=False): - """Puts a vector back into a triangular matrix. - - Args: - v: Vector with the upper or lower triangular part of A. - lower: If True, v contains the lower triangular part of the matrix. - If False, v contains the upper triangular part of the matrix. - - Returns: - Triangular matrix. - """ - dim = int((-1 + np.sqrt(1 + 8 * v.shape[0])) / 2) - A = np.zeros((dim, dim), dtype=float_cpu()) - if lower: - A[np.tril_indices(dim)] = v - return A - A[np.triu_indices(dim)] = v - return A - - -def fullcov_varfloor(S, F, F_is_chol=False, lower=False): - """Variance flooring for full covariance matrices. - - Args: - S: Covariance. - F: Minimum cov or Cholesqy decomposisition of it - F_is_chol: If True F is Cholesqy decomposition - lower: True if cholF is lower triangular, False otherwise - - Returns: - Floored covariance - """ - if isinstance(F, np.ndarray): - if not F_is_chol: - cholF = la.cholesky(F, lower=False, overwrite_a=False) - else: - cholF = F - if lower: - cholF = cholF.T - icholF = invert_trimat(cholF, return_inv=True)[-1] - T = np.dot(np.dot(icholF.T, S), icholF) - else: - T = S / F - - u, d, _ = la.svd(T, full_matrices=False, overwrite_a=True) - d[d < 1.0] = 1 - T = np.dot(u * d, u.T) - - if isinstance(F, np.ndarray): - S = np.dot(cholF.T, np.dot(T, cholF)) - else: - S = F * T - return S - - -def fullcov_varfloor_from_cholS(cholS, cholF, lower=False): - """Variance flooring for full covariance matrices - using Cholesky decomposition as input/output - - Args: - cholS: Cholesqy decomposisition of the covariance. - cholF: Cholesqy decomposisition of the minimum covariance. - lower: True if matrices are lower triangular, False otherwise - - Returns: - Cholesky decomposition of the floored covariance - """ - - if isinstance(cholF, np.ndarray): - if lower: - cholS = cholS.T - cholF = cholF.T - T = np.dot(cholS, invert_trimat(cholF, return_inv=True)[-1]) - else: - if lower: - cholS = cholS.T - T = cholS / cholF - T = np.dot(T.T, T) - u, d, _ = la.svd(T, full_matrices=False, overwrite_a=True) - d[d < 1.0] = 1 - T = np.dot(u * d, u.T) - if isinstance(cholF, np.ndarray): - S = np.dot(cholF.T, np.dot(T, cholF)) - else: - S = (cholF ** 2) * T - return la.cholesky(S, lower) - - -def int2onehot(class_ids, num_classes=None): - """Integer to 1-hot vector. - - Args: - class_ids: Numpy array of integers. - num_classes: Maximum number of classes. - - Returns: - 1-hot Numpy array. - """ - - if num_classes is None: - num_classes = np.max(class_ids) + 1 - - p = np.zeros((len(class_ids), num_classes), dtype=float_cpu()) - p[np.arange(len(class_ids)), class_ids] = 1 - return p - - -def cosine_scoring(x1, x2): - - l2_1 = np.sqrt(np.sum(x1 ** 2, axis=-1, keepdims=True)) - l2_2 = np.sqrt(np.sum(x2 ** 2, axis=-1, keepdims=True)) - x1 = x1 / l2_1 - x2 = x2 / l2_2 - - return np.dot(x1, x2.T) diff --git a/hyperion/utils/math_funcs.py b/hyperion/utils/math_funcs.py new file mode 100644 index 00000000..5ee510b9 --- /dev/null +++ b/hyperion/utils/math_funcs.py @@ -0,0 +1,372 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + + Some math functions. 
+""" + +import numpy as np +import scipy.linalg as la + +from ..hyp_defs import float_cpu + + +def logdet_pdmat(A): + """Log determinant of positive definite matrix.""" + assert A.shape[0] == A.shape[1] + R = la.cholesky(A) + return 2 * np.sum(np.log(np.diag(R))) + + +def invert_pdmat(A, right_inv=False, return_logdet=False, return_inv=False): + """Inversion of positive definite matrices. + Returns lambda function f that multiplies the inverse of A times a vector. + + Args: + A: Positive definite matrix + right_inv: If False, f(v)=A^{-1}v; if True f(v)=v' A^{-1} + return_logdet: If True, it also returns the log determinant of A. + return_inv: If True, it also returns A^{-1} + + Returns: + Lambda function that multiplies A^{-1} times vector. + Cholesky transform of A upper triangular + Log determinant of A + A^{-1} + """ + assert A.shape[0] == A.shape[1] + R = la.cholesky(A, lower=False) + + if right_inv: + fh = lambda x: la.cho_solve((R, False), x.T).T + else: + fh = lambda x: la.cho_solve((R, False), x) + # fh=lambda x: la.solve_triangular(R, la.solve_triangular(R.T, x, lower=True), lower=False) + + r = [fh, R] + + logdet = None + invA = None + + if return_logdet: + logdet = 2 * np.sum(np.log(np.diag(R))) + r.append(logdet) + + if return_inv: + invA = fh(np.eye(A.shape[0])) + r.append(invA) + + return r + + +def invert_trimat( + A, lower=False, right_inv=False, return_logdet=False, return_inv=False +): + """Inversion of triangular matrices. + Returns lambda function f that multiplies the inverse of A times a vector. + + Args: + A: Triangular matrix. + lower: if True A is lower triangular, else A is upper triangular. + right_inv: If False, f(v)=A^{-1}v; if True f(v)=v' A^{-1} + return_logdet: If True, it also returns the log determinant of A. + return_inv: If True, it also returns A^{-1} + + Returns: + Lambda function that multiplies A^{-1} times vector. + Log determinant of A + A^{-1} + """ + + if right_inv: + fh = lambda x: la.solve_triangular(A.T, x.T, lower=not (lower)).T + else: + fh = lambda x: la.solve_triangular(A, x, lower=lower) + + if return_logdet or return_inv: + r = [fh] + else: + r = fh + + if return_logdet: + logdet = np.sum(np.log(np.diag(A))) + r.append(logdet) + + if return_inv: + invA = fh(np.eye(A.shape[0])) + r.append(invA) + + return r + + +def softmax(r, axis=-1): + """ + Returns: + y = \exp(r)/\sum(\exp(r)) + """ + max_r = np.max(r, axis=axis, keepdims=True) + r = np.exp(r - max_r) + r /= np.sum(r, axis=axis, keepdims=True) + return r + + +def logsumexp(r, axis=-1): + """ + Returns: + y = \log \sum(\exp(r)) + """ + max_r = np.max(r, axis=axis, keepdims=True) + r = np.exp(r - max_r) + return np.log(np.sum(r, axis=axis) + 1e-20) + np.squeeze(max_r, axis=axis) + + +def logsigmoid(x): + """ + Returns: + y = \log(sigmoid(x)) + """ + e = np.exp(-x) + f = x < -100 + log_p = -np.log(1 + np.exp(-x)) + log_p[f] = x[f] + return log_p + + +def neglogsigmoid(x): + """ + Returns: + y = -\log(sigmoid(x)) + """ + e = np.exp(-x) + f = x < -100 + log_p = np.log(1 + np.exp(-x)) + log_p[f] = -x[f] + return log_p + + +def sigmoid(x): + """ + Returns: + y = sigmoid(x) + """ + e = np.exp(-x) + f = x < -100 + p = 1 / (1 + np.exp(-x)) + p[f] = 0 + return p + + +def fisher_ratio(mu1, Sigma1, mu2, Sigma2): + """Computes the Fisher ratio between two classes + from the class means and covariances. 
+    """
+    S = Sigma1 + Sigma2
+    L = invert_pdmat(S)[0]
+    delta = mu1 - mu2
+    return np.inner(delta, L(delta))
+
+
+def fisher_ratio_with_precs(mu1, Lambda1, mu2, Lambda2):
+    """Computes the Fisher ratio between two classes
+    from the class means and precisions.
+    """
+
+    Sigma1 = invert_pdmat(Lambda1, return_inv=True)[-1]
+    Sigma2 = invert_pdmat(Lambda2, return_inv=True)[-1]
+    return fisher_ratio(mu1, Sigma1, mu2, Sigma2)
+
+
+def symmat2vec(A, lower=False, diag_factor=None):
+    """Puts a symmetric matrix into a vector.
+
+    Args:
+      A: Symmetric matrix.
+      lower: If True, it uses the lower triangular part of the matrix.
+        If False, it uses the upper triangular part of the matrix.
+      diag_factor: It multiplies the diagonal of A by diag_factor.
+
+    Returns:
+      Vector with the upper or lower triangular part of A.
+    """
+    if diag_factor is not None:
+        A = np.copy(A)
+        A[np.diag_indices(A.shape[0])] *= diag_factor
+    if lower:
+        return A[np.tril_indices(A.shape[0])]
+    return A[np.triu_indices(A.shape[0])]
+
+
+def vec2symmat(v, lower=False, diag_factor=None):
+    """Puts a vector back into a symmetric matrix.
+
+    Args:
+      v: Vector with the upper or lower triangular part of A.
+      lower: If True, v contains the lower triangular part of the matrix.
+        If False, v contains the upper triangular part of the matrix.
+      diag_factor: It multiplies the diagonal of A by diag_factor.
+
+    Returns:
+      Symmetric matrix.
+    """
+
+    dim = int((-1 + np.sqrt(1 + 8 * v.shape[0])) / 2)
+    idx_u = np.triu_indices(dim)
+    idx_l = np.tril_indices(dim)
+    A = np.zeros((dim, dim), dtype=float_cpu())
+    if lower:
+        A[idx_l] = v
+        A[idx_u] = A.T[idx_u]
+    else:
+        A[idx_u] = v
+        A[idx_l] = A.T[idx_l]
+    if diag_factor is not None:
+        A[np.diag_indices(A.shape[0])] *= diag_factor
+    return A
+
+
+def trimat2vec(A, lower=False):
+    """Puts a triangular matrix into a vector.
+
+    Args:
+      A: Triangular matrix.
+      lower: If True, it uses the lower triangular part of the matrix.
+        If False, it uses the upper triangular part of the matrix.
+
+    Returns:
+      Vector with the upper or lower triangular part of A.
+    """
+
+    return symmat2vec(A, lower)
+
+
+def vec2trimat(v, lower=False):
+    """Puts a vector back into a triangular matrix.
+
+    Args:
+      v: Vector with the upper or lower triangular part of A.
+      lower: If True, v contains the lower triangular part of the matrix.
+        If False, v contains the upper triangular part of the matrix.
+
+    Returns:
+      Triangular matrix.
+    """
+    dim = int((-1 + np.sqrt(1 + 8 * v.shape[0])) / 2)
+    A = np.zeros((dim, dim), dtype=float_cpu())
+    if lower:
+        A[np.tril_indices(dim)] = v
+        return A
+    A[np.triu_indices(dim)] = v
+    return A
+
+
+def fullcov_varfloor(S, F, F_is_chol=False, lower=False):
+    """Variance flooring for full covariance matrices.
+
+    Args:
+      S: Covariance.
+      F: Minimum covariance or its Cholesky decomposition.
+      F_is_chol: If True, F is a Cholesky decomposition.
+      lower: True if cholF is lower triangular, False otherwise
+
+    Returns:
+      Floored covariance
+    """
+    if isinstance(F, np.ndarray):
+        if not F_is_chol:
+            cholF = la.cholesky(F, lower=False, overwrite_a=False)
+        else:
+            cholF = F
+            if lower:
+                cholF = cholF.T
+        icholF = invert_trimat(cholF, return_inv=True)[-1]
+        T = np.dot(np.dot(icholF.T, S), icholF)
+    else:
+        T = S / F
+
+    u, d, _ = la.svd(T, full_matrices=False, overwrite_a=True)
+    d[d < 1.0] = 1
+    T = np.dot(u * d, u.T)
+
+    if isinstance(F, np.ndarray):
+        S = np.dot(cholF.T, np.dot(T, cholF))
+    else:
+        S = F * T
+    return S
+
+
+def fullcov_varfloor_from_cholS(cholS, cholF, lower=False):
+    """Variance flooring for full covariance matrices
+    using Cholesky decomposition as input/output
+
+    Args:
+      cholS: Cholesky decomposition of the covariance.
+      cholF: Cholesky decomposition of the minimum covariance.
+      lower: True if matrices are lower triangular, False otherwise
+
+    Returns:
+      Cholesky decomposition of the floored covariance
+    """
+
+    if isinstance(cholF, np.ndarray):
+        if lower:
+            cholS = cholS.T
+            cholF = cholF.T
+        T = np.dot(cholS, invert_trimat(cholF, return_inv=True)[-1])
+    else:
+        if lower:
+            cholS = cholS.T
+        T = cholS / cholF
+    T = np.dot(T.T, T)
+    u, d, _ = la.svd(T, full_matrices=False, overwrite_a=True)
+    d[d < 1.0] = 1
+    T = np.dot(u * d, u.T)
+    if isinstance(cholF, np.ndarray):
+        S = np.dot(cholF.T, np.dot(T, cholF))
+    else:
+        S = (cholF ** 2) * T
+    return la.cholesky(S, lower)
+
+
+def int2onehot(class_ids, num_classes=None):
+    """Integer to 1-hot vector.
+
+    Args:
+      class_ids: Numpy array of integers.
+      num_classes: Maximum number of classes.
+
+    Returns:
+      1-hot Numpy array.
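+
+    Example (output dtype follows float_cpu()):
+      >>> int2onehot(np.array([0, 2]), num_classes=3)
+      array([[1., 0., 0.],
+             [0., 0., 1.]])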
+    """
+
+    if num_classes is None:
+        num_classes = np.max(class_ids) + 1
+
+    p = np.zeros((len(class_ids), num_classes), dtype=float_cpu())
+    p[np.arange(len(class_ids)), class_ids] = 1
+    return p
+
+
+def average_vectors(x, ids):
+    """Averages the rows of x that share the same integer id.
+
+    Args:
+      x: Matrix of row vectors (num_vectors x vector_dim).
+      ids: Integer Numpy array mapping each row of x to an output row in [0, num_ids).
+
+    Returns:
+      Matrix of averaged vectors (num_ids x vector_dim).
+    """
+    assert x.shape[0] == len(ids)
+    num_ids = np.max(ids) + 1
+    x_avg = np.zeros((num_ids, x.shape[1]), dtype=x.dtype)
+    for i in range(num_ids):
+        mask = ids == i
+        x_avg[i] = np.mean(x[mask], axis=0)
+
+    return x_avg
+
+
+def cosine_scoring(x1, x2, ids1=None, ids2=None):
+    """Computes cosine similarity scores between all pairs of rows of x1 and x2,
+    optionally averaging the rows that share the same id first.
+    """
+    if ids1 is not None:
+        x1 = average_vectors(x1, ids1)
+
+    if ids2 is not None:
+        x2 = average_vectors(x2, ids2)
+
+    l2_1 = np.sqrt(np.sum(x1 ** 2, axis=-1, keepdims=True) + 1e-10)
+    l2_2 = np.sqrt(np.sum(x2 ** 2, axis=-1, keepdims=True) + 1e-10)
+    x1 = x1 / l2_1
+    x2 = x2 / l2_2
+
+    return np.dot(x1, x2.T)
diff --git a/hyperion/utils/misc.py b/hyperion/utils/misc.py
index c185b9a3..6afd4a88 100644
--- a/hyperion/utils/misc.py
+++ b/hyperion/utils/misc.py
@@ -4,9 +4,14 @@
  Miscellaneous functions
 """
 
+from inspect import signature
+from pathlib import Path
+from typing import TypeVar
 import numpy as np
 
+PathLike = TypeVar("PathLike", str, Path, type(None))
+
 
 def generate_data(g):
     while 1:
@@ -72,9 +77,8 @@ def energy_vad(P):
 
 
 def compute_snr(x, n, axis=-1):
-
-    P_x = 10 * np.log10(np.mean(x ** 2, axis=axis))
-    P_n = 10 * np.log10(np.mean(n ** 2, axis=axis))
+    P_x = 10 * np.log10(np.mean(x**2, axis=axis))
+    P_n = 10 * np.log10(np.mean(n**2, axis=axis))
     return P_x - P_n
 
 
@@ -88,3 +92,79 @@ def filter_args(valid_args, kwargs):
       Dictionary with only valid_args keys if they exists
     """
     return dict((k, kwargs[k]) for k in valid_args if k in kwargs)
+
+
+def filter_func_args(func, kwargs, skip=None):
+    """Filters arguments expected by a function
+
+    Args:
+      func: function object
+      kwargs: dictionary containing arguments
+      skip: set with keys of func arguments to remove from kwargs
+
+    Returns:
+      Dictionary with arguments expected by the target function
+    """
+    sig = signature(func)
+    valid_args = sig.parameters.keys()
+    # copy skip so that neither the caller's set nor a shared default is mutated
+    skip = set() if skip is None else set(skip)
+    skip.add("self")
+    for param in skip:
+        if param in kwargs:
+            del kwargs[param]
+
+    my_kwargs = filter_args(valid_args, kwargs)
+    if "kwargs" in kwargs:
+        my_kwargs.update(kwargs["kwargs"])
+
+    args = sig.bind_partial(**my_kwargs).arguments
+    return args
+
+
+from tqdm import tqdm
+
+
+def tqdm_urlretrieve_hook(t):
+    """Wraps tqdm instance.
+    Don't forget to close() or __exit__()
+    the tqdm instance once you're done with it (easiest using `with` syntax).
+    Example
+    -------
+    >>> from urllib.request import urlretrieve
+    >>> with tqdm(...) as t:
+    ...     reporthook = tqdm_urlretrieve_hook(t)
+    ...     urlretrieve(..., reporthook=reporthook)
+    Source: https://github.com/tqdm/tqdm/blob/master/examples/tqdm_wget.py
+    """
+    last_b = [0]
+
+    def update_to(b=1, bsize=1, tsize=None):
+        """
+        b : int, optional
+            Number of blocks transferred so far [default: 1].
+        bsize : int, optional
+            Size of each block (in tqdm units) [default: 1].
+        tsize : int, optional
+            Total size (in tqdm units). If [default: None] or -1,
+            remains unchanged.
+        """
+        if tsize not in (None, -1):
+            t.total = tsize
+        displayed = t.update((b - last_b[0]) * bsize)
+        last_b[0] = b
+        return displayed
+
+    return update_to
+
+
+def urlretrieve_progress(url, filename=None, data=None, desc=None):
+    """
+    Works exactly like urllib.request.urlretrieve, but attaches a tqdm hook to display
+    a progress bar of the download.
+    Use "desc" argument to display a user-readable string that informs what is being downloaded.
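+    Example (with a placeholder URL):
+      urlretrieve_progress("http://example.com/data.tar.gz", filename="data.tar.gz", desc="data.tar.gz")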
+ Taken from lhotse: https://github.com/lhotse-speech/lhotse/blob/master/lhotse/utils.py + """ + from urllib.request import urlretrieve + + with tqdm(unit="B", unit_scale=True, unit_divisor=1024, miniters=1, desc=desc) as t: + reporthook = tqdm_urlretrieve_hook(t) + return urlretrieve(url=url, filename=filename, reporthook=reporthook, data=data) diff --git a/hyperion/utils/plotting.py b/hyperion/utils/plotting.py index 7b87dbee..ec617975 100644 --- a/hyperion/utils/plotting.py +++ b/hyperion/utils/plotting.py @@ -3,17 +3,16 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np -import scipy.linalg as la - import matplotlib # matplotlib.use('Agg') import matplotlib.pyplot as plt +import numpy as np +import scipy.linalg as la import scipy.stats as stats from mpl_toolkits.mplot3d import Axes3D as plt3d -from .math import invert_pdmat +from .math_funcs import invert_pdmat def plot_gaussian_1D(mu, C, num_sigmas=3, num_pts=100, weight=1, **kwargs): diff --git a/hyperion/utils/queues.py b/hyperion/utils/queues.py deleted file mode 100644 index ad4298be..00000000 --- a/hyperion/utils/queues.py +++ /dev/null @@ -1,287 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import warnings -import copy -import time -import numpy as np -import multiprocessing -import threading -import six -from abc import abstractmethod - - -try: - import queue -except ImportError: - import Queue as queue - - -class SequenceQueue(object): - """Base class to enqueue inputs. - - The task of an Queue is to use parallelism to speed up preprocessing. - This is done with processes or threads. - - # Examples - - ```python - enqueuer = SequenceQueue(...) - enqueuer.start() - datas = enqueuer.get() - for data in datas: - # Use the inputs; training, evaluating, predicting. - # ... stop sometime. - enqueuer.close() - ``` - - The `enqueuer.get()` should be an infinite stream of datas. - - """ - - @abstractmethod - def is_running(self): - raise NotImplemented - - @abstractmethod - def start(self, workers=1, max_queue_size=10): - """Starts the handler's workers. - - # Arguments - workers: number of worker threads - max_queue_size: queue size - (when full, threads could block on `put()`). - """ - raise NotImplemented - - @abstractmethod - def stop(self, timeout=None): - """Stop running threads and wait for them to exit, if necessary. - - Should be called by the same thread which called start(). - - # Arguments - timeout: maximum time to wait on thread.join() - """ - raise NotImplemented - - @abstractmethod - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. - - # Returns - Generator yielding tuples `(inputs, targets)` - or `(inputs, targets, sample_weights)`. - """ - raise NotImplemented - - -class OrderedQueue(SequenceQueue): - """Builds a Queue from a Sequence. - - Used in `fit_generator`, `evaluate_generator`, `predict_generator`. - - # Arguments - sequence: A `keras.utils.data_utils.Sequence` object. - use_multiprocessing: use multiprocessing if True, otherwise threading - scheduling: Sequential querying of datas if 'sequential', random otherwise. 
- """ - - def __init__(self, sequence, use_multiprocessing=False, scheduling="sequential"): - self.sequence = sequence - self.use_multiprocessing = use_multiprocessing - self.scheduling = scheduling - self.workers = 0 - self.executor = None - self.queue = None - self.run_thread = None - self.stop_signal = None - - def is_running(self): - return self.stop_signal is not None and not self.stop_signal.is_set() - - def start(self, workers=1, max_queue_size=10): - """Start the handler's workers. - - # Arguments - workers: number of worker threads - max_queue_size: queue size - (when full, workers could block on `put()`) - """ - if self.use_multiprocessing: - self.executor = multiprocessing.Pool(workers) - else: - self.executor = ThreadPool(workers) - self.queue = queue.Queue(max_queue_size) - self.stop_signal = threading.Event() - self.run_thread = threading.Thread(target=self._run) - self.run_thread.daemon = True - self.run_thread.start() - - def _run(self): - """Function to submit request to the executor and queue the `Future` objects.""" - sequence = list(range(len(self.sequence))) - while True: - if self.scheduling is not "sequential": - random.shuffle(sequence) - for i in sequence: - if self.stop_signal.is_set(): - return - self.queue.put( - self.executor.apply_async(get_index, (self.sequence, i)), block=True - ) - - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. - - # Returns - Generator yielding tuples (inputs, targets) - or (inputs, targets, sample_weights) - """ - try: - while self.is_running(): - inputs = self.queue.get(block=True).get() - if inputs is not None: - yield inputs - except Exception as e: - self.stop() - raise StopIteration(e) - - def stop(self, timeout=None): - """Stops running threads and wait for them to exit, if necessary. - - Should be called by the same thread which called `start()`. - - # Arguments - timeout: maximum time to wait on `thread.join()` - """ - self.stop_signal.set() - with self.queue.mutex: - self.queue.queue.clear() - self.queue.unfinished_tasks = 0 - self.queue.not_full.notify() - self.executor.close() - self.executor.join() - self.run_thread.join(timeout) - - -class GeneratorQueue(SequenceQueue): - """Builds a queue out of a data generator. - - Used in `fit_generator`, `evaluate_generator`, `predict_generator`. - - # Arguments - generator: a generator function which endlessly yields data - use_multiprocessing: use multiprocessing if True, otherwise threading - wait_time: time to sleep in-between calls to `put()` - random_seed: Initial seed for workers, - will be incremented by one for each workers. - """ - - def __init__( - self, generator, use_multiprocessing=False, wait_time=0.05, random_seed=None - ): - self.wait_time = wait_time - self._generator = generator - self._use_multiprocessing = use_multiprocessing - self._threads = [] - self._stop_event = None - self.queue = None - self.random_seed = random_seed - - def start(self, workers=1, max_queue_size=10): - """Kicks off threads which add data from the generator into the queue. 
-
-        # Arguments
-            workers: number of worker threads
-            max_queue_size: queue size
-                (when full, threads could block on `put()`)
-        """
-
-        def data_generator_task():
-            while not self._stop_event.is_set():
-                try:
-                    if self._use_multiprocessing or self.queue.qsize() < max_queue_size:
-                        generator_output = next(self._generator)
-                        self.queue.put(generator_output)
-                    else:
-                        time.sleep(self.wait_time)
-                except Exception:
-                    self._stop_event.set()
-                    raise
-
-        try:
-            if self._use_multiprocessing:
-                self.queue = multiprocessing.Queue(maxsize=max_queue_size)
-                self._stop_event = multiprocessing.Event()
-            else:
-                self.queue = queue.Queue()
-                self._stop_event = threading.Event()
-
-            for _ in range(workers):
-                if self._use_multiprocessing:
-                    # Reset random seed else all children processes
-                    # share the same seed
-                    np.random.seed(self.random_seed)
-                    thread = multiprocessing.Process(target=data_generator_task)
-                    thread.daemon = True
-                    if self.random_seed is not None:
-                        self.random_seed += 1
-                else:
-                    thread = threading.Thread(target=data_generator_task)
-                self._threads.append(thread)
-                thread.start()
-        except:
-            self.stop()
-            raise
-
-    def is_running(self):
-        return self._stop_event is not None and not self._stop_event.is_set()
-
-    def stop(self, timeout=None):
-        """Stops running threads and wait for them to exit, if necessary.
-
-        Should be called by the same thread which called `start()`.
-
-        # Arguments
-            timeout: maximum time to wait on `thread.join()`.
-        """
-        if self.is_running():
-            self._stop_event.set()
-
-        for thread in self._threads:
-            if thread.is_alive():
-                if self._use_multiprocessing:
-                    thread.terminate()
-                else:
-                    thread.join(timeout)
-
-        if self._use_multiprocessing:
-            if self.queue is not None:
-                self.queue.close()
-
-        self._threads = []
-        self._stop_event = None
-        self.queue = None
-
-    def get(self):
-        """Creates a generator to extract data from the queue.
-
-        Skip the data if it is `None`.
-
-        # Returns
-            A generator
-        """
-        while self.is_running():
-            if not self.queue.empty():
-                inputs = self.queue.get()
-                if inputs is not None:
-                    yield inputs
-            else:
-                time.sleep(self.wait_time)
diff --git a/hyperion/utils/recording_set.py b/hyperion/utils/recording_set.py
new file mode 100644
index 00000000..b266e514
--- /dev/null
+++ b/hyperion/utils/recording_set.py
@@ -0,0 +1,100 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import logging
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+from .info_table import InfoTable
+
+
+class RecordingSet(InfoTable):
+    def __init__(self, df):
+        super().__init__(df)
+        assert "storage_path" in df
+
+    def save(self, file_path, sep=None):
+        """Saves info table to file
+
+        Args:
+          file_path: File to write the list.
+          sep: Separator between the key and file_path in the text file.
+        """
+        file_path = Path(file_path)
+        file_path.parent.mkdir(parents=True, exist_ok=True)
+        ext = file_path.suffix
+        if ext == ".scp":
+            # files with .scp extension are saved as kaldi feats.scp files
+            from .scp_list import SCPList
+
+            scp = SCPList(self.df["id"].values, self.df["storage_path"].values)
+            scp.save(file_path)
+            return
+
+        super().save(file_path, sep)
+
+    @classmethod
+    def load(cls, file_path, sep=None):
+        """Loads RecordingSet from file.
+
+        Args:
+          file_path: File to read the list.
+          sep: Separator between the key and file_path in the text file.
+        Returns:
+          RecordingSet object
+        """
+        file_path = Path(file_path)
+        ext = file_path.suffix
+        if ext == ".scp":
+            # files with .scp extension are loaded as kaldi feats.scp files
+            from .scp_list import SCPList
+
+            scp = SCPList.load(file_path)
+            df_dict = {"id": scp.key, "storage_path": scp.file_path}
+            df = pd.DataFrame(df_dict)
+
+            return cls(df)
+
+        return super().load(file_path, sep)
+
+    @staticmethod
+    def _get_durations(recordings, i, n):
+        from ..io import SequentialAudioReader as AR
+
+        durations = []
+        fss = []
+        with AR(recordings, part_idx=i + 1, num_parts=n) as reader:
+            for data in reader:
+                key, x, fs = data
+                duration = x.shape[0] / fs
+                fss.append(fs)
+                durations.append(duration)
+
+        return fss, durations
+
+    def get_durations(self, num_threads: int = 16):
+        """Computes the duration and sampling frequency of the recordings
+        using up to num_threads threads, and stores them in the duration
+        and sample_freq columns of the table.
+        """
+        import itertools
+        from concurrent.futures import ThreadPoolExecutor
+
+        from tqdm import tqdm
+
+        futures = []
+        num_threads = min(num_threads, len(self.df))
+        logging.info("submitting threads...")
+        with ThreadPoolExecutor(max_workers=num_threads) as pool:
+            for i in tqdm(range(num_threads)):
+                future = pool.submit(RecordingSet._get_durations, self, i, num_threads)
+                futures.append(future)
+
+        logging.info("waiting for threads...")
+        res = [f.result() for f in tqdm(futures)]
+        fss = list(itertools.chain(*[r[0] for r in res]))
+        durations = list(itertools.chain(*[r[1] for r in res]))
+
+        self.df["duration"] = durations
+        self.df["sample_freq"] = fss
diff --git a/hyperion/utils/rttm.py b/hyperion/utils/rttm.py
index 2ff3a4b0..db7c0fae 100644
--- a/hyperion/utils/rttm.py
+++ b/hyperion/utils/rttm.py
@@ -9,8 +9,8 @@ import pandas as pd
 
 from .list_utils import *
-from .vad_utils import *
 from .segment_list import SegmentList
+from .vad_utils import *
 
 
 class RTTM(object):
@@ -636,7 +636,7 @@ def get_bin_sample_mask_for_spk(
 
     tend[tend > max_samples] = max_samples
 
-    vad = np.zeros((max_samples,), dtype=np.bool)
+    vad = np.zeros((max_samples,), dtype=bool)
     for i, j in zip(tbeg, tend):
         if j > i:
             vad[i:j] = True
diff --git a/hyperion/utils/scp_list.py b/hyperion/utils/scp_list.py
index 8109d905..3d8b5e9d 100644
--- a/hyperion/utils/scp_list.py
+++ b/hyperion/utils/scp_list.py
@@ -3,11 +3,11 @@
     Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 
+import logging
 import os
 import os.path as path
 from collections import OrderedDict
 from copy import deepcopy
-import logging
 
 import numpy as np
 
@@ -36,7 +36,7 @@ def __init__(self, key, file_path, offset=None, range_spec=None):
     def validate(self):
         """Validates the attributes of the SCPList object."""
         self.key = list2ndarray(self.key)
-        self.file_path = list2ndarray(self.file_path, dtype=np.object)
+        self.file_path = list2ndarray(self.file_path, dtype=object)
         assert len(self.key) == len(self.file_path)
         if self.offset is not None:
             if isinstance(self.offset, list):
@@ -384,7 +384,7 @@ def shuffle(self, seed=1024, rng=None):
           Index used to shuffle the list.
""" if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) index = np.arange(len(self.key)) rng.shuffle(index) diff --git a/hyperion/utils/segment_list.py b/hyperion/utils/segment_list.py index 33b432bd..0151e967 100644 --- a/hyperion/utils/segment_list.py +++ b/hyperion/utils/segment_list.py @@ -3,8 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os.path as path import logging +import os.path as path from copy import deepcopy import numpy as np diff --git a/hyperion/utils/segment_set.py b/hyperion/utils/segment_set.py new file mode 100644 index 00000000..a99b4e1e --- /dev/null +++ b/hyperion/utils/segment_set.py @@ -0,0 +1,60 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from .info_table import InfoTable + + +class SegmentSet(InfoTable): + """Class to store information about a speech segment + Internally, it uses a pandas table. + """ + + def __init__(self, df): + super().__init__(df) + if "start" in df and "recordings" not in df: + df["recordings"] = df["id"] + + if "start" not in df and "recordings" in df: + df["start"] = 0.0 + + @property + def has_time_marks(self): + return "recordings" in self.df and "start" in self.df and "duration" in self.df + + @property + def has_recording_ids(self): + return "recordings" in self.df + + @property + def has_recordings(self): + return "recordings" in self.df + + def recordings(self, ids=None): + if ids is None: + if "recordings" in self.df: + return self.df["recordings"] + else: + return self.df["id"] + + if "recordings" in self.df: + return self.df.loc[ids, "recordings"] + + return ids + + def recording_ids(self, ids=None): + return self.recordings(ids) + + def recording_time_marks(self, ids, recordings_name: str = "recordings"): + if recordings_name == "recordings": + if "recordings" in self.df: + recordings_name = "recordings" + else: + recordings_name = "id" + + assert "duration" in self.df + if "start" not in self.df: + self.df["start"] = 0.0 + + return self.df.loc[ids, [recordings_name, "start", "duration"]] diff --git a/hyperion/utils/sparse_trial_key.py b/hyperion/utils/sparse_trial_key.py index f18dee3b..62fcd446 100644 --- a/hyperion/utils/sparse_trial_key.py +++ b/hyperion/utils/sparse_trial_key.py @@ -3,15 +3,17 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os.path as path import copy +import os.path as path +from pathlib import Path import numpy as np +import pandas as pd import scipy.sparse as sparse from .list_utils import * -from .trial_ndx import TrialNdx from .trial_key import TrialKey +from .trial_ndx import TrialNdx class SparseTrialKey(TrialKey): @@ -79,6 +81,28 @@ def save_txt(self, file_path): for r, c in zip(non.row, non.col): f.write("%s %s nontarget\n" % (self.model_set[r], self.seg_set[c])) + def save_table(self, file_path, sep=None): + """Saves object to txt file. + + Args: + file_path: File to write the list. 
+        """
+        file_path = Path(file_path)
+        ext = file_path.suffix
+        if sep is None:
+            sep = "\t" if ".tsv" in ext else ","
+
+        with open(file_path, "w", encoding="utf-8") as f:
+            f.write(f"modelid{sep}segmentid{sep}targettype\n")
+            self.tar.eliminate_zeros()
+            self.non.eliminate_zeros()
+            tar = self.tar.tocoo()
+            for r, c in zip(tar.row, tar.col):
+                f.write(f"{self.model_set[r]}{sep}{self.seg_set[c]}{sep}target\n")
+            non = self.non.tocoo()
+            for r, c in zip(non.row, non.col):
+                f.write(f"{self.model_set[r]}{sep}{self.seg_set[c]}{sep}nontarget\n")
+
     @classmethod
     def load_h5(cls, file_path):
         raise NotImplementedError()
@@ -113,6 +137,36 @@ def load_txt(cls, file_path):
                 non[item[0], item[1]] = True
         return cls(model_set, seg_set, tar.tocsr(), non.tocsr())
 
+    @classmethod
+    def load_table(cls, file_path, sep=None):
+        """Loads object from pandas table file
+
+        Args:
+          file_path: File to read the list.
+
+        Returns:
+          SparseTrialKey object.
+        """
+        file_path = Path(file_path)
+        ext = file_path.suffix
+        if sep is None:
+            sep = "\t" if ".tsv" in ext else ","
+
+        df = pd.read_csv(file_path, sep=sep)
+        models = df["modelid"].values
+        segments = df["segmentid"].values
+        is_tar = (df["targettype"] == "target").values
+        model_set, model_idx = np.unique(models, return_inverse=True)
+        seg_set, seg_idx = np.unique(segments, return_inverse=True)
+        tar = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool")
+        non = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool")
+        for i, j, target_type in zip(model_idx, seg_idx, is_tar):
+            if target_type:
+                tar[i, j] = True
+            else:
+                non[i, j] = True
+        return cls(model_set, seg_set, tar.tocsr(), non.tocsr())
+
     @classmethod
     def merge(cls, key_list):
         raise NotImplementedError()
diff --git a/hyperion/utils/sparse_trial_scores.py b/hyperion/utils/sparse_trial_scores.py
index d269c629..760bd1f1 100644
--- a/hyperion/utils/sparse_trial_scores.py
+++ b/hyperion/utils/sparse_trial_scores.py
@@ -3,21 +3,19 @@
     Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 
-
-import os.path as path
-import logging
 import copy
+import logging
+from pathlib import Path
 
 import numpy as np
+import pandas as pd
 import scipy.sparse as sparse
 
-# import h5py
-
 from ..hyp_defs import float_cpu
 from .list_utils import *
-from .trial_ndx import TrialNdx
-from .trial_key import TrialKey
 from .sparse_trial_key import SparseTrialKey
+from .trial_key import TrialKey
+from .trial_ndx import TrialNdx
 from .trial_scores import TrialScores
 
 
@@ -34,7 +32,7 @@ class SparseTrialScores(TrialScores):
     """
 
     def __init__(self, model_set=None, seg_set=None, scores=None, score_mask=None):
-        super(SparseTrialScores, self).__init__(model_set, seg_set, scores, score_mask)
+        super().__init__(model_set, seg_set, scores, score_mask)
 
     def save_h5(self, file_path):
         raise NotImplementedError()
@@ -54,6 +52,26 @@ def save_txt(self, file_path):
                     % (self.model_set[r], self.seg_set[c], self.scores[r, c])
                 )
 
+    def save_table(self, file_path, sep=None):
+        """Saves object to pandas table file.
+
+        Args:
+          file_path: File to write the list.
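+          sep: Column separator; if None, it is inferred from the file
+            extension ("\t" for .tsv, "," otherwise).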
+        """
+        file_path = Path(file_path)
+        ext = file_path.suffix
+        if sep is None:
+            sep = "\t" if ".tsv" in ext else ","
+
+        self.score_mask.eliminate_zeros()
+        score_mask = self.score_mask.tocoo()
+        with open(file_path, "w", encoding="utf-8") as f:
+            f.write(f"modelid{sep}segmentid{sep}LLR\n")
+            for i, j in zip(score_mask.row, score_mask.col):
+                f.write(
+                    f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{self.scores[i,j]}\n"
+                )
+
     @classmethod
     def load_h5(cls, file_path):
         raise NotImplementedError()
@@ -89,6 +107,35 @@ def load_txt(cls, file_path):
                 scores[item[0], item[1]] = item[2]
         return cls(model_set, seg_set, scores.tocsr(), score_mask.tocsr())
 
+    @classmethod
+    def load_table(cls, file_path, sep=None):
+        """Loads object from pandas table file
+
+        Args:
+          file_path: File to read the list.
+
+        Returns:
+          TrialScores object.
+        """
+        file_path = Path(file_path)
+        ext = file_path.suffix
+        if sep is None:
+            sep = "\t" if ".tsv" in ext else ","
+
+        df = pd.read_csv(file_path, sep=sep)
+        models = df["modelid"].values
+        segments = df["segmentid"].values
+        score_list = df["LLR"].values
+        model_set, model_idx = np.unique(models, return_inverse=True)
+        seg_set, seg_idx = np.unique(segments, return_inverse=True)
+        scores = sparse.lil_matrix((len(model_set), len(seg_set)), dtype=float_cpu())
+        score_mask = sparse.lil_matrix(scores.shape, dtype="bool")
+        for i, j, score in zip(model_idx, seg_idx, score_list):
+            score_mask[i, j] = True
+            scores[i, j] = score
+
+        return cls(model_set, seg_set, scores.tocsr(), score_mask.tocsr())
+
     @classmethod
     def merge(cls, scr_list):
         raise NotImplementedError()
@@ -123,7 +170,7 @@ def validate(self):
         assert len(np.unique(self.seg_set)) == len(self.seg_set)
         if self.scores is None:
             self.scores = sparse.csr_matrix(
-                (len(model_set), len(seg_set)), dtype=float_cpu()
+                (len(self.model_set), len(self.seg_set)), dtype=float_cpu()
             )
         else:
             assert self.scores.shape == (len(self.model_set), len(self.seg_set))
@@ -159,45 +206,48 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True):
 
         if not (np.all(f_mod) and np.all(f_seg)):
             for i in (f_mod == 0).nonzero()[0]:
-                logging.info("model %s not found" % model_set[i])
+                logging.info("model %s not found", model_set[i])
             for i in (f_seg == 0).nonzero()[0]:
-                logging.info("segment %s not found" % seg_set[i])
+                logging.info("segment %s not found", seg_set[i])
             if raise_missing:
                 raise Exception("some scores were not computed")
 
-        # model_set = self.model_set[mod_idx]
-        # set_set = self.seg_set[seg_idx]
-        # ix = np.ix_(mod_idx, seg_idx)
-
-        # logging.info('hola1')
-        # new_src = [[self.scores[r,c], i, j] for i,r in enumerate(mod_idx) for j,c in enumerate(seg_idx) if self.score_mask[r,c]]
-        # logging.info('hola2')
-        # new_data = np.array([r[0] for r in new_src], dtype=float_cpu())
-        # new_row = np.array([r[1] for r in new_src], dtype=np.int)
-        # new_col = np.array([r[2] for r in new_src], dtype=np.int)
-        # logging.info('hola3')
-        # shape = (len(model_set), len(seg_set))
-        # scores = sparse.coo_matrix((new_data, (new_row, new_col)), shape=shape).tocsr()
-        # score_mask = sparse.coo_matrix((np.ones(new_data.shape, dtype=np.bool), (new_row, new_col)), shape=shape).tocsr()
-
         num_mod = len(model_set)
         num_seg = len(seg_set)
         shape = (num_mod, num_seg)
         scores = self.scores.tocoo()
         new_data = scores.data
-        new_row = scores.row.copy()
+        new_row = -1 * np.ones_like(scores.row)
         for i, r in enumerate(mod_idx):
-            if f_mod[i] and i != r:
+            if f_mod[i]:
                 idx = scores.row == r
                 new_row[idx] = i
 
-        new_col = scores.col.copy()
+        new_col = -1 * np.ones_like(scores.col)
         for j, c in enumerate(seg_idx):
-            if f_seg[j] and j != c:
+            if f_seg[j]:
                 idx = scores.col == c
                 new_col[idx] = j
 
-        idx = np.logical_and(new_row < num_mod, new_col < num_seg)
+        idx = np.logical_and(new_row != -1, new_col != -1)
         if not np.all(idx):
            new_data = new_data[idx]
            new_row = new_row[idx]
            new_col = new_col[idx]
@@ -207,19 +257,37 @@
         score_mask = self.score_mask.tocoo()
         new_data = score_mask.data
-        new_row = score_mask.row.copy()
+        new_row = -1 * np.ones_like(score_mask.row)
         for i, r in enumerate(mod_idx):
-            if f_mod[i] and i != r:
+            if f_mod[i]:
                 idx = score_mask.row == r
                 new_row[idx] = i
 
-        new_col = score_mask.col.copy()
+        new_col = -1 * np.ones_like(score_mask.col)
         for j, c in enumerate(seg_idx):
-            if f_seg[j] and j != c:
+            if f_seg[j]:
                 idx = score_mask.col == c
                 new_col[idx] = j
 
-        idx = np.logical_and(new_row < num_mod, new_col < num_seg)
+        idx = np.logical_and(new_row != -1, new_col != -1)
         if not np.all(idx):
             new_data = new_data[idx]
             new_row = new_row[idx]
@@ -263,7 +331,7 @@ def align_with_ndx(self, ndx, raise_missing=True):
                 if not scr.score_mask[r, c]:
                     missing_scores = True
                     logging.info(
-                        "missing-scores for %s %s" % (scr.model_set[r], scr.seg_set[c])
+                        "missing-scores for %s %s", scr.model_set[r], scr.seg_set[c]
                     )
 
         if missing_scores and raise_missing:
@@ -288,14 +356,40 @@ def get_tar_non(self, key):
         non = np.array(scr.scores[non_mask])[0]
         return tar, non
 
+    def get_valid_scores(self, ndx=None):
+        """Returns the vector of scores with a valid score_mask,
+        optionally after aligning with a TrialNdx or TrialKey.
+        """
+        if ndx is None:
+            scr = self
+        else:
+            scr = self.align_with_ndx(ndx)
+
+        scores = np.array(scr.scores[scr.score_mask])[0]
+        return scores
+
+    def set_valid_scores(self, scores, ndx=None):
+        """Sets the scores of the trials with a valid score_mask,
+        optionally after aligning with a TrialNdx or TrialKey.
+        """
+        if ndx is not None:
+            scr = self.align_with_ndx(ndx)
+            self.model_set = scr.model_set
+            self.seg_set = scr.seg_set
+            self.scores = scr.scores
+            self.score_mask = scr.score_mask
+
+        self.scores[self.score_mask] = scores
+
     @classmethod
     def from_trial_scores(cls, scr):
-        scores = sparse.csr_matrix(scr.scores)
+        scores = scr.scores * scr.score_mask
+        scores = sparse.csr_matrix(scores)
         score_mask = sparse.csr_matrix(scr.score_mask)
         scores.eliminate_zeros()
         score_mask.eliminate_zeros()
         return cls(scr.model_set, scr.seg_set, scores, score_mask)
 
+    def to_trial_scores(self):
+        scores = self.scores.toarray("C")
+        score_mask = self.score_mask.toarray("C")
+        return TrialScores(self.model_set, self.seg_set, scores, score_mask)
+
     def set_missing_to_value(self, ndx, val):
         """Aligns the scores with a TrialNdx and sets the trials
         with missing scores to the same value.
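
A minimal usage sketch of the sparse table I/O added above; the import path follows this diff, and the file names are illustrative:

```python
from hyperion.utils.sparse_trial_scores import SparseTrialScores

# Load scores from a tab-separated table with columns modelid, segmentid, LLR;
# the "\t" separator is inferred from the .tsv extension.
scr = SparseTrialScores.load_table("scores.tsv")

# Convert to a dense TrialScores object, then write the table back out.
dense_scr = scr.to_trial_scores()
scr.save_table("scores_copy.tsv")
```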
diff --git a/hyperion/utils/text.py b/hyperion/utils/text.py
new file mode 100644
index 00000000..2846fdbf
--- /dev/null
+++ b/hyperion/utils/text.py
@@ -0,0 +1,171 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+from pathlib import Path
+
+try:
+    import k2
+    import k2.version
+except ModuleNotFoundError:
+    from ..torch.utils import dummy_k2 as k2
+
+import numpy as np
+import pandas as pd
+
+import torch
+
+
+# Copied and modified from https://github.com/wenet-e2e/wenet/blob/main/wenet/utils/mask.py
+def subsequent_chunk_mask(
+    size: int,
+    chunk_size: int,
+    num_left_chunks: int = -1,
+    device: torch.device = torch.device("cpu"),
+) -> torch.Tensor:
+    """Create mask for subsequent steps (size, size) with chunk size,
+       this is for streaming encoder
+    Args:
+        size (int): size of mask
+        chunk_size (int): size of chunk
+        num_left_chunks (int): number of left chunks
+            <0: use full chunk
+            >=0: use num_left_chunks
+        device (torch.device): "cpu" or "cuda" or torch.Tensor.device
+    Returns:
+        torch.Tensor: mask
+    Examples:
+        >>> subsequent_chunk_mask(4, 2)
+        [[1, 1, 0, 0],
+         [1, 1, 0, 0],
+         [1, 1, 1, 1],
+         [1, 1, 1, 1]]
+    """
+    ret = torch.zeros(size, size, device=device, dtype=torch.bool)
+    for i in range(size):
+        if num_left_chunks < 0:
+            start = 0
+        else:
+            start = max((i // chunk_size - num_left_chunks) * chunk_size, 0)
+        ending = min((i // chunk_size + 1) * chunk_size, size)
+        ret[i, start:ending] = True
+    return ret
+
+
+def make_pad_mask(lengths: torch.Tensor) -> torch.Tensor:
+    """
+    Args:
+      lengths:
+        A 1-D tensor containing sentence lengths.
+    Returns:
+      Return a 2-D bool tensor, where masked positions
+      are filled with `True` and non-masked positions are
+      filled with `False`.
+
+    >>> lengths = torch.tensor([1, 3, 2, 5])
+    >>> make_pad_mask(lengths)
+    tensor([[False, True, True, True, True],
+            [False, False, False, True, True],
+            [False, False, True, True, True],
+            [False, False, False, False, False]])
+    """
+    assert lengths.ndim == 1, lengths.ndim
+
+    max_len = lengths.max()
+    n = lengths.size(0)
+
+    expanded_lengths = torch.arange(max_len).expand(n, max_len).to(lengths)
+
+    return expanded_lengths >= lengths.unsqueeze(1)
+
+
+def concat(ragged: k2.RaggedTensor, value: int,
+           direction: str) -> k2.RaggedTensor:
+    """Prepend a value to the beginning of each sublist or append a value
+    to the end of each sublist.
+
+    Args:
+      ragged:
+        A ragged tensor with two axes.
+      value:
+        The value to prepend or append.
+      direction:
+        It can be either "left" or "right". If it is "left", we
+        prepend the value to the beginning of each sublist;
+        if it is "right", we append the value to the end of each
+        sublist.
+
+    Returns:
+      Return a new ragged tensor, whose sublists either start with
+      or end with the given value.
+
+    >>> a = k2.RaggedTensor([[1, 3], [5]])
+    >>> a
+    [ [ 1 3 ] [ 5 ] ]
+    >>> concat(a, value=0, direction="left")
+    [ [ 0 1 3 ] [ 0 5 ] ]
+    >>> concat(a, value=0, direction="right")
+    [ [ 1 3 0 ] [ 5 0 ] ]
+
+    """
+    dtype = ragged.dtype
+    device = ragged.device
+
+    assert ragged.num_axes == 2, f"num_axes: {ragged.num_axes}"
+    pad_values = torch.full(
+        size=(ragged.tot_size(0), 1),
+        fill_value=value,
+        device=device,
+        dtype=dtype,
+    )
+    pad = k2.RaggedTensor(pad_values)
+
+    if direction == "left":
+        ans = k2.ragged.cat([pad, ragged], axis=1)
+    elif direction == "right":
+        ans = k2.ragged.cat([ragged, pad], axis=1)
+    else:
+        raise ValueError(
+            f'Unsupported direction: {direction}. Expect either "left" or "right"'
+        )
+    return ans
+
+
+def add_sos(ragged: k2.RaggedTensor, sos_id: int) -> k2.RaggedTensor:
+    """Add SOS to each sublist.
+
+    Args:
+      ragged:
+        A ragged tensor with two axes.
+      sos_id:
+        The ID of the SOS symbol.
+
+    Returns:
+      Return a new ragged tensor, where each sublist starts with SOS.
+
+    >>> a = k2.RaggedTensor([[1, 3], [5]])
+    >>> a
+    [ [ 1 3 ] [ 5 ] ]
+    >>> add_sos(a, sos_id=0)
+    [ [ 0 1 3 ] [ 0 5 ] ]
+
+    """
+    return concat(ragged, sos_id, direction="left")
+
+
+def read_text(text_file: str):
+    """Reads a Kaldi-style text file, with one "utt-id transcript" pair per
+    line, into a pandas DataFrame with id and text columns.
+    """
+    # assert check_argument_types()
+    text_file = Path(text_file)
+
+    data = {"id": [], "text": []}
+    with Path(text_file).open("r", encoding="utf-8") as f:
+        for linenum, line in enumerate(f, 1):
+            sps = line.rstrip().split(maxsplit=1)
+            if len(sps) == 1:
+                k, v = sps[0], ""
+            else:
+                k, v = sps
+            # if k in data:
+            #     raise RuntimeError(f"{k} is duplicated ({path}:{linenum})")
+            data["id"].append(k)
+            data["text"].append(v)
+    return pd.DataFrame(data=data, index=data["id"])
diff --git a/hyperion/utils/time_units.py b/hyperion/utils/time_units.py
index f8ed0846..6004329b 100644
--- a/hyperion/utils/time_units.py
+++ b/hyperion/utils/time_units.py
@@ -3,6 +3,7 @@
     Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 import math
+
 import numpy as np
diff --git a/hyperion/utils/train_val_eval_list.py b/hyperion/utils/train_val_eval_list.py
index f8cc4ca0..cbccf093 100644
--- a/hyperion/utils/train_val_eval_list.py
+++ b/hyperion/utils/train_val_eval_list.py
@@ -3,8 +3,8 @@
     Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 
-import os.path as path
 import logging
+import os.path as path
 from collections import OrderedDict
 from copy import deepcopy
 
@@ -207,7 +207,7 @@ def create(
             part_names = ["train", "eval"]
 
         if shuffle:
-            rng = np.random.RandomState(seed=seed)
+            rng = np.random.default_rng(seed=seed)
 
         if group_by_key is None:
             group_by_key = segment_key
diff --git a/hyperion/utils/trial_key.py b/hyperion/utils/trial_key.py
index b22babda..539a049d 100644
--- a/hyperion/utils/trial_key.py
+++ b/hyperion/utils/trial_key.py
@@ -3,13 +3,16 @@
     Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 
-import os.path as path
 import copy
+import os.path as path
+from pathlib import Path
 
-import numpy as np
 import h5py
+import numpy as np
+import pandas as pd
 
-from .list_utils import *
+# from .list_utils import *
+from .list_utils import intersect, ismember, list2ndarray, sort, split_list
 from .trial_ndx import TrialNdx
 
 
@@ -82,18 +85,20 @@ def sort(self):
         if self.trial_cond is not None:
             self.trial_cond = self.trial_cond[:, ix]
 
-    def save(self, file_path):
+    def save(self, file_path, sep=None):
         """Saves object to txt/h5 file.
 
         Args:
           file_path: File to write the list.
         """
-
-        file_base, file_ext = path.splitext(file_path)
-        if file_ext == ".h5" or file_ext == ".hdf5":
+        file_path = Path(file_path)
+        ext = file_path.suffix
+        if ext in (".h5", ".hdf5"):
             self.save_h5(file_path)
-        else:
+        elif ext in ("", ".txt"):
             self.save_txt(file_path)
+        else:
+            self.save_table(file_path, sep)
 
     def save_h5(self, file_path):
         """Saves object to h5 file.
@@ -132,20 +137,40 @@ def save_txt(self, file_path):
           file_path: File to write the list.
""" with open(file_path, "w") as f: - idx = (self.tar.T == True).nonzero() + idx = (self.tar.T).nonzero() for item in zip(idx[0], idx[1]): f.write( "%s %s target\n" % (self.model_set[item[1]], self.seg_set[item[0]]) ) - idx = (self.non.T == True).nonzero() + idx = (self.non.T).nonzero() for item in zip(idx[0], idx[1]): f.write( "%s %s nontarget\n" % (self.model_set[item[1]], self.seg_set[item[0]]) ) + def save_table(self, file_path, sep=None): + """Saves object to txt file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}targettype\n") + I, J = np.logical_or(self.tar, self.non).nonzero() + for i, j in zip(I, J): + target_type = "target" if self.tar[i, j] else "nontarget" + f.write( + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{target_type}\n" + ) + @classmethod - def load(cls, file_path): + def load(cls, file_path, sep=None): """Loads object from txt/h5 file Args: @@ -154,11 +179,14 @@ def load(cls, file_path): Returns: TrialKey object. """ - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) - else: + elif file_ext in ("", ".txt"): return cls.load_txt(file_path) + else: + return cls.load_table(file_path, sep) @classmethod def load_h5(cls, file_path): @@ -240,6 +268,36 @@ def load_txt(cls, file_path): non[item[0], item[1]] = True return cls(model_set, seg_set, tar, non) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from pandas table file + + Args: + file_path: File to read the list. + + Returns: + TrialKey object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + is_tar = (df["targettype"] == "target").values + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) + tar = np.zeros((len(model_set), len(seg_set)), dtype="bool") + non = np.zeros((len(model_set), len(seg_set)), dtype="bool") + for i, j, target_type in zip(model_idx, seg_idx, is_tar): + if target_type: + tar[i, j] = True + else: + non[i, j] = True + return cls(model_set, seg_set, tar, non) + @classmethod def merge(cls, key_list): """Merges several key objects. 
@@ -363,7 +421,7 @@ def filter(self, model_set, seg_set, keep=True):
         assert np.all(f)
 
         model_set = self.model_set[mod_idx]
-        set_set = self.seg_set[seg_idx]
+        seg_set = self.seg_set[seg_idx]
         ix = np.ix_(mod_idx, seg_idx)
         tar = self.tar[ix]
         non = self.non[ix]
diff --git a/hyperion/utils/trial_ndx.py b/hyperion/utils/trial_ndx.py
index 783f39c4..b7b873df 100644
--- a/hyperion/utils/trial_ndx.py
+++ b/hyperion/utils/trial_ndx.py
@@ -3,13 +3,15 @@
     Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 
-import os.path as path
 import copy
+from pathlib import Path
 
-import numpy as np
 import h5py
+import numpy as np
+import pandas as pd
 
-from .list_utils import *
+# from .list_utils import *
+from .list_utils import sort, intersect, ismember, split_list, list2ndarray
 
 
 class TrialNdx(object):
@@ -46,17 +48,20 @@ def sort(self):
         self.seg_set, s_idx = sort(self.seg_set, return_index=True)
         self.trial_mask = self.trial_mask[np.ix_(m_idx, s_idx)]
 
-    def save(self, file_path):
+    def save(self, file_path, sep=None):
         """Saves object to txt/h5 file.
 
         Args:
           file_path: File to write the list.
         """
-        file_base, file_ext = path.splitext(file_path)
-        if file_ext == ".h5" or file_ext == ".hdf5":
+        file_path = Path(file_path)
+        file_ext = file_path.suffix
+        if file_ext in [".h5", ".hdf5"]:
             self.save_h5(file_path)
-        else:
+        elif file_ext in [".txt", ""]:
             self.save_txt(file_path)
+        else:
+            self.save_table(file_path, sep=sep)
 
     def save_h5(self, file_path):
         """Saves object to h5 file.
@@ -71,15 +76,6 @@ def save_h5(self, file_path):
             f.create_dataset("ID/column_ids", data=seg_set)
             f.create_dataset("trial_mask", data=self.trial_mask.astype("uint8"))
 
-        # model_set = self.model_set.astype('S')
-        # f.create_dataset('ID/row_ids', self.model_set.shape, dtype=model_set.dtype)
-        # f['ID/row_ids'] = model_set
-        # seg_set = self.seg_set.astype('S')
-        # f.create_dataset('ID/column_ids', self.seg_set.shape, dtype=seg_set.dtype)
-        # f['ID/column_ids'] = seg_set
-        # f.create_dataset('trial_mask', self.trial_mask.shape, dtype='uint8')
-        # f['trial_mask'] = self.trial_mask.astype('uint8')
-
     def save_txt(self, file_path):
         """Saves object to txt file.
 
@@ -91,8 +87,25 @@ def save_txt(self, file_path):
             for item in zip(idx[0], idx[1]):
                 f.write("%s %s\n" % (self.model_set[item[1]], self.seg_set[item[0]]))
 
+    def save_table(self, file_path, sep=None):
+        """Saves object to pandas table file.
+
+        Args:
+          file_path: File to write the list.
+          sep: Column separator; if None, it is inferred from the file extension.
+        """
+        file_path = Path(file_path)
+        ext = file_path.suffix
+        if sep is None:
+            sep = "\t" if ".tsv" in ext else ","
+
+        with open(file_path, "w", encoding="utf-8") as f:
+            f.write(f"modelid{sep}segmentid\n")
+            I, J = self.trial_mask.nonzero()
+            for i, j in zip(I, J):
+                f.write(f"{self.model_set[i]}{sep}{self.seg_set[j]}\n")
+
     @classmethod
-    def load(cls, file_path):
+    def load(cls, file_path, sep=None):
         """Loads object from txt/h5 file
 
         Args:
@@ -101,11 +114,14 @@ def load(cls, file_path):
         Returns:
           TrialNdx object.
""" - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) - else: + elif file_ext in ("", ".txt"): return cls.load_txt(file_path) + else: + return cls.load_table(file_path, sep) @classmethod def load_h5(cls, file_path): @@ -148,6 +164,36 @@ def load_txt(cls, file_path): trial_mask[item[0], item[1]] = True return cls(model_set, seg_set, trial_mask) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from pandas table file + + Args: + file_path: File to read the list. + + Returns: + TrialNdx object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + model_set, _, model_idx = np.unique( + models, return_index=True, return_inverse=True + ) + seg_set, _, seg_idx = np.unique( + segments, return_index=True, return_inverse=True + ) + trial_mask = np.zeros((len(model_set), len(seg_set)), dtype="bool") + for i, j in zip(model_idx, seg_idx): + trial_mask[i, j] = True + + return cls(model_set, seg_set, trial_mask) + @classmethod def merge(cls, ndx_list): """Merges several index objects. @@ -320,7 +366,7 @@ def __ne__(self, other): def __cmp__(self, other): """Comparison operator""" - if self.__eq__(oher): + if self.__eq__(other): return 0 return 1 diff --git a/hyperion/utils/trial_scores.py b/hyperion/utils/trial_scores.py index 19e17190..4a5e59da 100644 --- a/hyperion/utils/trial_scores.py +++ b/hyperion/utils/trial_scores.py @@ -3,18 +3,20 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - -import os.path as path -import logging import copy +import logging +from pathlib import Path -import numpy as np import h5py +import numpy as np +import pandas as pd from ..hyp_defs import float_cpu -from .list_utils import * -from .trial_ndx import TrialNdx + +# from .list_utils import * +from .list_utils import intersect, ismember, list2ndarray, sort, split_list from .trial_key import TrialKey +from .trial_ndx import TrialNdx class TrialScores(object): @@ -26,13 +28,22 @@ class TrialScores(object): seg_set: List of test segment names. scores: Matrix with the scores (num_models x num_segments). score_mask: Boolean matrix with the trials with valid scores to True (num_models x num_segments). + q_measures: optional dictionary of quality measure matrices """ - def __init__(self, model_set=None, seg_set=None, scores=None, score_mask=None): + def __init__( + self, + model_set=None, + seg_set=None, + scores=None, + score_mask=None, + q_measures=None, + ): self.model_set = model_set self.seg_set = seg_set self.scores = scores self.score_mask = score_mask + self.q_measures = q_measures if (model_set is not None) and (seg_set is not None): self.validate() @@ -55,18 +66,24 @@ def sort(self): ix = np.ix_(m_idx, s_idx) self.scores = self.scores[ix] self.score_mask = self.score_mask[ix] + if self.q_measures is not None: + for k in self.q_measures.keys(): + self.q_measures[k] = self.q_measures[k][ix] - def save(self, file_path): + def save(self, file_path, sep=None): """Saves object to txt/h5 file. Args: file_path: File to write the list. 
""" - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in [".h5", ".hdf5"]: self.save_h5(file_path) - else: + elif file_ext in ["", ".txt"]: self.save_txt(file_path) + else: + self.save_table(file_path, sep=sep) def save_h5(self, file_path): """Saves object to h5 file. @@ -81,6 +98,10 @@ def save_h5(self, file_path): f.create_dataset("ID/column_ids", data=seg_set) f.create_dataset("scores", data=self.scores) f.create_dataset("score_mask", data=self.score_mask.astype("uint8")) + if self.q_measures is not None: + q_grp = f.create_group("q_measures") + for k, v in self.q_measures.items(): + q_grp.create_dataset(k, data=v) def save_txt(self, file_path): """Saves object to txt file. @@ -100,8 +121,38 @@ def save_txt(self, file_path): ) ) + if self.q_measures is not None: + logging.warning("q_measures cannot be saved to txt file") + + def save_table(self, file_path, sep=None): + """Saves object to pandas tabnle file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + q_str = "" + if self.q_measures is not None: + q_str = sep + sep.join(self.q_measures.keys()) + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}LLR{q_str}\n") + I, J = self.score_mask.nonzero() + for i, j in zip(I, J): + if self.q_measures is not None: + q_str = sep + sep.join( + [str(v[i, j]) for k, v in self.q_measures.items()] + ) + f.write( + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{self.scores[i,j]}{q_str}\n" + ) + @classmethod - def load(cls, file_path): + def load(cls, file_path, sep=None): """Loads object from txt/h5 file Args: @@ -110,11 +161,14 @@ def load(cls, file_path): Returns: TrialScores object. """ - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) - else: + elif file_ext in ("", ".txt"): return cls.load_txt(file_path) + else: + return cls.load_table(file_path, sep) @classmethod def load_h5(cls, file_path): @@ -131,7 +185,12 @@ def load_h5(cls, file_path): seg_set = [t.decode("utf-8") for t in f["ID/column_ids"]] scores = np.asarray(f["scores"], dtype=float_cpu()) score_mask = np.asarray(f["score_mask"], dtype="bool") - return cls(model_set, seg_set, scores, score_mask) + if "q_measures" in f: + q_grp = f["q_measures"] + q_measures = {k: q_grp[k] for k in q_grp} + else: + q_measures = None + return cls(model_set, seg_set, scores, score_mask, q_measures) @classmethod def load_txt(cls, file_path): @@ -163,6 +222,49 @@ def load_txt(cls, file_path): scores[item[0], item[1]] = item[2] return cls(model_set, seg_set, scores, score_mask) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from pandas table file + + Args: + file_path: File to read the list. + + Returns: + TrialScores object. 
+ """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + score_list = df["LLR"].values + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) + score_mask = np.zeros((len(model_set), len(seg_set)), dtype="bool") + scores = np.zeros((len(model_set), len(seg_set)), dtype=float_cpu()) + for i, j, score in zip(model_idx, seg_idx, score_list): + score_mask[i, j] = True + scores[i, j] = score + + if len(df.columns) > 3: + q_names = df.columns[3:] + q_vals = df.iloc[:, 3:].values + q_measures = {} + for q_name in q_names: + q_measures[q_name] = np.zeros(scores.shape, dtype=float_cpu()) + + for i, j, q_row in zip(model_idx, seg_idx, q_vals): + for col, q_name in enumerate(q_names): + q_measures[q_name][i, j] = q_row[col] + + else: + q_measures = None + + return cls(model_set, seg_set, scores, score_mask, q_measures) + @classmethod def merge(cls, scr_list): """Merges several score objects. @@ -178,6 +280,7 @@ def merge(cls, scr_list): seg_set = scr_list[0].seg_set scores = scr_list[0].scores score_mask = scr_list[0].score_mask + q_measures = scr_list[0].q_measures for i in range(1, num_scr): scr_i = scr_list[i] new_model_set = np.union1d(model_set, scr_i.model_set) @@ -196,6 +299,10 @@ def merge(cls, scr_list): scores_1[ix_a] = scores[ix_b] score_mask_1 = np.zeros(shape, dtype="bool") score_mask_1[ix_a] = score_mask[ix_b] + if q_measures is not None: + q_measures_1 = {k: np.zeros(shape) for k in q_measures.keys()} + for k in q_measures.keys(): + q_measures_1[k][ix_a] = q_measures[k][ix_b] trial_mask_2 = np.zeros( (len(new_model_set), len(new_seg_set)), dtype="bool" @@ -212,14 +319,21 @@ def merge(cls, scr_list): scores_2[ix_a] = scr_i.scores[ix_b] score_mask_2 = np.zeros(shape, dtype="bool") score_mask_2[ix_a] = scr_i.score_mask[ix_b] + if q_measures is not None: + q_measures_2 = {k: np.zeros(shape) for k in q_measures.keys()} + for k in q_measures.keys(): + q_measures_2[k][ix_a] = scr_i.q_measures[k][ix_b] model_set = new_model_set seg_set = new_seg_set scores = scores_1 + scores_2 assert not (np.any(np.logical_and(score_mask_1, score_mask_2))) score_mask = np.logical_or(score_mask_1, score_mask_2) + if q_measures is not None: + for k in q_measures.keys(): + q_measures[k] = q_measures_1[k] + q_measures_2[k] - return cls(model_set, seg_set, scores, score_mask) + return cls(model_set, seg_set, scores, score_mask, q_measures) def filter(self, model_set, seg_set, keep=True, raise_missing=True): """Removes elements from TrialScores object. @@ -235,24 +349,28 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): Filtered TrialScores object. 
""" - if not (keep): + if not keep: model_set = np.setdiff1d(self.model_set, model_set) seg_set = np.setdiff1d(self.model_set, seg_set) f_mod, mod_idx = ismember(model_set, self.model_set) f_seg, seg_idx = ismember(seg_set, self.seg_set) - + q_measures = None if np.all(f_mod) and np.all(f_seg): model_set = self.model_set[mod_idx] - set_set = self.seg_set[seg_idx] + seg_set = self.seg_set[seg_idx] ix = np.ix_(mod_idx, seg_idx) scores = self.scores[ix] score_mask = self.score_mask[ix] + if self.q_measures is not None: + q_measures = {} + for k in self.q_measures.keys(): + q_measures[k] = self.q_measures[k][ix] else: for i in (f_mod == 0).nonzero()[0]: - logging.info("model %s not found" % model_set[i]) + logging.info("model %s not found", model_set[i]) for i in (f_seg == 0).nonzero()[0]: - logging.info("segment %s not found" % seg_set[i]) + logging.info("segment %s not found", seg_set[i]) if raise_missing: raise Exception("some scores were not computed") @@ -262,8 +380,13 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): ix2 = np.ix_(mod_idx[f_mod], seg_idx[f_seg]) scores[ix1] = self.scores[ix2] score_mask[ix1] = self.score_mask[ix2] + if self.q_measures is not None: + q_measures = {} + for k in self.q_measures.keys(): + q_measures[k] = np.zeros(scores.shape, dtype=float_cpu()) + q_measures[k][ix1] = self.q_measures[k][ix2] - return TrialScores(model_set, seg_set, scores, score_mask) + return TrialScores(model_set, seg_set, scores, score_mask, q_measures) def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts): """Splits the TrialScores into num_model_parts x num_seg_parts and returns part @@ -284,7 +407,13 @@ def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts): ix = np.ix_(model_idx1, seg_idx1) scores = self.scores[ix] score_mask = self.score_mask[ix] - return TrialScores(model_set, seg_set, scores, score_mask) + q_measures = None + if self.q_measures is not None: + q_measures = {} + for k in self.q_measures.keys(): + q_measures[k] = self.q_measures[k][ix] + + return TrialScores(model_set, seg_set, scores, score_mask, q_measures) def validate(self): """Validates the attributes of the TrialScores object.""" @@ -306,6 +435,10 @@ def validate(self): else: assert self.score_mask.shape == (len(self.model_set), len(self.seg_set)) + if self.q_measures is not None: + for k in self.q_measures.keys(): + assert self.q_measures[k].shape == self.scores.shape + def align_with_ndx(self, ndx, raise_missing=True): """Aligns scores, model_set and seg_set with TrialNdx or TrialKey. @@ -356,6 +489,34 @@ def get_tar_non(self, key): non = scr.scores[non_mask] return tar, non + def get_tar_non_q_measures(self, key, q_names=None, return_dict=False): + """Returns target and non target scores. + + Args: + key: TrialKey object. + q_names: names of quality measures to return, if None it will return all + + Returns: + Numpy array with target scores. + Numpy array with non-target scores. 
+ """ + scr = self.align_with_ndx(key) + tar_mask = np.logical_and(scr.score_mask, key.tar) + if q_names is None: + q_names = self.q_measures.keys() + tar = {} + for k in q_names: + tar[k] = self.q_measures[k][tar_mask] + non_mask = np.logical_and(scr.score_mask, key.non) + non = {} + for k in q_names: + non[k] = self.q_measures[k][non_mask] + + if not return_dict: + tar = np.vstack(tuple(tar[k] for k in q_names)).T + non = np.vstack(tuple(non[k] for k in q_names)).T + return tar, non + def set_missing_to_value(self, ndx, val): """Aligns the scores with a TrialNdx and sets the trials with missing scores to the same value. @@ -394,6 +555,18 @@ def __eq__(self, other): eq = eq and np.all(self.seg_set == other.seg_set) eq = eq and np.all(np.isclose(self.scores, other.scores, atol=1e-5)) eq = eq and np.all(self.score_mask == other.score_mask) + if self.q_measures is not None: + eq = eq and other.q_measures is not None + if eq: + eq = self.q_measures.keys() == other.q_measures.keys() + if eq: + for k in self.q_measures.keys(): + eq = eq and np.all( + np.isclose( + self.q_measures[k], other.q_measures[k], atol=1e-5 + ) + ) + return eq def __ne__(self, other): @@ -402,12 +575,11 @@ def __ne__(self, other): def __cmp__(self, other): """Comparison operator""" - if self.__eq__(oher): + if self.__eq__(other): return 0 return 1 def test(key_file="core-core_det5_key.h5"): - key = TrialKey.load(key_file) mask = np.logical_or(key.tar, key.non) diff --git a/hyperion/utils/trial_stats.py b/hyperion/utils/trial_stats.py index 229bad3c..7d9d74d1 100644 --- a/hyperion/utils/trial_stats.py +++ b/hyperion/utils/trial_stats.py @@ -4,16 +4,16 @@ """ -import os.path as path -import logging import copy +import logging +import os.path as path import numpy as np import pandas as pd from ..hyp_defs import float_cpu -from .trial_ndx import TrialNdx from .trial_key import TrialKey +from .trial_ndx import TrialNdx class TrialStats(object): diff --git a/hyperion/utils/utt2info.py b/hyperion/utils/utt2info.py index 3cf4179b..c1c429f2 100644 --- a/hyperion/utils/utt2info.py +++ b/hyperion/utils/utt2info.py @@ -3,8 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os.path as path import logging +import os.path as path from collections import OrderedDict from copy import deepcopy @@ -142,7 +142,7 @@ def save(self, file_path, sep=" "): self.utt_info.to_csv(file_path, sep=sep, header=False, index=False) @classmethod - def load(cls, file_path, sep=" ", dtype={0: np.str, 1: np.str}): + def load(cls, file_path, sep=" ", dtype={0: np.str_, 1: np.str_}): """Loads utt2info list from text file. Args: @@ -261,7 +261,7 @@ def shuffle(self, seed=1024, rng=None): Index used to shuffle the list. 
""" if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) index = np.arange(len(self.key)) rng.shuffle(index) self.utt_info = self.utt_info.iloc[index] diff --git a/hyperion/utils/vad_utils.py b/hyperion/utils/vad_utils.py index 2d68bc5c..4f3f980e 100644 --- a/hyperion/utils/vad_utils.py +++ b/hyperion/utils/vad_utils.py @@ -135,7 +135,7 @@ def vad_timestamps_to_bin( if max_frames is not None and num_frames < max_frames: num_frames = max_frames - vad = np.zeros((num_frames,), dtype=np.bool) + vad = np.zeros((num_frames,), dtype=bool) frame_start = np.ceil( (in_timestamps[:, 0] - (pad + frame_center)) / frame_shift ).astype(dtype=np.int) @@ -242,7 +242,7 @@ def intersect_segment_timestamps_with_vad(in_timestamps, vad_timestamps): vad_start = vad_timestamps[:, 0] vad_end = vad_timestamps[:, 1] num_vad_segs = len(vad_start) - speech_idx = np.zeros((in_timestamps.shape[0],), dtype=np.bool) + speech_idx = np.zeros((in_timestamps.shape[0],), dtype=bool) out_timestamps = [] out_timestamps2speech_segs = [] count_speech = 0 diff --git a/hyperion/vb_pdfs/core/exponential_family.py b/hyperion/vb_pdfs/core/exponential_family.py deleted file mode 100644 index c3e59040..00000000 --- a/hyperion/vb_pdfs/core/exponential_family.py +++ /dev/null @@ -1,139 +0,0 @@ -""" - Copyright 2017 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np - -from abc import ABCMeta, abstractmethod -from .pdf import PDF - - -class ExpFamily(PDF): - __metaclass__ = ABCMeta - - def __init__(self, eta=None, **kwargs): - super(ExpFamily, self).__init__(**kwargs) - self.eta = eta - self.A = None - - def fit( - self, x, sample_weight=None, x_val=None, sample_weight_val=None, batch_size=None - ): - - N, u_x = self.Estep(x=x, sample_weight=sample_weight, batch_size=batch_size) - self.Mstep(N, u_x) - elbo = self.elbo(x, N=N, u_x=u_x) - elbo = [elbo, elbo / N] - - if x_val is not None: - N, u_x = self.Estep( - x=x_val, sample_weight=sample_weight_val, batch_size=batch_size - ) - elbo_val = self.elbo(x_val, N=N, u_x=u_x) - elbo += [elbo_val, elbo_val / N] - return elbo - - def log_h(self, x): - return 0 - - def accum_logh(self, x, sample_weight=None): - if sample_weight is None: - return np.sum(self.logh(x)) - return np.sum(sample_weight * self.logh(x)) - - def compute_suff_stats(self, x): - return x - - def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None): - if u_x is not None or batch_size is None: - return self._accum_suff_stats_1batch(x, u_x, sample_weight) - else: - return self._accum_suff_stats_nbatches(x, sample_weight, batch_size) - - def _accum_suff_stats_1batch(self, x, u_x=None, sample_weight=None): - if u_x is None: - u_x = self.compute_suff_stats(x) - if sample_weight is None: - N = u_x.shape[0] - else: - u_x *= sample_weight[:, None] - N = np.sum(sample_weight) - acc_u_x = np.sum(u_x, axis=0) - return N, acc_u_x - - def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size): - sw_i = None - for i1 in range(0, x.shape[0], batch_size): - i2 = np.minimum(i1 + batch_size, x.shape[0]) - x_i = x[i1:i2, :] - if sample_weight is not None: - sw_i = sample_weight[i1:i2] - N_i, u_x_i = self._accum_suff_stats_1batch(x_i, sample_weight=sw_i) - if i1 == 0: - N = N_i - u_x = u_x_i - else: - N += N_i - u_x += u_x_i - return N, u_x - - def add_suff_stats(self, N, u_x): - assert len(N) == len(u_x) - acc_N = N[1] - acc_u_x = u_x[1] - for i in range(1, len(N)): - acc_N += N - acc_u_x += 
u[i] - return acc_N, acc_u_x - - def Estep(self, x, u_x=None, sample_weight=None, batch_size=None): - return self.accum_suff_stats(x, u_x, sample_weight, batch_size) - - @abstractmethod - def Mstep(self, stats): - pass - - def elbo(self, x, u_x=None, N=1, logh=None, sample_weight=None, batch_size=None): - if u_x is None: - N, u_x = self.accum_suff_stats( - x, sample_weight=sample_weight, batch_size=batch_size - ) - if logh is None: - logh = self.accum_logh(x, sample_weight=sample_weight) - return logh + np.inner(u_x, self.eta) - N * self.A - - def eval_llk(self, x, u_x=None, mode="nat"): - if mode == "nat": - return self.eval_llk_nat(x, u_x) - else: - return self.eval_llk_std(x) - - def eval_llk_nat(self, x, u_x=None): - if u_x is None: - u_x = self.compute_suff_stats(x) - return self.logh(x) + np.inner(u_x, self.eta) - self.A - - @staticmethod - def compute_A_nat(eta): - raise NotImplementedError() - - @staticmethod - def compute_A_std(params): - raise NotImplementedError() - - @staticmethod - def compute_eta(param): - raise NotImplementedError() - - @staticmethod - def compute_std(eta): - raise NotImplementedError() - - @abstractmethod - def _compute_nat_params(self): - pass - - @abstractmethod - def _compute_std_params(self): - pass diff --git a/hyperion/vb_pdfs/core/pdf.py b/hyperion/vb_pdfs/core/pdf.py deleted file mode 100644 index 012ff96c..00000000 --- a/hyperion/vb_pdfs/core/pdf.py +++ /dev/null @@ -1,32 +0,0 @@ -""" - Copyright 2017 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np - -from abc import ABCMeta, abstractmethod -from ...hyp_model import HypModel - - -class PDF(HypModel): - __metaclass__ = ABCMeta - - def __init__(self, **kwargs): - super(PDF, self).__init__(**kwargs) - - # def get_config(self): - # config = {'x_dim': self.x_dim } - # base_config = super(PDF, self).get_config() - # return dict(list(base_config.items()) + list(config.items())) - - @abstractmethod - def log_prob(self, x): - pass - - def log_cdf(self, x): - raise NotImplementedError - - @abstractmethod - def sample(self, num_samples): - pass diff --git a/notebooks/tutorial_jsalt22/ivectors.ipynb b/notebooks/tutorial_jsalt22/ivectors.ipynb new file mode 100644 index 00000000..7f2780d9 --- /dev/null +++ b/notebooks/tutorial_jsalt22/ivectors.ipynb @@ -0,0 +1,251 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# i-Vectors Tutorial" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hpath='/exp/jvillalba/hyperion/hyperion-persephone'\n", + "import sys\n", + "sys.path.append(hpath)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import hyperion as hyp\n", + "import hyperion.np as hnp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_data(num_dims, num_spks=10, num_utts=10, num_units=10, unit_length=10, tv_dim=2):\n", + " \"\"\" Generate data following the i-vector model\n", + "\n", + " Args:\n", + " num_dims: number of dimensions of the features.\n", + " num_spks: number of speakers.\n", + " num_utts: number of utterances per speaker.\n", + " num_units: number of phonetic units per utterance.\n", + " unit_length: duration of each phonetic unit.\n", + " \"\"\"\n", + " rng = np.random.RandomState(seed=1234)\n", + " # we 
set the number of phonetic classes to 2^num_dim\n", + " num_comp = 2**num_dims\n", + " \n", + " # Define UBM\n", + " # Means of the GMM-UBM\n", + " ubm_means = np.zeros((num_comp, num_dims))\n", + " kernel=np.array([1.,-1.])[:,None]\n", + " ubm_means = kernel\n", + " for i in range(1,num_dims):\n", + " ubm_means = np.concatenate((np.repeat(kernel, int(2**i), axis=0), np.tile(ubm_means,(2,1))), axis=1)\n", + " \n", + " # Covariances of the GMM-UBM\n", + " ubm_cov = 0.1 * np.ones((num_comp, num_dims))\n", + " ubm_prec = 1./ubm_cov\n", + "\n", + " # Weights of the GMM-UBM\n", + " ubm_weights = np.ones((num_comp))/num_comp\n", + " \n", + "\n", + " # Define between and within speaker covariances\n", + " sb = 0.7\n", + " sw = 0.3\n", + "\n", + " # Define Total Variability sub-space\n", + " T = rng.randn(tv_dim, num_dims * num_comp)\n", + " T = 0.2 * T/np.max(T)\n", + " \n", + " # Sample speakers\n", + " spk_ids = np.arange(num_spks)\n", + " y = np.sqrt(sb) * rng.randn(num_spks, tv_dim)\n", + "\n", + " # Sample i-vectors\n", + " spk_ids = np.repeat(spk_ids, num_utts, axis=0)\n", + " y = np.repeat(y, num_utts, axis=0)\n", + " w = y + np.sqrt(sw) * rng.randn(num_spks*num_utts, tv_dim)\n", + "\n", + " x = []\n", + " r_idx = []\n", + " # Sample features\n", + " for i in range(w.shape[0]):\n", + " # For each utterance\n", + " # Compute the GMM mean of the utterance\n", + " means_i = ubm_means + np.dot(w[i],T).reshape(num_dims,num_comp).T\n", + "\n", + " # Create a GMM for the utterance.\n", + " gmm = hnp.pdfs.GMMDiagCov(pi=ubm_weights, mu=means_i, Lambda=ubm_prec)\n", + "\n", + " # Sample the Gaussian components\n", + " r_i = rng.multinomial(1, ubm_weights, size=(num_units,))\n", + " # Assume that we stay in the same component several time steps.\n", + " r_i = np.repeat(r_i, unit_length, axis=0)\n", + " # Draw samples from the GMM\n", + " x_i = gmm.sample(r=r_i)\n", + " x.append(x_i)\n", + " r_idx.append(r_i.argmax(axis=-1))\n", + "\n", + " return x, r_idx, spk_ids\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_dim=3\n", + "x, r_idx, spk_ids=generate_data(num_dims=x_dim)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_cat=np.concatenate(x, axis=0)\n", + "fig = plt.figure()\n", + "ax = fig.add_subplot(projection='3d')\n", + "ax.scatter(x_cat[:,0], x_cat[:,1], x_cat[:,2], marker='o')\n", + "ax.set_xlabel('x1')\n", + "ax.set_ylabel('x2')\n", + "ax.set_zlabel('x3')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_comp=8\n", + "y_dim=2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ubm_gmm = hnp.pdfs.GMMDiagCov(num_comp=num_comp, x_dim=x_dim)\n", + "elbo, elbo_norm = ubm_gmm.fit(x_cat, epochs=10)\n", + "fig = plt.figure()\n", + "plt.plot(elbo_norm)\n", + "plt.xlabel('iters')\n", + "plt.ylabel('log(p(x))')\n", + "plt.grid(True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ubm_gmm.mu\n", + "ubm_gmm.pi\n", + "ubm_gmm.Sigma\n", + "fig=plt.figure()\n", + "ax=fig.add_subplot(111, projection=\"3d\")\n", + "ubm_gmm.plot3D_ellipsoid(num_sigmas=1, ax=ax)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "iv_model = hnp.pdfs.JFATotal(K=num_comp, x_dim=x_dim, y_dim=y_dim)\n", + 
"N=[]\n", + "F=[]\n", + "for x_i in x:\n", + " N_i, u_x_i = ubm_gmm.accum_suff_stats(x_i)\n", + " N_i, F_i = ubm_gmm.norm_suff_stats(N_i, u_x_i)\n", + " N.append(N_i.reshape(1,-1))\n", + " F.append(F_i.reshape(1,-1))\n", + "\n", + "N = np.concatenate(N, axis=0)\n", + "F = np.concatenate(F, axis=0)\n", + "\n", + "elbo, elbo_norm = iv_model.fit(N, F)\n", + "fig = plt.figure()\n", + "plt.plot(elbo_norm)\n", + "plt.xlabel('iters')\n", + "plt.ylabel('log(p(x))')\n", + "plt.grid(True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_utts=100\n", + "w = np.randn(num_utts, 2)\n", + "M = ubm_gmm.mu.ravel() + ubm_gmm.cholLambda.ravel() * np.dot(w, iv_model.T)\n", + "M = M.reshape(num_utts, num_comp, x_dim)\n", + "fig = plt.figure()\n", + "ax = fig.add_subplot(projection='3d')\n", + "colors = ['b', 'g','r','c','m','y','k','b']\n", + "for i in range(num_comp):\n", + " ax.plot_surface(M[:,i,0], M[:,i,1], M[:,i,2], alpha=0.2, color=colors[i])\n", + " ax.scatter(M[:,i,0], M[:,i,1], M[:,i,2], marker='o', color=colors[i])\n", + "ax.set_xlabel('x1')\n", + "ax.set_ylabel('x2')\n", + "ax.set_zlabel('x3')\n", + "plt.show()\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "488a239b304e646027d6710c3377746db4487e56624448f35f81edd765904a6d" + }, + "kernelspec": { + "display_name": "Python 3.8.12 ('py38_pt101_cu112')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/tutorial_jsalt22/xvector.ipynb b/notebooks/tutorial_jsalt22/xvector.ipynb new file mode 100644 index 00000000..70f01057 --- /dev/null +++ b/notebooks/tutorial_jsalt22/xvector.ipynb @@ -0,0 +1,3760 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "hpath='/exp/jvillalba/hyperion/hyperion-persephone'\n", + "import sys\n", + "sys.path.append(hpath)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "from copy import deepcopy\n", + "import logging\n", + "import numpy as np\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "from torch.utils.data import Dataset, DataLoader\n", + "\n", + "import hyperion as hyp\n", + "import hyperion.np as hnp\n", + "import hyperion.torch as ht\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "class IVDataset(Dataset):\n", + " \"\"\"Datasets that generates utterances following the i-vector model.\n", + " \n", + " Attributes:\n", + " num_samples: num. of utterances in the dataset.\n", + " num_spks: num. 
of speakers in the dataset.\n", + " x_dim: feature dimension.\n", + " num_gauss: number of Gaussian components in UBM-GMM\n", + " w_dim: i-vector dimension.\n", + " sb: isotropic between-class cov.\n", + " sw: isotropic within-class cov.\n", + " \n", + " \"\"\"\n", + " \n", + " def __init__(self, num_samples=10000, num_spks=100, x_dim=16, num_gauss=32, w_dim=16, sb=0.7, sw=0.3, utt_length=200, unit_length=25, seed=1234):\n", + " self.rng = np.random.RandomState(seed=seed)\n", + " self.num_samples = num_samples\n", + " self.num_spks = num_spks\n", + " self.x_dim = x_dim\n", + " self.w_dim = w_dim\n", + " self.num_gauss = num_gauss\n", + " self.utt_length = utt_length\n", + " self.unit_length = unit_length\n", + " self.sb = sb\n", + " self.sw = sw\n", + " self.y = self._make_spks(num_spks, w_dim, sb, self.rng)\n", + " self.gmm_ubm = self._make_ubm(x_dim, num_gauss, self.rng)\n", + " self.T = self._make_ivector(x_dim, num_gauss, w_dim, self.rng)\n", + "\n", + " def __len__(self):\n", + " return self.num_samples\n", + "\n", + " def __getitem__(self, idx):\n", + " spk_idx = idx % self.y.shape[0]\n", + " x = self.sample_utterance(spk_idx)\n", + " x = x.astype('float32')\n", + " return x, spk_idx\n", + "\n", + "\n", + " @staticmethod\n", + " def _make_spks(num_spks, w_dim, sb, rng):\n", + " \"\"\"Creates the speaker identity vectors\"\"\"\n", + " return np.sqrt(sb) * rng.randn(num_spks, w_dim)\n", + "\n", + " @staticmethod\n", + " def _make_ubm(x_dim, num_gauss, rng):\n", + " \"\"\"Creates the UBM GMM\"\"\"\n", + " # Define UBM\n", + " # Means of the GMM-UBM\n", + " ubm_means = rng.randn(num_gauss, x_dim)\n", + " ubm_means = np.sqrt(x_dim) * ubm_means/np.linalg.norm(ubm_means, axis=-1, keepdims=True)\n", + "\n", + " # Covariances of the GMM-UBM\n", + " ubm_cov = 0.1 * np.ones((num_gauss, x_dim))\n", + " ubm_prec = 1./ubm_cov\n", + "\n", + " # Weights of the GMM-UBM\n", + " ubm_weights = np.ones((num_gauss))/num_gauss\n", + "\n", + " return hnp.pdfs.GMMDiagCov(pi=ubm_weights, mu=ubm_means, Lambda=ubm_prec)\n", + "\n", + " @staticmethod\n", + " def _make_ivector(x_dim, num_gauss, w_dim, rng):\n", + " # Define Total Variability sub-space\n", + " T = rng.randn(w_dim, x_dim * num_gauss)\n", + " T = 0.2 * T/np.max(T)\n", + " return T\n", + "\n", + " def sample_utterance(self, spk_idx):\n", + " # generate i-vector\n", + " y = self.y[spk_idx] # spk factor\n", + " w = y + np.sqrt(self.sw) * self.rng.randn(self.w_dim)\n", + "\n", + " # For each utterance\n", + " # Compute the GMM mean of the utterance\n", + " means_i = self.gmm_ubm.mu + np.dot(w, self.T).reshape(self.x_dim, self.num_gauss).T\n", + "\n", + " # Create a GMM for the utterance.\n", + " gmm = self.gmm_ubm.copy()\n", + " gmm.mu = means_i\n", + " \n", + " # Sample the Gaussian components\n", + " num_units = self.utt_length // self.unit_length\n", + " r = self.rng.multinomial(1, gmm.pi, size=(num_units,))\n", + " # Assume that we stay in the same component several time steps.\n", + " r = np.repeat(r, self.unit_length, axis=0)\n", + " # Draw samples from the GMM\n", + " x = gmm.sample(r=r)\n", + " return x\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the Training dataset\n", + "train_data = IVDataset()\n", + "# The Validation dataset is a copy of the training data but with less samples\n", + "val_data = deepcopy(train_data)\n", + "val_data.num_samples = 100\n", + "\n", + "# Create data loaders\n", + "train_loader = DataLoader(train_data, batch_size=32, 
shuffle=True)\n", + "val_loader = DataLoader(val_data, batch_size=32, shuffle=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "class StemBlock(nn.Module):\n", + " \"\"\"Build block input layer of x-vector model\n", + "\n", + " Args:\n", + " in_channels: input channels.\n", + " out_channels: output channels.\n", + " kernel_size: kernels size for the convolution.\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " in_channels,\n", + " out_channels,\n", + " kernel_size,\n", + " ):\n", + "\n", + " super().__init__()\n", + "\n", + " self.activation = nn.ReLU()\n", + " padding = int((kernel_size - 1) // 2)\n", + " self.bn = nn.BatchNorm1d(out_channels)\n", + " self.conv = nn.Conv1d(\n", + " in_channels,\n", + " out_channels,\n", + " bias=False,\n", + " kernel_size=kernel_size,\n", + " padding=padding,\n", + " )\n", + "\n", + " def forward(self, x, x_mask=None):\n", + " \"\"\"Forward function.\n", + "\n", + " Args:\n", + " x: input tensor with shape = (batch, in_channels, in_time).\n", + " x_mask: unused.\n", + "\n", + " Returns:\n", + " Tensor with shape = (batch, out_channels, out_time).\n", + " \"\"\"\n", + "\n", + " x = self.conv(x)\n", + " x = self.bn(x)\n", + " x = self.activation(x)\n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "class TDNNBlock(nn.Module):\n", + " \"\"\"TDNN Layer block.\n", + "\n", + " Attributes:\n", + " in_channels: input channels.\n", + " out_channels: output channels.\n", + " kernel_size: kernel size.\n", + " dilation: dilation factor of the conv. kernels.\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " in_channels,\n", + " out_channels,\n", + " kernel_size=3,\n", + " dilation=1,\n", + " ):\n", + "\n", + " super().__init__()\n", + " self.in_channels = in_channels\n", + " self.out_channels = out_channels\n", + " self.activation = nn.ReLU()\n", + " self.bn = nn.BatchNorm1d(out_channels)\n", + "\n", + " padding = int(dilation * (kernel_size - 1) // 2)\n", + " self.conv = nn.Conv1d(\n", + " in_channels,\n", + " out_channels,\n", + " bias=False,\n", + " kernel_size=kernel_size,\n", + " dilation=dilation,\n", + " padding=padding,\n", + " )\n", + "\n", + "\n", + " def forward(self, x):\n", + " \"\"\"Forward function.\n", + "\n", + " Args:\n", + " x: input tensor with shape = (batch, in_channels, time).\n", + " x_mask: unused.\n", + "\n", + " Returns:\n", + " Tensor with shape = (batch, out_channels, time).\n", + " \"\"\"\n", + " x = self.conv(x)\n", + " x = self.bn(x)\n", + " x = self.activation(x)\n", + " return x\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "class StatsPooling(nn.Module):\n", + " \"\"\"mean + stddev pooling layer.\"\"\"\n", + "\n", + " def forward(self, x):\n", + " x_mean = torch.mean(x, dim=-1)\n", + " x2_mean = torch.mean(x**2, dim=-1)\n", + " x_std = torch.sqrt((x2_mean-x_mean**2).clamp(min=1e-5))\n", + " return torch.cat((x_mean, x_std), dim=-1)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "class TDNNXVec(ht.TorchModel):\n", + "\n", + " def __init__(self, feat_dim, num_layers, layer_dim, embed_dim, num_classes):\n", + " super().__init__()\n", + " self.in_block = StemBlock(feat_dim, layer_dim, kernel_size=5)\n", + " tdnn_layers = []\n", + " for i in range(num_layers):\n", + " layer_i = TDNNBlock(layer_dim, layer_dim, 
kernel_size=3, dilation=i+2)\n", + " tdnn_layers.append(layer_i)\n", + "\n", + " self.tdnn_layers = nn.ModuleList(tdnn_layers)\n", + " self.pooling = StatsPooling()\n", + " self.projection = nn.Linear(2*layer_dim, embed_dim)\n", + " self.output = nn.Linear(embed_dim, num_classes)\n", + "\n", + " def update_loss_margin(self, epoch):\n", + " pass\n", + "\n", + " def forward(self, x, y=None, infer=False):\n", + " x = x.transpose(1,2)\n", + " x = self.in_block(x)\n", + " for i, layer in enumerate(self.tdnn_layers):\n", + " x = layer(x)\n", + "\n", + " x = self.pooling(x)\n", + " z = self.projection(x)\n", + " if infer:\n", + " return z\n", + "\n", + " return self.output(z)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "model = TDNNXVec(16, 2, 32, 16, 100)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:optimizer args={'opt_type': 'sgd', 'lr': 0.01, 'momentum': 0.9, 'oss': False}\n", + "INFO:root:lr scheduler args={'lrsch_type': 'exp_lr', 'decay_rate': 0.5, 'decay_steps': 4000, 'hold_steps': 2000, 'warmup_steps': 1000, 'update_lr_on_opt_step': True}\n" + ] + } + ], + "source": [ + "from hyperion.torch.trainers import XVectorTrainer\n", + "from hyperion.torch.metrics import CategoricalAccuracy\n", + "\n", + "optim = {\"opt_type\": \"sgd\", \"lr\": 0.01, \"momentum\": 0.9}\n", + "lrsched = {\"lrsch_type\": \"exp_lr\", \"decay_rate\": 0.5, \"decay_steps\": 4000, \"hold_steps\": 2000, \"warmup_steps\": 1000, \"update_lr_on_opt_step\": True}\n", + "metrics = {\"acc\": CategoricalAccuracy()}\n", + "trainer = XVectorTrainer(model, optim=optim, lrsched=lrsched, exp_path='./tdnn_xvec', device=torch.device('cpu'), train_mode=\"full\", metrics=metrics )" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:epoch: 1/100 starts\n", + "INFO:root:epoch: 1/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 4.601476 acc: 0.018750 lr: 0.000090\n", + "INFO:root:epoch: 1/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 4.598933 acc: 0.018750 lr: 0.000190\n", + "INFO:root:epoch: 1/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 4.598896 acc: 0.018750 lr: 0.000290\n", + "INFO:root:epoch: 1/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 4.598870 acc: 0.016406 lr: 0.000390\n", + "INFO:root:epoch: 1/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 4.598446 acc: 0.016250 lr: 0.000490\n", + "INFO:root:epoch: 1/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 4.596868 acc: 0.015625 lr: 0.000590\n", + "INFO:root:epoch: 1/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 4.596662 acc: 0.016071 lr: 0.000690\n", + "INFO:root:epoch: 1/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 4.596169 acc: 0.016797 lr: 0.000790\n", + "INFO:root:epoch: 1/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 4.596366 acc: 0.017361 lr: 0.000890\n", + "INFO:root:epoch: 1/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 4.598048 acc: 0.016562 lr: 0.000990\n", + "INFO:root:epoch: 1/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 4.597743 acc: 0.017330 lr: 0.001090\n", + "INFO:root:epoch: 1/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 4.597656 acc: 0.017969 lr: 0.001190\n", + "INFO:root:epoch: 1/100 et: 
5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 4.597299 acc: 0.018510 lr: 0.001290\n", + "INFO:root:epoch: 1/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 4.596856 acc: 0.018750 lr: 0.001390\n", + "INFO:root:epoch: 1/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 4.596479 acc: 0.018542 lr: 0.001490\n", + "INFO:root:epoch: 1/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 4.596669 acc: 0.018359 lr: 0.001590\n", + "INFO:root:epoch: 1/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 4.596362 acc: 0.019301 lr: 0.001690\n", + "INFO:root:epoch: 1/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 4.596951 acc: 0.019271 lr: 0.001790\n", + "INFO:root:epoch: 1/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 4.597304 acc: 0.019572 lr: 0.001890\n", + "INFO:root:epoch: 1/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 4.597318 acc: 0.019063 lr: 0.001990\n", + "INFO:root:epoch: 1/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 4.596914 acc: 0.019494 lr: 0.002090\n", + "INFO:root:epoch: 1/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 4.597153 acc: 0.019176 lr: 0.002190\n", + "INFO:root:epoch: 1/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 4.597729 acc: 0.018750 lr: 0.002290\n", + "INFO:root:epoch: 1/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 4.597708 acc: 0.018750 lr: 0.002390\n", + "INFO:root:epoch: 1/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 4.597762 acc: 0.019375 lr: 0.002490\n", + "INFO:root:epoch: 1/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 4.597500 acc: 0.019351 lr: 0.002590\n", + "INFO:root:epoch: 1/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 4.597324 acc: 0.018981 lr: 0.002690\n", + "INFO:root:epoch: 1/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 4.597246 acc: 0.018527 lr: 0.002790\n", + "INFO:root:epoch: 1/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 4.597722 acc: 0.018211 lr: 0.002890\n", + "INFO:root:epoch: 1/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 4.597453 acc: 0.018437 lr: 0.002990\n", + "INFO:root:epoch: 1/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 4.597106 acc: 0.018548 lr: 0.003090\n", + "INFO:root:epoch: 2/100 starts\n", + "INFO:root:epoch: 2/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 4.587101 acc: 0.031250 lr: 0.003220\n", + "INFO:root:epoch: 2/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 4.588415 acc: 0.026562 lr: 0.003320\n", + "INFO:root:epoch: 2/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 4.582873 acc: 0.021875 lr: 0.003420\n", + "INFO:root:epoch: 2/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 4.584817 acc: 0.021094 lr: 0.003520\n", + "INFO:root:epoch: 2/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 4.586176 acc: 0.021875 lr: 0.003620\n", + "INFO:root:epoch: 2/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 4.587892 acc: 0.021354 lr: 0.003720\n", + "INFO:root:epoch: 2/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 4.589204 acc: 0.021875 lr: 0.003820\n", + "INFO:root:epoch: 2/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 4.587747 acc: 0.021094 lr: 0.003920\n", + "INFO:root:epoch: 2/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 4.587081 acc: 0.020139 lr: 0.004020\n", + "INFO:root:epoch: 2/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 4.587247 acc: 
0.020000 lr: 0.004120\n", + "INFO:root:epoch: 2/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 4.586554 acc: 0.021307 lr: 0.004220\n", + "INFO:root:epoch: 2/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 4.587415 acc: 0.021094 lr: 0.004320\n", + "INFO:root:epoch: 2/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 4.587646 acc: 0.021635 lr: 0.004420\n", + "INFO:root:epoch: 2/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 4.586850 acc: 0.022098 lr: 0.004520\n", + "INFO:root:epoch: 2/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 4.586491 acc: 0.021667 lr: 0.004620\n", + "INFO:root:epoch: 2/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 4.586102 acc: 0.021680 lr: 0.004720\n", + "INFO:root:epoch: 2/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 4.585205 acc: 0.021691 lr: 0.004820\n", + "INFO:root:epoch: 2/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 4.584884 acc: 0.022743 lr: 0.004920\n", + "INFO:root:epoch: 2/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 4.584379 acc: 0.022697 lr: 0.005020\n", + "INFO:root:epoch: 2/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 4.584310 acc: 0.022656 lr: 0.005120\n", + "INFO:root:epoch: 2/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 4.583735 acc: 0.023512 lr: 0.005220\n", + "INFO:root:epoch: 2/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 4.583194 acc: 0.023722 lr: 0.005320\n", + "INFO:root:epoch: 2/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 4.582450 acc: 0.023913 lr: 0.005420\n", + "INFO:root:epoch: 2/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 4.581911 acc: 0.024089 lr: 0.005520\n", + "INFO:root:epoch: 2/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 4.581052 acc: 0.024500 lr: 0.005620\n", + "INFO:root:epoch: 2/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 4.579837 acc: 0.024880 lr: 0.005720\n", + "INFO:root:epoch: 2/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 4.579425 acc: 0.024769 lr: 0.005820\n", + "INFO:root:epoch: 2/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 4.578843 acc: 0.024888 lr: 0.005920\n", + "INFO:root:epoch: 2/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 4.577931 acc: 0.025216 lr: 0.006020\n", + "INFO:root:epoch: 2/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 4.577415 acc: 0.025000 lr: 0.006120\n", + "INFO:root:epoch: 2/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 4.576722 acc: 0.024798 lr: 0.006220\n", + "INFO:root:epoch: 3/100 starts\n", + "INFO:root:epoch: 3/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 4.554732 acc: 0.018750 lr: 0.006350\n", + "INFO:root:epoch: 3/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 4.546799 acc: 0.025000 lr: 0.006450\n", + "INFO:root:epoch: 3/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 4.543545 acc: 0.021875 lr: 0.006550\n", + "INFO:root:epoch: 3/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 4.539997 acc: 0.023437 lr: 0.006650\n", + "INFO:root:epoch: 3/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 4.534892 acc: 0.027500 lr: 0.006750\n", + "INFO:root:epoch: 3/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 4.533372 acc: 0.026042 lr: 0.006850\n", + "INFO:root:epoch: 3/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 4.535993 acc: 0.027232 lr: 0.006950\n", + "INFO:root:epoch: 3/100 et: 3s eta: 10s 
batches: 80/313(25%) samples: 2560 loss: 4.532067 acc: 0.027734 lr: 0.007050\n", + "INFO:root:epoch: 3/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 4.529817 acc: 0.027083 lr: 0.007150\n", + "INFO:root:epoch: 3/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 4.528748 acc: 0.026875 lr: 0.007250\n", + "INFO:root:epoch: 3/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 4.524458 acc: 0.027273 lr: 0.007350\n", + "INFO:root:epoch: 3/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 4.522450 acc: 0.027083 lr: 0.007450\n", + "INFO:root:epoch: 3/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 4.519957 acc: 0.027644 lr: 0.007550\n", + "INFO:root:epoch: 3/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 4.516462 acc: 0.029018 lr: 0.007650\n", + "INFO:root:epoch: 3/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 4.512551 acc: 0.029375 lr: 0.007750\n", + "INFO:root:epoch: 3/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 4.511358 acc: 0.029102 lr: 0.007850\n", + "INFO:root:epoch: 3/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 4.505479 acc: 0.030515 lr: 0.007950\n", + "INFO:root:epoch: 3/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 4.500308 acc: 0.030382 lr: 0.008050\n", + "INFO:root:epoch: 3/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 4.496550 acc: 0.030099 lr: 0.008150\n", + "INFO:root:epoch: 3/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 4.491571 acc: 0.030781 lr: 0.008250\n", + "INFO:root:epoch: 3/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 4.486266 acc: 0.031250 lr: 0.008350\n", + "INFO:root:epoch: 3/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 4.480225 acc: 0.031108 lr: 0.008450\n", + "INFO:root:epoch: 3/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 4.474944 acc: 0.030299 lr: 0.008550\n", + "INFO:root:epoch: 3/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 4.469195 acc: 0.030599 lr: 0.008650\n", + "INFO:root:epoch: 3/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 4.462828 acc: 0.031250 lr: 0.008750\n", + "INFO:root:epoch: 3/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 4.458274 acc: 0.031611 lr: 0.008850\n", + "INFO:root:epoch: 3/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 4.451506 acc: 0.032639 lr: 0.008950\n", + "INFO:root:epoch: 3/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 4.447941 acc: 0.032701 lr: 0.009050\n", + "INFO:root:epoch: 3/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 4.442886 acc: 0.033190 lr: 0.009150\n", + "INFO:root:epoch: 3/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 4.436341 acc: 0.033021 lr: 0.009250\n", + "INFO:root:epoch: 3/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 4.431607 acc: 0.033165 lr: 0.009350\n", + "INFO:root:epoch: 4/100 starts\n", + "INFO:root:epoch: 4/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 4.216670 acc: 0.040625 lr: 0.009480\n", + "INFO:root:epoch: 4/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 4.220620 acc: 0.039062 lr: 0.009580\n", + "INFO:root:epoch: 4/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 4.215897 acc: 0.040625 lr: 0.009680\n", + "INFO:root:epoch: 4/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 4.204186 acc: 0.042188 lr: 0.009780\n", + "INFO:root:epoch: 4/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 4.204257 acc: 0.041875 lr: 
0.009880\n", + "INFO:root:epoch: 4/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 4.193417 acc: 0.048438 lr: 0.009980\n", + "INFO:root:epoch: 4/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 4.188381 acc: 0.048214 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 4.175587 acc: 0.049219 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 4.172535 acc: 0.048958 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 4.158615 acc: 0.051562 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 4.143053 acc: 0.053409 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 4.133410 acc: 0.054427 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 4.131123 acc: 0.053846 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 4.130420 acc: 0.055580 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 4.121361 acc: 0.054792 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 4.107443 acc: 0.057617 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 4.098401 acc: 0.060294 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 4.090190 acc: 0.060937 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 4.076577 acc: 0.061349 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 4.067421 acc: 0.061562 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 4.060067 acc: 0.060863 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 4.049889 acc: 0.062074 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 4.040142 acc: 0.062636 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 4.031185 acc: 0.063932 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 4.015913 acc: 0.065375 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 4.004460 acc: 0.066947 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 3.989642 acc: 0.069329 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 3.973949 acc: 0.072210 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 3.960340 acc: 0.074461 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 3.948678 acc: 0.075312 lr: 0.010000\n", + "INFO:root:epoch: 4/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 3.937246 acc: 0.076512 lr: 0.010000\n", + "INFO:root:epoch: 5/100 starts\n", + "INFO:root:epoch: 5/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 3.602264 acc: 0.146875 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 3.575348 acc: 0.131250 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 1s eta: 12s batches: 
30/313(9%) samples: 960 loss: 3.582449 acc: 0.118750 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 3.577921 acc: 0.121875 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 3.565801 acc: 0.121250 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 3.548683 acc: 0.118750 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 3.508676 acc: 0.120089 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 3.496036 acc: 0.123047 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 3.490975 acc: 0.122917 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 3.477949 acc: 0.124063 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 3.463334 acc: 0.125852 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 3.450261 acc: 0.124740 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 3.438849 acc: 0.128846 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 3.431314 acc: 0.129241 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 3.416327 acc: 0.133333 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 3.406788 acc: 0.136328 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 3.395345 acc: 0.138787 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 3.388426 acc: 0.140278 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 3.381553 acc: 0.142599 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 3.367660 acc: 0.145312 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 3.354665 acc: 0.147917 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 3.348045 acc: 0.148295 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 3.339810 acc: 0.151766 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 3.324989 acc: 0.154167 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 3.310998 acc: 0.156875 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 3.299634 acc: 0.158654 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 3.291406 acc: 0.160532 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 3.283068 acc: 0.161272 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 3.271852 acc: 0.162931 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 3.265716 acc: 0.163437 lr: 0.010000\n", + "INFO:root:epoch: 5/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 3.254701 acc: 0.165625 lr: 0.010000\n", + "INFO:root:epoch: 6/100 
starts\n", + "INFO:root:epoch: 6/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 2.820441 acc: 0.228125 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 2.939542 acc: 0.231250 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 2.921857 acc: 0.229167 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 2.906477 acc: 0.224219 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 2.914442 acc: 0.230000 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 2.915281 acc: 0.225000 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 2.917999 acc: 0.219643 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 2.905005 acc: 0.221875 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 2.914437 acc: 0.220486 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 2.907124 acc: 0.222812 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 2.909275 acc: 0.222727 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 2.903210 acc: 0.222656 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 2.899808 acc: 0.221875 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 2.897036 acc: 0.222768 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 2.892817 acc: 0.225000 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 7s eta: 6s batches: 160/313(51%) samples: 5120 loss: 2.893514 acc: 0.225391 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 2.886759 acc: 0.227390 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 2.884362 acc: 0.228299 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 2.880840 acc: 0.228125 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 2.872577 acc: 0.230938 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 2.867449 acc: 0.233185 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 2.862959 acc: 0.234233 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 2.862449 acc: 0.234511 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 2.855118 acc: 0.236458 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 2.851627 acc: 0.236500 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 2.842585 acc: 0.238341 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 2.837825 acc: 0.239583 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 2.834812 acc: 0.238616 lr: 0.010000\n", + "INFO:root:epoch: 6/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 2.829549 acc: 
0.238901 lr: 0.010000\n", + "INFO:root:[epochs 6-25/100 training log condensed: 313 batches (~10k samples) per epoch, ~14s each; loss 2.82 -> 1.11, acc 0.239 -> 0.664, lr decayed 0.010000 -> 0.003647]\n", + "INFO:root:epoch: 26/100 starts\n", + "INFO:root:epoch: 26/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.123092 acc: 0.660156 lr: 0.003620\n", +
"INFO:root:epoch: 26/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.104351 acc: 0.666250 lr: 0.003614\n", + "INFO:root:epoch: 26/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.095404 acc: 0.663542 lr: 0.003607\n", + "INFO:root:epoch: 26/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.109545 acc: 0.657589 lr: 0.003601\n", + "INFO:root:epoch: 26/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.099196 acc: 0.661719 lr: 0.003595\n", + "INFO:root:epoch: 26/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.082922 acc: 0.669444 lr: 0.003589\n", + "INFO:root:epoch: 26/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.089567 acc: 0.666250 lr: 0.003582\n", + "INFO:root:epoch: 26/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.100713 acc: 0.662500 lr: 0.003576\n", + "INFO:root:epoch: 26/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.100876 acc: 0.664844 lr: 0.003570\n", + "INFO:root:epoch: 26/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.099064 acc: 0.666106 lr: 0.003564\n", + "INFO:root:epoch: 26/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 1.102101 acc: 0.665402 lr: 0.003558\n", + "INFO:root:epoch: 26/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.097139 acc: 0.667083 lr: 0.003551\n", + "INFO:root:epoch: 26/100 et: 7s eta: 6s batches: 160/313(51%) samples: 5120 loss: 1.098766 acc: 0.666406 lr: 0.003545\n", + "INFO:root:epoch: 26/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.102775 acc: 0.664890 lr: 0.003539\n", + "INFO:root:epoch: 26/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.102847 acc: 0.665104 lr: 0.003533\n", + "INFO:root:epoch: 26/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.104444 acc: 0.663158 lr: 0.003527\n", + "INFO:root:epoch: 26/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.108503 acc: 0.662656 lr: 0.003521\n", + "INFO:root:epoch: 26/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.100149 acc: 0.663542 lr: 0.003515\n", + "INFO:root:epoch: 26/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.092307 acc: 0.666619 lr: 0.003509\n", + "INFO:root:epoch: 26/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.098929 acc: 0.663859 lr: 0.003503\n", + "INFO:root:epoch: 26/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.098367 acc: 0.664453 lr: 0.003497\n", + "INFO:root:epoch: 26/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 1.101024 acc: 0.664125 lr: 0.003490\n", + "INFO:root:epoch: 26/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.100228 acc: 0.664303 lr: 0.003484\n", + "INFO:root:epoch: 26/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 1.100727 acc: 0.663426 lr: 0.003478\n", + "INFO:root:epoch: 26/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.097767 acc: 0.663504 lr: 0.003472\n", + "INFO:root:epoch: 26/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.094813 acc: 0.664547 lr: 0.003466\n", + "INFO:root:epoch: 26/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.095614 acc: 0.665312 lr: 0.003460\n", + "INFO:root:epoch: 26/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.097389 acc: 0.665020 lr: 0.003454\n", + "INFO:root:epoch: 27/100 starts\n", + "INFO:root:epoch: 27/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 1.184505 acc: 0.662500 lr: 0.003447\n", + "INFO:root:epoch: 27/100 et: 0s eta: 
13s batches: 20/313(6%) samples: 640 loss: 1.138632 acc: 0.659375 lr: 0.003441\n", + "INFO:root:epoch: 27/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 1.120402 acc: 0.669792 lr: 0.003435\n", + "INFO:root:epoch: 27/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.113013 acc: 0.670312 lr: 0.003429\n", + "INFO:root:epoch: 27/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.081761 acc: 0.670625 lr: 0.003423\n", + "INFO:root:epoch: 27/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.101537 acc: 0.664062 lr: 0.003417\n", + "INFO:root:epoch: 27/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.086233 acc: 0.666518 lr: 0.003411\n", + "INFO:root:epoch: 27/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.086780 acc: 0.663281 lr: 0.003405\n", + "INFO:root:epoch: 27/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.085530 acc: 0.664931 lr: 0.003399\n", + "INFO:root:epoch: 27/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.083584 acc: 0.665000 lr: 0.003393\n", + "INFO:root:epoch: 27/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.077050 acc: 0.667898 lr: 0.003387\n", + "INFO:root:epoch: 27/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.086782 acc: 0.665885 lr: 0.003382\n", + "INFO:root:epoch: 27/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.092208 acc: 0.663462 lr: 0.003376\n", + "INFO:root:epoch: 27/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 1.100480 acc: 0.661830 lr: 0.003370\n", + "INFO:root:epoch: 27/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.097543 acc: 0.662708 lr: 0.003364\n", + "INFO:root:epoch: 27/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.103326 acc: 0.661914 lr: 0.003358\n", + "INFO:root:epoch: 27/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.094556 acc: 0.664154 lr: 0.003352\n", + "INFO:root:epoch: 27/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.097146 acc: 0.662674 lr: 0.003347\n", + "INFO:root:epoch: 27/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.097222 acc: 0.662829 lr: 0.003341\n", + "INFO:root:epoch: 27/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.095615 acc: 0.663281 lr: 0.003335\n", + "INFO:root:epoch: 27/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.095298 acc: 0.662649 lr: 0.003329\n", + "INFO:root:epoch: 27/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.093007 acc: 0.662358 lr: 0.003323\n", + "INFO:root:epoch: 27/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.097329 acc: 0.660462 lr: 0.003318\n", + "INFO:root:epoch: 27/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.101940 acc: 0.659635 lr: 0.003312\n", + "INFO:root:epoch: 27/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 1.101342 acc: 0.658500 lr: 0.003306\n", + "INFO:root:epoch: 27/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.098830 acc: 0.660096 lr: 0.003300\n", + "INFO:root:epoch: 27/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 1.104528 acc: 0.659838 lr: 0.003295\n", + "INFO:root:epoch: 27/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.100905 acc: 0.661272 lr: 0.003289\n", + "INFO:root:epoch: 27/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.102857 acc: 0.660560 lr: 0.003283\n", + "INFO:root:epoch: 27/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.105049 acc: 0.659792 lr: 
0.003278\n", + "INFO:root:epoch: 27/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.103576 acc: 0.660081 lr: 0.003272\n", + "INFO:root:epoch: 28/100 starts\n", + "INFO:root:epoch: 28/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 1.227883 acc: 0.637500 lr: 0.003265\n", + "INFO:root:epoch: 28/100 et: 0s eta: 14s batches: 20/313(6%) samples: 640 loss: 1.145105 acc: 0.670313 lr: 0.003259\n", + "INFO:root:epoch: 28/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 1.129914 acc: 0.670833 lr: 0.003253\n", + "INFO:root:epoch: 28/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.166356 acc: 0.654687 lr: 0.003248\n", + "INFO:root:epoch: 28/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.128786 acc: 0.663750 lr: 0.003242\n", + "INFO:root:epoch: 28/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.114269 acc: 0.670833 lr: 0.003236\n", + "INFO:root:epoch: 28/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.103913 acc: 0.672321 lr: 0.003231\n", + "INFO:root:epoch: 28/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.088877 acc: 0.677344 lr: 0.003225\n", + "INFO:root:epoch: 28/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.089763 acc: 0.677083 lr: 0.003220\n", + "INFO:root:epoch: 28/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.081860 acc: 0.675000 lr: 0.003214\n", + "INFO:root:epoch: 28/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.081396 acc: 0.676705 lr: 0.003209\n", + "INFO:root:epoch: 28/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.083598 acc: 0.672656 lr: 0.003203\n", + "INFO:root:epoch: 28/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.074400 acc: 0.675000 lr: 0.003197\n", + "INFO:root:epoch: 28/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 1.066627 acc: 0.675223 lr: 0.003192\n", + "INFO:root:epoch: 28/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.074734 acc: 0.672083 lr: 0.003186\n", + "INFO:root:epoch: 28/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.073389 acc: 0.674219 lr: 0.003181\n", + "INFO:root:epoch: 28/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.073848 acc: 0.675184 lr: 0.003175\n", + "INFO:root:epoch: 28/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.065460 acc: 0.676562 lr: 0.003170\n", + "INFO:root:epoch: 28/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.056416 acc: 0.679605 lr: 0.003164\n", + "INFO:root:epoch: 28/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.055903 acc: 0.677656 lr: 0.003159\n", + "INFO:root:epoch: 28/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.056110 acc: 0.676637 lr: 0.003153\n", + "INFO:root:epoch: 28/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.054593 acc: 0.677131 lr: 0.003148\n", + "INFO:root:epoch: 28/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.055392 acc: 0.675815 lr: 0.003143\n", + "INFO:root:epoch: 28/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.053244 acc: 0.675911 lr: 0.003137\n", + "INFO:root:epoch: 28/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 1.051967 acc: 0.675625 lr: 0.003132\n", + "INFO:root:epoch: 28/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.052468 acc: 0.675361 lr: 0.003126\n", + "INFO:root:epoch: 28/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 1.052359 acc: 0.676042 lr: 0.003121\n", + "INFO:root:epoch: 28/100 et: 
12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.051918 acc: 0.676451 lr: 0.003115\n", + "INFO:root:epoch: 28/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.052795 acc: 0.675754 lr: 0.003110\n", + "INFO:root:epoch: 28/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.060497 acc: 0.674167 lr: 0.003105\n", + "INFO:root:epoch: 28/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.061997 acc: 0.673286 lr: 0.003099\n", + "INFO:root:epoch: 29/100 starts\n", + "INFO:root:epoch: 29/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.970789 acc: 0.700000 lr: 0.003092\n", + "INFO:root:epoch: 29/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.958573 acc: 0.709375 lr: 0.003087\n", + "INFO:root:epoch: 29/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.977617 acc: 0.702083 lr: 0.003082\n", + "INFO:root:epoch: 29/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.991939 acc: 0.691406 lr: 0.003076\n", + "INFO:root:epoch: 29/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.050411 acc: 0.686250 lr: 0.003071\n", + "INFO:root:epoch: 29/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.055734 acc: 0.684375 lr: 0.003066\n", + "INFO:root:epoch: 29/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.057889 acc: 0.683929 lr: 0.003060\n", + "INFO:root:epoch: 29/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.046052 acc: 0.686719 lr: 0.003055\n", + "INFO:root:epoch: 29/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.056212 acc: 0.683681 lr: 0.003050\n", + "INFO:root:epoch: 29/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.045250 acc: 0.687813 lr: 0.003044\n", + "INFO:root:epoch: 29/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.047811 acc: 0.685795 lr: 0.003039\n", + "INFO:root:epoch: 29/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.047982 acc: 0.685156 lr: 0.003034\n", + "INFO:root:epoch: 29/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.045031 acc: 0.684135 lr: 0.003029\n", + "INFO:root:epoch: 29/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 1.062456 acc: 0.680134 lr: 0.003023\n", + "INFO:root:epoch: 29/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.055318 acc: 0.682292 lr: 0.003018\n", + "INFO:root:epoch: 29/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.055131 acc: 0.683398 lr: 0.003013\n", + "INFO:root:epoch: 29/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.054986 acc: 0.682353 lr: 0.003008\n", + "INFO:root:epoch: 29/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.056818 acc: 0.681771 lr: 0.003003\n", + "INFO:root:epoch: 29/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.054681 acc: 0.681414 lr: 0.002997\n", + "INFO:root:epoch: 29/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.049460 acc: 0.682969 lr: 0.002992\n", + "INFO:root:epoch: 29/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.051103 acc: 0.682887 lr: 0.002987\n", + "INFO:root:epoch: 29/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.052097 acc: 0.682386 lr: 0.002982\n", + "INFO:root:epoch: 29/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.051775 acc: 0.682065 lr: 0.002977\n", + "INFO:root:epoch: 29/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.051963 acc: 0.681641 lr: 0.002971\n", + "INFO:root:epoch: 29/100 et: 11s eta: 2s batches: 250/313(79%) samples: 
8000 loss: 1.051166 acc: 0.682250 lr: 0.002966\n", + "INFO:root:epoch: 29/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.053132 acc: 0.681611 lr: 0.002961\n", + "INFO:root:epoch: 29/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 1.048926 acc: 0.683218 lr: 0.002956\n", + "INFO:root:epoch: 29/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.055068 acc: 0.680246 lr: 0.002951\n", + "INFO:root:epoch: 29/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.052462 acc: 0.681034 lr: 0.002946\n", + "INFO:root:epoch: 29/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.053347 acc: 0.680313 lr: 0.002941\n", + "INFO:root:epoch: 29/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.051773 acc: 0.679335 lr: 0.002936\n", + "INFO:root:epoch: 30/100 starts\n", + "INFO:root:epoch: 30/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 1.058173 acc: 0.687500 lr: 0.002929\n", + "INFO:root:epoch: 30/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 1.028388 acc: 0.709375 lr: 0.002924\n", + "INFO:root:epoch: 30/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 1.005509 acc: 0.704167 lr: 0.002919\n", + "INFO:root:epoch: 30/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.011474 acc: 0.703906 lr: 0.002914\n", + "INFO:root:epoch: 30/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.029843 acc: 0.694375 lr: 0.002909\n", + "INFO:root:epoch: 30/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.032557 acc: 0.692188 lr: 0.002904\n", + "INFO:root:epoch: 30/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.029639 acc: 0.687946 lr: 0.002899\n", + "INFO:root:epoch: 30/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.019300 acc: 0.691797 lr: 0.002894\n", + "INFO:root:epoch: 30/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.022530 acc: 0.690278 lr: 0.002889\n", + "INFO:root:epoch: 30/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.038184 acc: 0.687188 lr: 0.002884\n", + "INFO:root:epoch: 30/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.032987 acc: 0.690341 lr: 0.002879\n", + "INFO:root:epoch: 30/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.038354 acc: 0.688021 lr: 0.002874\n", + "INFO:root:epoch: 30/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.044560 acc: 0.686779 lr: 0.002869\n", + "INFO:root:epoch: 30/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 1.046210 acc: 0.685268 lr: 0.002864\n", + "INFO:root:epoch: 30/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.041039 acc: 0.684792 lr: 0.002859\n", + "INFO:root:epoch: 30/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.040980 acc: 0.683008 lr: 0.002854\n", + "INFO:root:epoch: 30/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.038761 acc: 0.684559 lr: 0.002849\n", + "INFO:root:epoch: 30/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.037539 acc: 0.683333 lr: 0.002844\n", + "INFO:root:epoch: 30/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.029771 acc: 0.684868 lr: 0.002839\n", + "INFO:root:epoch: 30/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.030799 acc: 0.685156 lr: 0.002834\n", + "INFO:root:epoch: 30/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.030693 acc: 0.686012 lr: 0.002829\n", + "INFO:root:epoch: 30/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.032144 acc: 0.685511 lr: 
0.002824\n", + "INFO:root:epoch: 30/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.032008 acc: 0.683967 lr: 0.002819\n", + "INFO:root:epoch: 30/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.028640 acc: 0.684896 lr: 0.002815\n", + "INFO:root:epoch: 30/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 1.025480 acc: 0.686500 lr: 0.002810\n", + "INFO:root:epoch: 30/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.027657 acc: 0.684976 lr: 0.002805\n", + "INFO:root:epoch: 30/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 1.031818 acc: 0.683565 lr: 0.002800\n", + "INFO:root:epoch: 30/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.026295 acc: 0.684933 lr: 0.002795\n", + "INFO:root:epoch: 30/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.026661 acc: 0.685453 lr: 0.002790\n", + "INFO:root:epoch: 30/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.025083 acc: 0.685625 lr: 0.002785\n", + "INFO:root:epoch: 30/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.023058 acc: 0.686794 lr: 0.002781\n", + "INFO:root:epoch: 31/100 starts\n", + "INFO:root:epoch: 31/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.878643 acc: 0.725000 lr: 0.002774\n", + "INFO:root:epoch: 31/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.947811 acc: 0.707812 lr: 0.002770\n", + "INFO:root:epoch: 31/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.990170 acc: 0.684375 lr: 0.002765\n", + "INFO:root:epoch: 31/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.010024 acc: 0.678906 lr: 0.002760\n", + "INFO:root:epoch: 31/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 0.987055 acc: 0.692500 lr: 0.002755\n", + "INFO:root:epoch: 31/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.997396 acc: 0.686979 lr: 0.002750\n", + "INFO:root:epoch: 31/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.007594 acc: 0.682143 lr: 0.002746\n", + "INFO:root:epoch: 31/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.005843 acc: 0.683594 lr: 0.002741\n", + "INFO:root:epoch: 31/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.002664 acc: 0.683681 lr: 0.002736\n", + "INFO:root:epoch: 31/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.002633 acc: 0.682812 lr: 0.002731\n", + "INFO:root:epoch: 31/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.006251 acc: 0.684091 lr: 0.002727\n", + "INFO:root:epoch: 31/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.006203 acc: 0.684635 lr: 0.002722\n", + "INFO:root:epoch: 31/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.008185 acc: 0.685096 lr: 0.002717\n", + "INFO:root:epoch: 31/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 1.006810 acc: 0.686607 lr: 0.002713\n", + "INFO:root:epoch: 31/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.003456 acc: 0.688333 lr: 0.002708\n", + "INFO:root:epoch: 31/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.008638 acc: 0.688477 lr: 0.002703\n", + "INFO:root:epoch: 31/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.011245 acc: 0.688787 lr: 0.002699\n", + "INFO:root:epoch: 31/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.007533 acc: 0.688889 lr: 0.002694\n", + "INFO:root:epoch: 31/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.008936 acc: 0.688322 lr: 0.002689\n", + "INFO:root:epoch: 31/100 
et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.008792 acc: 0.687969 lr: 0.002685\n", + "INFO:root:epoch: 31/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.008772 acc: 0.688542 lr: 0.002680\n", + "INFO:root:epoch: 31/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.004743 acc: 0.689773 lr: 0.002675\n", + "INFO:root:epoch: 31/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.006531 acc: 0.689946 lr: 0.002671\n", + "INFO:root:epoch: 31/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.007611 acc: 0.690495 lr: 0.002666\n", + "INFO:root:epoch: 31/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 1.009190 acc: 0.689000 lr: 0.002661\n", + "INFO:root:epoch: 31/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.009587 acc: 0.688101 lr: 0.002657\n", + "INFO:root:epoch: 31/100 et: 12s eta: 2s batches: 270/313(86%) samples: 8640 loss: 1.005554 acc: 0.689236 lr: 0.002652\n", + "INFO:root:epoch: 31/100 et: 13s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.007945 acc: 0.687612 lr: 0.002648\n", + "INFO:root:epoch: 31/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.008952 acc: 0.688470 lr: 0.002643\n", + "INFO:root:epoch: 31/100 et: 14s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.002400 acc: 0.690521 lr: 0.002638\n", + "INFO:root:epoch: 31/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.000607 acc: 0.691331 lr: 0.002634\n", + "INFO:root:epoch: 32/100 starts\n", + "INFO:root:epoch: 32/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 1.120256 acc: 0.646875 lr: 0.002628\n", + "INFO:root:epoch: 32/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 1.032334 acc: 0.671875 lr: 0.002623\n", + "INFO:root:epoch: 32/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 1.055711 acc: 0.673958 lr: 0.002619\n", + "INFO:root:epoch: 32/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.031622 acc: 0.691406 lr: 0.002614\n", + "INFO:root:epoch: 32/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.043867 acc: 0.685000 lr: 0.002610\n", + "INFO:root:epoch: 32/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.012254 acc: 0.692188 lr: 0.002605\n", + "INFO:root:epoch: 32/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.005487 acc: 0.685714 lr: 0.002601\n", + "INFO:root:epoch: 32/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.022546 acc: 0.682422 lr: 0.002596\n", + "INFO:root:epoch: 32/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.046185 acc: 0.678125 lr: 0.002592\n", + "INFO:root:epoch: 32/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.050502 acc: 0.675938 lr: 0.002587\n", + "INFO:root:epoch: 32/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.039648 acc: 0.680114 lr: 0.002583\n", + "INFO:root:epoch: 32/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.029496 acc: 0.682552 lr: 0.002578\n", + "INFO:root:epoch: 32/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.033686 acc: 0.681971 lr: 0.002574\n", + "INFO:root:epoch: 32/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 1.024202 acc: 0.685714 lr: 0.002569\n", + "INFO:root:epoch: 32/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.021629 acc: 0.686875 lr: 0.002565\n", + "INFO:root:epoch: 32/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.012940 acc: 0.689844 lr: 0.002561\n", + "INFO:root:epoch: 32/100 et: 7s eta: 6s batches: 170/313(54%) 
samples: 5440 loss: 1.002413 acc: 0.693015 lr: 0.002556\n", + "INFO:root:epoch: 32/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.996699 acc: 0.693750 lr: 0.002552\n", + "INFO:root:epoch: 32/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.992887 acc: 0.694572 lr: 0.002547\n", + "INFO:root:epoch: 32/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.989263 acc: 0.695781 lr: 0.002543\n", + "INFO:root:epoch: 32/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.991710 acc: 0.694792 lr: 0.002538\n", + "INFO:root:epoch: 32/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.992421 acc: 0.695170 lr: 0.002534\n", + "INFO:root:epoch: 32/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.993581 acc: 0.696196 lr: 0.002530\n", + "INFO:root:epoch: 32/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.988297 acc: 0.698047 lr: 0.002525\n", + "INFO:root:epoch: 32/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.982323 acc: 0.699125 lr: 0.002521\n", + "INFO:root:epoch: 32/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.981103 acc: 0.699038 lr: 0.002517\n", + "INFO:root:epoch: 32/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.983450 acc: 0.697685 lr: 0.002512\n", + "INFO:root:epoch: 32/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.984714 acc: 0.696429 lr: 0.002508\n", + "INFO:root:epoch: 32/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.982777 acc: 0.696552 lr: 0.002503\n", + "INFO:root:epoch: 32/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.986487 acc: 0.695104 lr: 0.002499\n", + "INFO:root:epoch: 32/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.986540 acc: 0.694758 lr: 0.002495\n", + "INFO:root:epoch: 33/100 starts\n", + "INFO:root:epoch: 33/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 1.017195 acc: 0.696875 lr: 0.002489\n", + "INFO:root:epoch: 33/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.995858 acc: 0.687500 lr: 0.002485\n", + "INFO:root:epoch: 33/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 1.054737 acc: 0.675000 lr: 0.002481\n", + "INFO:root:epoch: 33/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.030615 acc: 0.679688 lr: 0.002476\n", + "INFO:root:epoch: 33/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.012855 acc: 0.690000 lr: 0.002472\n", + "INFO:root:epoch: 33/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.986055 acc: 0.692708 lr: 0.002468\n", + "INFO:root:epoch: 33/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.975835 acc: 0.696875 lr: 0.002463\n", + "INFO:root:epoch: 33/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.981859 acc: 0.692187 lr: 0.002459\n", + "INFO:root:epoch: 33/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.985633 acc: 0.694444 lr: 0.002455\n", + "INFO:root:epoch: 33/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.983942 acc: 0.696250 lr: 0.002451\n", + "INFO:root:epoch: 33/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.982093 acc: 0.695739 lr: 0.002446\n", + "INFO:root:epoch: 33/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.985689 acc: 0.696615 lr: 0.002442\n", + "INFO:root:epoch: 33/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.995205 acc: 0.692067 lr: 0.002438\n", + "INFO:root:epoch: 33/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.989432 acc: 
0.692187 lr: 0.002434\n", + "INFO:root:epoch: 33/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.005926 acc: 0.687292 lr: 0.002430\n", + "INFO:root:epoch: 33/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.006885 acc: 0.687500 lr: 0.002425\n", + "INFO:root:epoch: 33/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.001893 acc: 0.688603 lr: 0.002421\n", + "INFO:root:epoch: 33/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.000617 acc: 0.689236 lr: 0.002417\n", + "INFO:root:epoch: 33/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.003761 acc: 0.689474 lr: 0.002413\n", + "INFO:root:epoch: 33/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.995013 acc: 0.691875 lr: 0.002409\n", + "INFO:root:epoch: 33/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.990297 acc: 0.694494 lr: 0.002404\n", + "INFO:root:epoch: 33/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.995231 acc: 0.692045 lr: 0.002400\n", + "INFO:root:epoch: 33/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.995214 acc: 0.692391 lr: 0.002396\n", + "INFO:root:epoch: 33/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.993845 acc: 0.692188 lr: 0.002392\n", + "INFO:root:epoch: 33/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.993042 acc: 0.692125 lr: 0.002388\n", + "INFO:root:epoch: 33/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.993076 acc: 0.691827 lr: 0.002384\n", + "INFO:root:epoch: 33/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.994440 acc: 0.691667 lr: 0.002380\n", + "INFO:root:epoch: 33/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.992376 acc: 0.692076 lr: 0.002375\n", + "INFO:root:epoch: 33/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.990601 acc: 0.692888 lr: 0.002371\n", + "INFO:root:epoch: 33/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.993241 acc: 0.692604 lr: 0.002367\n", + "INFO:root:epoch: 33/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.994272 acc: 0.692238 lr: 0.002363\n", + "INFO:root:epoch: 34/100 starts\n", + "INFO:root:epoch: 34/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.829187 acc: 0.737500 lr: 0.002358\n", + "INFO:root:epoch: 34/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.951155 acc: 0.692187 lr: 0.002354\n", + "INFO:root:epoch: 34/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.957783 acc: 0.692708 lr: 0.002350\n", + "INFO:root:epoch: 34/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.989606 acc: 0.681250 lr: 0.002346\n", + "INFO:root:epoch: 34/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.983047 acc: 0.685000 lr: 0.002341\n", + "INFO:root:epoch: 34/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.972665 acc: 0.692708 lr: 0.002337\n", + "INFO:root:epoch: 34/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.970072 acc: 0.696429 lr: 0.002333\n", + "INFO:root:epoch: 34/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.965343 acc: 0.698438 lr: 0.002329\n", + "INFO:root:epoch: 34/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.952787 acc: 0.703472 lr: 0.002325\n", + "INFO:root:epoch: 34/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.951892 acc: 0.702187 lr: 0.002321\n", + "INFO:root:epoch: 34/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.951053 acc: 0.702557 lr: 0.002317\n", + 
"INFO:root:epoch: 34/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.964162 acc: 0.698438 lr: 0.002313\n", + "INFO:root:epoch: 34/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.965018 acc: 0.699279 lr: 0.002309\n", + "INFO:root:epoch: 34/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.961824 acc: 0.699554 lr: 0.002305\n", + "INFO:root:epoch: 34/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.957807 acc: 0.701875 lr: 0.002301\n", + "INFO:root:epoch: 34/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.959694 acc: 0.700195 lr: 0.002297\n", + "INFO:root:epoch: 34/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.955124 acc: 0.702022 lr: 0.002293\n", + "INFO:root:epoch: 34/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.961792 acc: 0.700694 lr: 0.002289\n", + "INFO:root:epoch: 34/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.963842 acc: 0.700329 lr: 0.002285\n", + "INFO:root:epoch: 34/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.970577 acc: 0.699687 lr: 0.002281\n", + "INFO:root:epoch: 34/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.968876 acc: 0.700893 lr: 0.002277\n", + "INFO:root:epoch: 34/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.965493 acc: 0.702273 lr: 0.002274\n", + "INFO:root:epoch: 34/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.965751 acc: 0.700815 lr: 0.002270\n", + "INFO:root:epoch: 34/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.967889 acc: 0.700391 lr: 0.002266\n", + "INFO:root:epoch: 34/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.970088 acc: 0.699500 lr: 0.002262\n", + "INFO:root:epoch: 34/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.969501 acc: 0.699880 lr: 0.002258\n", + "INFO:root:epoch: 34/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.965315 acc: 0.701620 lr: 0.002254\n", + "INFO:root:epoch: 34/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.962363 acc: 0.702679 lr: 0.002250\n", + "INFO:root:epoch: 34/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.962301 acc: 0.701401 lr: 0.002246\n", + "INFO:root:epoch: 34/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.963125 acc: 0.699896 lr: 0.002242\n", + "INFO:root:epoch: 34/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.964709 acc: 0.700000 lr: 0.002238\n", + "INFO:root:epoch: 35/100 starts\n", + "INFO:root:epoch: 35/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 1.009297 acc: 0.696875 lr: 0.002233\n", + "INFO:root:epoch: 35/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 1.023766 acc: 0.692188 lr: 0.002229\n", + "INFO:root:epoch: 35/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 1.057812 acc: 0.683333 lr: 0.002226\n", + "INFO:root:epoch: 35/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.042550 acc: 0.686719 lr: 0.002222\n", + "INFO:root:epoch: 35/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.014933 acc: 0.692500 lr: 0.002218\n", + "INFO:root:epoch: 35/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.001648 acc: 0.700000 lr: 0.002214\n", + "INFO:root:epoch: 35/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.987618 acc: 0.702232 lr: 0.002210\n", + "INFO:root:epoch: 35/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.991443 acc: 0.701172 lr: 0.002206\n", + "INFO:root:epoch: 35/100 et: 4s eta: 10s 
batches: 90/313(28%) samples: 2880 loss: 0.997950 acc: 0.698958 lr: 0.002203\n", + "INFO:root:epoch: 35/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.994920 acc: 0.699063 lr: 0.002199\n", + "INFO:root:epoch: 35/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.984630 acc: 0.701705 lr: 0.002195\n", + "INFO:root:epoch: 35/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.991273 acc: 0.700000 lr: 0.002191\n", + "INFO:root:epoch: 35/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.988474 acc: 0.699519 lr: 0.002187\n", + "INFO:root:epoch: 35/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.996823 acc: 0.697768 lr: 0.002184\n", + "INFO:root:epoch: 35/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.989630 acc: 0.697292 lr: 0.002180\n", + "INFO:root:epoch: 35/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.985122 acc: 0.698047 lr: 0.002176\n", + "INFO:root:epoch: 35/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.984472 acc: 0.696875 lr: 0.002172\n", + "INFO:root:epoch: 35/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.979952 acc: 0.697743 lr: 0.002168\n", + "INFO:root:epoch: 35/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.978553 acc: 0.698191 lr: 0.002165\n", + "INFO:root:epoch: 35/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.977551 acc: 0.698281 lr: 0.002161\n", + "INFO:root:epoch: 35/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.970447 acc: 0.701488 lr: 0.002157\n", + "INFO:root:epoch: 35/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.968814 acc: 0.702841 lr: 0.002153\n", + "INFO:root:epoch: 35/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.968661 acc: 0.702174 lr: 0.002150\n", + "INFO:root:epoch: 35/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.969574 acc: 0.702214 lr: 0.002146\n", + "INFO:root:epoch: 35/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.970585 acc: 0.702125 lr: 0.002142\n", + "INFO:root:epoch: 35/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.972630 acc: 0.702284 lr: 0.002139\n", + "INFO:root:epoch: 35/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.971067 acc: 0.702083 lr: 0.002135\n", + "INFO:root:epoch: 35/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.968114 acc: 0.702455 lr: 0.002131\n", + "INFO:root:epoch: 35/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.966663 acc: 0.702802 lr: 0.002128\n", + "INFO:root:epoch: 35/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.964420 acc: 0.703958 lr: 0.002124\n", + "INFO:root:epoch: 35/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.960788 acc: 0.704839 lr: 0.002120\n", + "INFO:root:epoch: 36/100 starts\n", + "INFO:root:epoch: 36/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 1.091243 acc: 0.687500 lr: 0.002115\n", + "INFO:root:epoch: 36/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.977320 acc: 0.712500 lr: 0.002112\n", + "INFO:root:epoch: 36/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.966894 acc: 0.710417 lr: 0.002108\n", + "INFO:root:epoch: 36/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.945779 acc: 0.711719 lr: 0.002104\n", + "INFO:root:epoch: 36/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 0.927921 acc: 0.715625 lr: 0.002101\n", + "INFO:root:epoch: 36/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 
0.918030 acc: 0.718750 lr: 0.002097\n", + "INFO:root:epoch: 36/100 et: 3s eta: 10s batches: 70/313(22%) samples: 2240 loss: 0.921133 acc: 0.718304 lr: 0.002094\n", + "INFO:root:epoch: 36/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.933628 acc: 0.718750 lr: 0.002090\n", + "INFO:root:epoch: 36/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.930386 acc: 0.717361 lr: 0.002086\n", + "INFO:root:epoch: 36/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.934669 acc: 0.716562 lr: 0.002083\n", + "INFO:root:epoch: 36/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.944246 acc: 0.713920 lr: 0.002079\n", + "INFO:root:epoch: 36/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.947096 acc: 0.713802 lr: 0.002075\n", + "INFO:root:epoch: 36/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.939754 acc: 0.716827 lr: 0.002072\n", + "INFO:root:epoch: 36/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.941787 acc: 0.715625 lr: 0.002068\n", + "INFO:root:epoch: 36/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.941816 acc: 0.715833 lr: 0.002065\n", + "INFO:root:epoch: 36/100 et: 7s eta: 6s batches: 160/313(51%) samples: 5120 loss: 0.943801 acc: 0.714453 lr: 0.002061\n", + "INFO:root:epoch: 36/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.946200 acc: 0.714338 lr: 0.002058\n", + "INFO:root:epoch: 36/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.940760 acc: 0.714757 lr: 0.002054\n", + "INFO:root:epoch: 36/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.945329 acc: 0.713651 lr: 0.002050\n", + "INFO:root:epoch: 36/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.946733 acc: 0.713594 lr: 0.002047\n", + "INFO:root:epoch: 36/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.945527 acc: 0.714137 lr: 0.002043\n", + "INFO:root:epoch: 36/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.949205 acc: 0.712784 lr: 0.002040\n", + "INFO:root:epoch: 36/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.955257 acc: 0.711549 lr: 0.002036\n", + "INFO:root:epoch: 36/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.967109 acc: 0.706901 lr: 0.002033\n", + "INFO:root:epoch: 36/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.966555 acc: 0.707125 lr: 0.002029\n", + "INFO:root:epoch: 36/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.963576 acc: 0.708774 lr: 0.002026\n", + "INFO:root:epoch: 36/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.958365 acc: 0.709606 lr: 0.002022\n", + "INFO:root:epoch: 36/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.956237 acc: 0.710379 lr: 0.002019\n", + "INFO:root:epoch: 36/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.958801 acc: 0.709698 lr: 0.002015\n", + "INFO:root:epoch: 36/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.956470 acc: 0.709375 lr: 0.002012\n", + "INFO:root:epoch: 36/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.959181 acc: 0.708770 lr: 0.002008\n", + "INFO:root:epoch: 37/100 starts\n", + "INFO:root:epoch: 37/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 0.947274 acc: 0.696875 lr: 0.002004\n", + "INFO:root:epoch: 37/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.937142 acc: 0.706250 lr: 0.002000\n", + "INFO:root:epoch: 37/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.940878 acc: 0.715625 lr: 0.001997\n", 
+ "INFO:root:epoch: 37/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.966049 acc: 0.709375 lr: 0.001993\n", + "INFO:root:epoch: 37/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.001778 acc: 0.698750 lr: 0.001990\n", + "INFO:root:epoch: 37/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.985789 acc: 0.703646 lr: 0.001986\n", + "INFO:root:epoch: 37/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.976270 acc: 0.703571 lr: 0.001983\n", + "INFO:root:epoch: 37/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.971587 acc: 0.704688 lr: 0.001980\n", + "INFO:root:epoch: 37/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.962920 acc: 0.707639 lr: 0.001976\n", + "INFO:root:epoch: 37/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.954290 acc: 0.710625 lr: 0.001973\n", + "INFO:root:epoch: 37/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.943198 acc: 0.713920 lr: 0.001969\n", + "INFO:root:epoch: 37/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.946688 acc: 0.715365 lr: 0.001966\n", + "INFO:root:epoch: 37/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.947125 acc: 0.713942 lr: 0.001962\n", + "INFO:root:epoch: 37/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.945415 acc: 0.714732 lr: 0.001959\n", + "INFO:root:epoch: 37/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.939685 acc: 0.714583 lr: 0.001956\n", + "INFO:root:epoch: 37/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.941070 acc: 0.714063 lr: 0.001952\n", + "INFO:root:epoch: 37/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.938366 acc: 0.713419 lr: 0.001949\n", + "INFO:root:epoch: 37/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.935093 acc: 0.713194 lr: 0.001946\n", + "INFO:root:epoch: 37/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.931260 acc: 0.713651 lr: 0.001942\n", + "INFO:root:epoch: 37/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.933529 acc: 0.711875 lr: 0.001939\n", + "INFO:root:epoch: 37/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.937069 acc: 0.711012 lr: 0.001935\n", + "INFO:root:epoch: 37/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.938863 acc: 0.710511 lr: 0.001932\n", + "INFO:root:epoch: 37/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.939828 acc: 0.709783 lr: 0.001929\n", + "INFO:root:epoch: 37/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.939845 acc: 0.710156 lr: 0.001925\n", + "INFO:root:epoch: 37/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.940458 acc: 0.709625 lr: 0.001922\n", + "INFO:root:epoch: 37/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.944485 acc: 0.707452 lr: 0.001919\n", + "INFO:root:epoch: 37/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.943645 acc: 0.706944 lr: 0.001915\n", + "INFO:root:epoch: 37/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.945993 acc: 0.707366 lr: 0.001912\n", + "INFO:root:epoch: 37/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.941415 acc: 0.708297 lr: 0.001909\n", + "INFO:root:epoch: 37/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.940199 acc: 0.709167 lr: 0.001906\n", + "INFO:root:epoch: 37/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.940290 acc: 0.709980 lr: 0.001902\n", + "INFO:root:epoch: 38/100 starts\n", + "INFO:root:epoch: 38/100 et: 0s 
eta: 14s batches: 10/313(3%) samples: 320 loss: 0.915666 acc: 0.690625 lr: 0.001898\n", + "INFO:root:epoch: 38/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.961178 acc: 0.693750 lr: 0.001895\n", + "INFO:root:epoch: 38/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.954265 acc: 0.690625 lr: 0.001891\n", + "INFO:root:epoch: 38/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.928703 acc: 0.699219 lr: 0.001888\n", + "INFO:root:epoch: 38/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.951760 acc: 0.690000 lr: 0.001885\n", + "INFO:root:epoch: 38/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.931337 acc: 0.694271 lr: 0.001882\n", + "INFO:root:epoch: 38/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.919142 acc: 0.704018 lr: 0.001878\n", + "INFO:root:epoch: 38/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.930935 acc: 0.697656 lr: 0.001875\n", + "INFO:root:epoch: 38/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.920327 acc: 0.706250 lr: 0.001872\n", + "INFO:root:epoch: 38/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.918248 acc: 0.709375 lr: 0.001869\n", + "INFO:root:epoch: 38/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.916976 acc: 0.708239 lr: 0.001865\n", + "INFO:root:epoch: 38/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.913504 acc: 0.708073 lr: 0.001862\n", + "INFO:root:epoch: 38/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.920087 acc: 0.707692 lr: 0.001859\n", + "INFO:root:epoch: 38/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.912807 acc: 0.710491 lr: 0.001856\n", + "INFO:root:epoch: 38/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.901150 acc: 0.715417 lr: 0.001852\n", + "INFO:root:epoch: 38/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.902415 acc: 0.716406 lr: 0.001849\n", + "INFO:root:epoch: 38/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.903299 acc: 0.716176 lr: 0.001846\n", + "INFO:root:epoch: 38/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.902306 acc: 0.715799 lr: 0.001843\n", + "INFO:root:epoch: 38/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.905362 acc: 0.714803 lr: 0.001840\n", + "INFO:root:epoch: 38/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.909509 acc: 0.715000 lr: 0.001836\n", + "INFO:root:epoch: 38/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.912586 acc: 0.715476 lr: 0.001833\n", + "INFO:root:epoch: 38/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.914302 acc: 0.713636 lr: 0.001830\n", + "INFO:root:epoch: 38/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.915388 acc: 0.714946 lr: 0.001827\n", + "INFO:root:epoch: 38/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.917406 acc: 0.713411 lr: 0.001824\n", + "INFO:root:epoch: 38/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.919715 acc: 0.712750 lr: 0.001821\n", + "INFO:root:epoch: 38/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.919115 acc: 0.713101 lr: 0.001817\n", + "INFO:root:epoch: 38/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.921368 acc: 0.712731 lr: 0.001814\n", + "INFO:root:epoch: 38/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.923399 acc: 0.712165 lr: 0.001811\n", + "INFO:root:epoch: 38/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.923672 acc: 0.711746 lr: 
0.001808\n",
+ "INFO:root:epoch: 38/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.928174 acc: 0.709896 lr: 0.001805\n",
+ "INFO:root:epoch: 38/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.925099 acc: 0.710484 lr: 0.001802\n",
+ "[... per-batch training log truncated for epochs 39-56/100 (~313 batches per epoch, 9920 samples each): loss decreases from ~0.93 to ~0.84, accuracy rises from ~0.71 to ~0.74, and the learning rate decays from ~0.0018 to ~0.00068 ...]\n",
+ "INFO:root:epoch: 57/100 starts\n",
+ "INFO:root:epoch: 57/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.909517 acc: 0.725000 lr: 0.000677\n",
+ "INFO:root:epoch: 57/100 et: 0s eta: 14s batches: 20/313(6%) samples: 640 loss: 0.867697 acc: 0.728125 lr: 0.000676\n",
+ "INFO:root:epoch: 57/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.840410 acc: 0.731250 lr: 0.000675\n",
+ "INFO:root:epoch: 57/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.805389 acc: 0.741406 lr: 0.000674\n",
+ "INFO:root:epoch: 57/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.814985 acc: 0.741250 lr: 0.000673\n",
+ "INFO:root:epoch: 57/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.809866 acc: 0.744271 lr: 0.000671\n",
+ "INFO:root:epoch: 57/100
et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.817273 acc: 0.741964 lr: 0.000670\n", + "INFO:root:epoch: 57/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.812742 acc: 0.741016 lr: 0.000669\n", + "INFO:root:epoch: 57/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.815324 acc: 0.743056 lr: 0.000668\n", + "INFO:root:epoch: 57/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.821292 acc: 0.738437 lr: 0.000667\n", + "INFO:root:epoch: 57/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.816058 acc: 0.739773 lr: 0.000666\n", + "INFO:root:epoch: 57/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.817590 acc: 0.739583 lr: 0.000664\n", + "INFO:root:epoch: 57/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.818817 acc: 0.738221 lr: 0.000663\n", + "INFO:root:epoch: 57/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.816559 acc: 0.736607 lr: 0.000662\n", + "INFO:root:epoch: 57/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.825244 acc: 0.737083 lr: 0.000661\n", + "INFO:root:epoch: 57/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.823392 acc: 0.738281 lr: 0.000660\n", + "INFO:root:epoch: 57/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.823858 acc: 0.739154 lr: 0.000659\n", + "INFO:root:epoch: 57/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.822548 acc: 0.739757 lr: 0.000658\n", + "INFO:root:epoch: 57/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.820885 acc: 0.741283 lr: 0.000656\n", + "INFO:root:epoch: 57/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.820076 acc: 0.742500 lr: 0.000655\n", + "INFO:root:epoch: 57/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.824961 acc: 0.741220 lr: 0.000654\n", + "INFO:root:epoch: 57/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.825085 acc: 0.741335 lr: 0.000653\n", + "INFO:root:epoch: 57/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.827870 acc: 0.739810 lr: 0.000652\n", + "INFO:root:epoch: 57/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.828434 acc: 0.739844 lr: 0.000651\n", + "INFO:root:epoch: 57/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.828785 acc: 0.739250 lr: 0.000650\n", + "INFO:root:epoch: 57/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.827633 acc: 0.739543 lr: 0.000648\n", + "INFO:root:epoch: 57/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.825659 acc: 0.739699 lr: 0.000647\n", + "INFO:root:epoch: 57/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.823095 acc: 0.739732 lr: 0.000646\n", + "INFO:root:epoch: 57/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.823756 acc: 0.739009 lr: 0.000645\n", + "INFO:root:epoch: 57/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.825304 acc: 0.738854 lr: 0.000644\n", + "INFO:root:epoch: 57/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.827320 acc: 0.738206 lr: 0.000643\n", + "INFO:root:epoch: 58/100 starts\n", + "INFO:root:epoch: 58/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.798118 acc: 0.734375 lr: 0.000641\n", + "INFO:root:epoch: 58/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.830979 acc: 0.731250 lr: 0.000640\n", + "INFO:root:epoch: 58/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.848662 acc: 0.725000 lr: 0.000639\n", + "INFO:root:epoch: 58/100 et: 1s eta: 12s batches: 40/313(12%) 
samples: 1280 loss: 0.838207 acc: 0.728906 lr: 0.000638\n", + "INFO:root:epoch: 58/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 0.838796 acc: 0.736875 lr: 0.000637\n", + "INFO:root:epoch: 58/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.818739 acc: 0.743229 lr: 0.000636\n", + "INFO:root:epoch: 58/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.809720 acc: 0.745536 lr: 0.000635\n", + "INFO:root:epoch: 58/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.822597 acc: 0.743750 lr: 0.000634\n", + "INFO:root:epoch: 58/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.830851 acc: 0.738889 lr: 0.000633\n", + "INFO:root:epoch: 58/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.834010 acc: 0.738750 lr: 0.000632\n", + "INFO:root:epoch: 58/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.829917 acc: 0.738352 lr: 0.000630\n", + "INFO:root:epoch: 58/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.823273 acc: 0.741667 lr: 0.000629\n", + "INFO:root:epoch: 58/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.812068 acc: 0.746154 lr: 0.000628\n", + "INFO:root:epoch: 58/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.811988 acc: 0.746875 lr: 0.000627\n", + "INFO:root:epoch: 58/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.805599 acc: 0.748542 lr: 0.000626\n", + "INFO:root:epoch: 58/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.806976 acc: 0.748242 lr: 0.000625\n", + "INFO:root:epoch: 58/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.811154 acc: 0.746507 lr: 0.000624\n", + "INFO:root:epoch: 58/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.809796 acc: 0.746181 lr: 0.000623\n", + "INFO:root:epoch: 58/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.809086 acc: 0.748520 lr: 0.000622\n", + "INFO:root:epoch: 58/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.810409 acc: 0.746719 lr: 0.000621\n", + "INFO:root:epoch: 58/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.812442 acc: 0.745833 lr: 0.000620\n", + "INFO:root:epoch: 58/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.811211 acc: 0.746591 lr: 0.000619\n", + "INFO:root:epoch: 58/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.812541 acc: 0.746875 lr: 0.000617\n", + "INFO:root:epoch: 58/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.816855 acc: 0.744531 lr: 0.000616\n", + "INFO:root:epoch: 58/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.819121 acc: 0.744000 lr: 0.000615\n", + "INFO:root:epoch: 58/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.819055 acc: 0.742668 lr: 0.000614\n", + "INFO:root:epoch: 58/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.820233 acc: 0.742940 lr: 0.000613\n", + "INFO:root:epoch: 58/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.818864 acc: 0.743973 lr: 0.000612\n", + "INFO:root:epoch: 58/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.821589 acc: 0.743211 lr: 0.000611\n", + "INFO:root:epoch: 58/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.821042 acc: 0.743229 lr: 0.000610\n", + "INFO:root:epoch: 58/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.823857 acc: 0.743145 lr: 0.000609\n", + "INFO:root:epoch: 59/100 starts\n", + "INFO:root:epoch: 59/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 0.789549 acc: 
0.765625 lr: 0.000608\n", + "INFO:root:epoch: 59/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.790341 acc: 0.760938 lr: 0.000607\n", + "INFO:root:epoch: 59/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.806979 acc: 0.755208 lr: 0.000605\n", + "INFO:root:epoch: 59/100 et: 1s eta: 13s batches: 40/313(12%) samples: 1280 loss: 0.815923 acc: 0.753906 lr: 0.000604\n", + "INFO:root:epoch: 59/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.822824 acc: 0.753750 lr: 0.000603\n", + "INFO:root:epoch: 59/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.822709 acc: 0.751563 lr: 0.000602\n", + "INFO:root:epoch: 59/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.832924 acc: 0.747321 lr: 0.000601\n", + "INFO:root:epoch: 59/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.847297 acc: 0.740234 lr: 0.000600\n", + "INFO:root:epoch: 59/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.860101 acc: 0.734375 lr: 0.000599\n", + "INFO:root:epoch: 59/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.860811 acc: 0.734062 lr: 0.000598\n", + "INFO:root:epoch: 59/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.854995 acc: 0.737500 lr: 0.000597\n", + "INFO:root:epoch: 59/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.848008 acc: 0.739844 lr: 0.000596\n", + "INFO:root:epoch: 59/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.834319 acc: 0.743029 lr: 0.000595\n", + "INFO:root:epoch: 59/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.833186 acc: 0.741518 lr: 0.000594\n", + "INFO:root:epoch: 59/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.835036 acc: 0.741458 lr: 0.000593\n", + "INFO:root:epoch: 59/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.833203 acc: 0.741602 lr: 0.000592\n", + "INFO:root:epoch: 59/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.833846 acc: 0.740625 lr: 0.000591\n", + "INFO:root:epoch: 59/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.831953 acc: 0.741493 lr: 0.000590\n", + "INFO:root:epoch: 59/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.831190 acc: 0.741941 lr: 0.000589\n", + "INFO:root:epoch: 59/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.839709 acc: 0.739688 lr: 0.000588\n", + "INFO:root:epoch: 59/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.841757 acc: 0.737500 lr: 0.000587\n", + "INFO:root:epoch: 59/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.842198 acc: 0.737642 lr: 0.000586\n", + "INFO:root:epoch: 59/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.841420 acc: 0.736821 lr: 0.000585\n", + "INFO:root:epoch: 59/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.838895 acc: 0.738021 lr: 0.000584\n", + "INFO:root:epoch: 59/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.834055 acc: 0.739500 lr: 0.000583\n", + "INFO:root:epoch: 59/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.834251 acc: 0.739904 lr: 0.000582\n", + "INFO:root:epoch: 59/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.834817 acc: 0.739005 lr: 0.000581\n", + "INFO:root:epoch: 59/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.835571 acc: 0.738616 lr: 0.000580\n", + "INFO:root:epoch: 59/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.833812 acc: 0.740409 lr: 0.000579\n", + "INFO:root:epoch: 59/100 et: 13s eta: 0s batches: 
300/313(95%) samples: 9600 loss: 0.830910 acc: 0.741979 lr: 0.000578\n", + "INFO:root:epoch: 59/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.831648 acc: 0.742036 lr: 0.000577\n", + "INFO:root:epoch: 60/100 starts\n", + "INFO:root:epoch: 60/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.885892 acc: 0.775000 lr: 0.000576\n", + "INFO:root:epoch: 60/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.879384 acc: 0.754688 lr: 0.000575\n", + "INFO:root:epoch: 60/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.839739 acc: 0.762500 lr: 0.000574\n", + "INFO:root:epoch: 60/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.829774 acc: 0.758594 lr: 0.000573\n", + "INFO:root:epoch: 60/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 0.843364 acc: 0.753125 lr: 0.000572\n", + "INFO:root:epoch: 60/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.826290 acc: 0.756250 lr: 0.000571\n", + "INFO:root:epoch: 60/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.831055 acc: 0.752232 lr: 0.000570\n", + "INFO:root:epoch: 60/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.820983 acc: 0.754297 lr: 0.000569\n", + "INFO:root:epoch: 60/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.812713 acc: 0.754167 lr: 0.000568\n", + "INFO:root:epoch: 60/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.808644 acc: 0.753750 lr: 0.000567\n", + "INFO:root:epoch: 60/100 et: 4s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.812561 acc: 0.750284 lr: 0.000566\n", + "INFO:root:epoch: 60/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.824937 acc: 0.746615 lr: 0.000565\n", + "INFO:root:epoch: 60/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.827743 acc: 0.744471 lr: 0.000564\n", + "INFO:root:epoch: 60/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.827845 acc: 0.742188 lr: 0.000563\n", + "INFO:root:epoch: 60/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.824984 acc: 0.742500 lr: 0.000562\n", + "INFO:root:epoch: 60/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.820623 acc: 0.742773 lr: 0.000561\n", + "INFO:root:epoch: 60/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.824389 acc: 0.740441 lr: 0.000560\n", + "INFO:root:epoch: 60/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.819917 acc: 0.741840 lr: 0.000559\n", + "INFO:root:epoch: 60/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.821452 acc: 0.741612 lr: 0.000558\n", + "INFO:root:epoch: 60/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.821996 acc: 0.740781 lr: 0.000557\n", + "INFO:root:epoch: 60/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.825038 acc: 0.741071 lr: 0.000556\n", + "INFO:root:epoch: 60/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.833056 acc: 0.739489 lr: 0.000555\n", + "INFO:root:epoch: 60/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.834126 acc: 0.740625 lr: 0.000554\n", + "INFO:root:epoch: 60/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.838261 acc: 0.739193 lr: 0.000553\n", + "INFO:root:epoch: 60/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.838391 acc: 0.738250 lr: 0.000552\n", + "INFO:root:epoch: 60/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.837515 acc: 0.738462 lr: 0.000551\n", + "INFO:root:epoch: 60/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.840794 
acc: 0.738079 lr: 0.000550\n", + "INFO:root:epoch: 60/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.841568 acc: 0.738058 lr: 0.000549\n", + "INFO:root:epoch: 60/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.842523 acc: 0.738362 lr: 0.000548\n", + "INFO:root:epoch: 60/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.844872 acc: 0.737187 lr: 0.000547\n", + "INFO:root:epoch: 60/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.842277 acc: 0.736794 lr: 0.000546\n", + "INFO:root:epoch: 61/100 starts\n", + "INFO:root:epoch: 61/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.805544 acc: 0.753125 lr: 0.000545\n", + "INFO:root:epoch: 61/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.833780 acc: 0.742188 lr: 0.000544\n", + "INFO:root:epoch: 61/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.839274 acc: 0.740625 lr: 0.000543\n", + "INFO:root:epoch: 61/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.833520 acc: 0.744531 lr: 0.000542\n", + "INFO:root:epoch: 61/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.835566 acc: 0.743125 lr: 0.000541\n", + "INFO:root:epoch: 61/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.820548 acc: 0.746875 lr: 0.000540\n", + "INFO:root:epoch: 61/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.813290 acc: 0.748214 lr: 0.000539\n", + "INFO:root:epoch: 61/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.806995 acc: 0.748437 lr: 0.000539\n", + "INFO:root:epoch: 61/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.822883 acc: 0.743403 lr: 0.000538\n", + "INFO:root:epoch: 61/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.815852 acc: 0.746250 lr: 0.000537\n", + "INFO:root:epoch: 61/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.810577 acc: 0.747159 lr: 0.000536\n", + "INFO:root:epoch: 61/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.816468 acc: 0.744010 lr: 0.000535\n", + "INFO:root:epoch: 61/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.810262 acc: 0.745913 lr: 0.000534\n", + "INFO:root:epoch: 61/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.818650 acc: 0.743973 lr: 0.000533\n", + "INFO:root:epoch: 61/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.823645 acc: 0.742083 lr: 0.000532\n", + "INFO:root:epoch: 61/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.829649 acc: 0.740039 lr: 0.000531\n", + "INFO:root:epoch: 61/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.826165 acc: 0.740257 lr: 0.000530\n", + "INFO:root:epoch: 61/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.823962 acc: 0.740972 lr: 0.000529\n", + "INFO:root:epoch: 61/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.824045 acc: 0.741447 lr: 0.000528\n", + "INFO:root:epoch: 61/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.822660 acc: 0.741875 lr: 0.000527\n", + "INFO:root:epoch: 61/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.822528 acc: 0.741815 lr: 0.000527\n", + "INFO:root:epoch: 61/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.820247 acc: 0.741477 lr: 0.000526\n", + "INFO:root:epoch: 61/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.824795 acc: 0.739810 lr: 0.000525\n", + "INFO:root:epoch: 61/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.825198 acc: 0.739193 lr: 0.000524\n", + 
"INFO:root:epoch: 61/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.820621 acc: 0.741750 lr: 0.000523\n", + "INFO:root:epoch: 61/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.822385 acc: 0.740986 lr: 0.000522\n", + "INFO:root:epoch: 61/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.825277 acc: 0.739815 lr: 0.000521\n", + "INFO:root:epoch: 61/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.827828 acc: 0.738728 lr: 0.000520\n", + "INFO:root:epoch: 61/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.824658 acc: 0.739547 lr: 0.000519\n", + "INFO:root:epoch: 61/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.828213 acc: 0.737813 lr: 0.000518\n", + "INFO:root:epoch: 61/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.828741 acc: 0.738206 lr: 0.000518\n", + "INFO:root:epoch: 62/100 starts\n", + "INFO:root:epoch: 62/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 0.844680 acc: 0.765625 lr: 0.000516\n", + "INFO:root:epoch: 62/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.862761 acc: 0.767188 lr: 0.000515\n", + "INFO:root:epoch: 62/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.863700 acc: 0.759375 lr: 0.000515\n", + "INFO:root:epoch: 62/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.832684 acc: 0.765625 lr: 0.000514\n", + "INFO:root:epoch: 62/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.837586 acc: 0.765625 lr: 0.000513\n", + "INFO:root:epoch: 62/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.814643 acc: 0.770313 lr: 0.000512\n", + "INFO:root:epoch: 62/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.811375 acc: 0.763839 lr: 0.000511\n", + "INFO:root:epoch: 62/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.820167 acc: 0.757422 lr: 0.000510\n", + "INFO:root:epoch: 62/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.824017 acc: 0.757986 lr: 0.000509\n", + "INFO:root:epoch: 62/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.825212 acc: 0.756563 lr: 0.000508\n", + "INFO:root:epoch: 62/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.821173 acc: 0.757102 lr: 0.000507\n", + "INFO:root:epoch: 62/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.820457 acc: 0.756250 lr: 0.000507\n", + "INFO:root:epoch: 62/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.819386 acc: 0.756010 lr: 0.000506\n", + "INFO:root:epoch: 62/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.821911 acc: 0.753571 lr: 0.000505\n", + "INFO:root:epoch: 62/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.819128 acc: 0.752708 lr: 0.000504\n", + "INFO:root:epoch: 62/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.816960 acc: 0.753320 lr: 0.000503\n", + "INFO:root:epoch: 62/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.815645 acc: 0.753493 lr: 0.000502\n", + "INFO:root:epoch: 62/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.819335 acc: 0.750868 lr: 0.000501\n", + "INFO:root:epoch: 62/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.815293 acc: 0.750658 lr: 0.000500\n", + "INFO:root:epoch: 62/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.813565 acc: 0.750312 lr: 0.000500\n", + "INFO:root:epoch: 62/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.819647 acc: 0.748065 lr: 0.000499\n", + "INFO:root:epoch: 62/100 et: 10s eta: 4s 
batches: 220/313(70%) samples: 7040 loss: 0.821927 acc: 0.747585 lr: 0.000498\n", + "INFO:root:epoch: 62/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.819023 acc: 0.748505 lr: 0.000497\n", + "INFO:root:epoch: 62/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.820804 acc: 0.747266 lr: 0.000496\n", + "INFO:root:epoch: 62/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.820380 acc: 0.747250 lr: 0.000495\n", + "INFO:root:epoch: 62/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.814690 acc: 0.748558 lr: 0.000494\n", + "INFO:root:epoch: 62/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.814140 acc: 0.750000 lr: 0.000494\n", + "INFO:root:epoch: 62/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.812839 acc: 0.750781 lr: 0.000493\n", + "INFO:root:epoch: 62/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.812621 acc: 0.750216 lr: 0.000492\n", + "INFO:root:epoch: 62/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.813526 acc: 0.748958 lr: 0.000491\n", + "INFO:root:epoch: 62/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.815095 acc: 0.747379 lr: 0.000490\n", + "INFO:root:epoch: 63/100 starts\n", + "INFO:root:epoch: 63/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.796004 acc: 0.759375 lr: 0.000489\n", + "INFO:root:epoch: 63/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.827343 acc: 0.740625 lr: 0.000488\n", + "INFO:root:epoch: 63/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.810678 acc: 0.753125 lr: 0.000487\n", + "INFO:root:epoch: 63/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.817466 acc: 0.749219 lr: 0.000487\n", + "INFO:root:epoch: 63/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.814195 acc: 0.745625 lr: 0.000486\n", + "INFO:root:epoch: 63/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.837556 acc: 0.738021 lr: 0.000485\n", + "INFO:root:epoch: 63/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.830012 acc: 0.738839 lr: 0.000484\n", + "INFO:root:epoch: 63/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.845837 acc: 0.737891 lr: 0.000483\n", + "INFO:root:epoch: 63/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.837978 acc: 0.742014 lr: 0.000482\n", + "INFO:root:epoch: 63/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.839345 acc: 0.739063 lr: 0.000482\n", + "INFO:root:epoch: 63/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.840577 acc: 0.738068 lr: 0.000481\n", + "INFO:root:epoch: 63/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.837437 acc: 0.739323 lr: 0.000480\n", + "INFO:root:epoch: 63/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.833851 acc: 0.740865 lr: 0.000479\n", + "INFO:root:epoch: 63/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.831390 acc: 0.739286 lr: 0.000478\n", + "INFO:root:epoch: 63/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.825265 acc: 0.740417 lr: 0.000477\n", + "INFO:root:epoch: 63/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.823050 acc: 0.742773 lr: 0.000477\n", + "INFO:root:epoch: 63/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.817195 acc: 0.744485 lr: 0.000476\n", + "INFO:root:epoch: 63/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.818153 acc: 0.743750 lr: 0.000475\n", + "INFO:root:epoch: 63/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 
0.819347 acc: 0.743914 lr: 0.000474\n", + "INFO:root:epoch: 63/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.820129 acc: 0.742813 lr: 0.000473\n", + "INFO:root:epoch: 63/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.822119 acc: 0.741667 lr: 0.000472\n", + "INFO:root:epoch: 63/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.820984 acc: 0.741903 lr: 0.000472\n", + "INFO:root:epoch: 63/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.818035 acc: 0.743071 lr: 0.000471\n", + "INFO:root:epoch: 63/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.820342 acc: 0.742708 lr: 0.000470\n", + "INFO:root:epoch: 63/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.822475 acc: 0.742000 lr: 0.000469\n", + "INFO:root:epoch: 63/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.822982 acc: 0.741466 lr: 0.000468\n", + "INFO:root:epoch: 63/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.818938 acc: 0.742361 lr: 0.000468\n", + "INFO:root:epoch: 63/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.822081 acc: 0.741518 lr: 0.000467\n", + "INFO:root:epoch: 63/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.819623 acc: 0.741595 lr: 0.000466\n", + "INFO:root:epoch: 63/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.818924 acc: 0.742396 lr: 0.000465\n", + "INFO:root:epoch: 63/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.819387 acc: 0.742238 lr: 0.000464\n", + "INFO:root:epoch: 64/100 starts\n", + "INFO:root:epoch: 64/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.781804 acc: 0.743750 lr: 0.000463\n", + "INFO:root:epoch: 64/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.778554 acc: 0.751563 lr: 0.000462\n", + "INFO:root:epoch: 64/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.765933 acc: 0.766667 lr: 0.000462\n", + "INFO:root:epoch: 64/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.799363 acc: 0.753125 lr: 0.000461\n", + "INFO:root:epoch: 64/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.818070 acc: 0.745625 lr: 0.000460\n", + "INFO:root:epoch: 64/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.788903 acc: 0.754687 lr: 0.000459\n", + "INFO:root:epoch: 64/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.792313 acc: 0.753125 lr: 0.000458\n", + "INFO:root:epoch: 64/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.794060 acc: 0.752734 lr: 0.000458\n", + "INFO:root:epoch: 64/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.796208 acc: 0.750694 lr: 0.000457\n", + "INFO:root:epoch: 64/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.801066 acc: 0.749687 lr: 0.000456\n", + "INFO:root:epoch: 64/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.806042 acc: 0.748011 lr: 0.000455\n", + "INFO:root:epoch: 64/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.813346 acc: 0.748177 lr: 0.000455\n", + "INFO:root:epoch: 64/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.820467 acc: 0.746635 lr: 0.000454\n", + "INFO:root:epoch: 64/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.826604 acc: 0.743973 lr: 0.000453\n", + "INFO:root:epoch: 64/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.825852 acc: 0.743750 lr: 0.000452\n", + "INFO:root:epoch: 64/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.822501 acc: 0.744727 lr: 0.000451\n", 
+ "INFO:root:epoch: 64/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.822466 acc: 0.744301 lr: 0.000451\n", + "INFO:root:epoch: 64/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.822896 acc: 0.745486 lr: 0.000450\n", + "INFO:root:epoch: 64/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.827110 acc: 0.743750 lr: 0.000449\n", + "INFO:root:epoch: 64/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.832103 acc: 0.742187 lr: 0.000448\n", + "INFO:root:epoch: 64/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.831318 acc: 0.743155 lr: 0.000447\n", + "INFO:root:epoch: 64/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.824343 acc: 0.745170 lr: 0.000447\n", + "INFO:root:epoch: 64/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.825736 acc: 0.743342 lr: 0.000446\n", + "INFO:root:epoch: 64/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.823029 acc: 0.744141 lr: 0.000445\n", + "INFO:root:epoch: 64/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.821226 acc: 0.744250 lr: 0.000444\n", + "INFO:root:epoch: 64/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.821424 acc: 0.743630 lr: 0.000444\n", + "INFO:root:epoch: 64/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.823106 acc: 0.744213 lr: 0.000443\n", + "INFO:root:epoch: 64/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.819443 acc: 0.744866 lr: 0.000442\n", + "INFO:root:epoch: 64/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.813378 acc: 0.746444 lr: 0.000441\n", + "INFO:root:epoch: 64/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.814149 acc: 0.745521 lr: 0.000441\n", + "INFO:root:epoch: 64/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.811714 acc: 0.746169 lr: 0.000440\n", + "INFO:root:epoch: 65/100 starts\n", + "INFO:root:epoch: 65/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.758955 acc: 0.775000 lr: 0.000439\n", + "INFO:root:epoch: 65/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.791680 acc: 0.759375 lr: 0.000438\n", + "INFO:root:epoch: 65/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.805555 acc: 0.753125 lr: 0.000437\n", + "INFO:root:epoch: 65/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.838570 acc: 0.740625 lr: 0.000437\n", + "INFO:root:epoch: 65/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.835562 acc: 0.741875 lr: 0.000436\n", + "INFO:root:epoch: 65/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.838233 acc: 0.738542 lr: 0.000435\n", + "INFO:root:epoch: 65/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.834390 acc: 0.740625 lr: 0.000434\n", + "INFO:root:epoch: 65/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.829112 acc: 0.741797 lr: 0.000434\n", + "INFO:root:epoch: 65/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.826130 acc: 0.744097 lr: 0.000433\n", + "INFO:root:epoch: 65/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.822832 acc: 0.742500 lr: 0.000432\n", + "INFO:root:epoch: 65/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.814990 acc: 0.744886 lr: 0.000431\n", + "INFO:root:epoch: 65/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.814008 acc: 0.745052 lr: 0.000431\n", + "INFO:root:epoch: 65/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.817677 acc: 0.743990 lr: 0.000430\n", + "INFO:root:epoch: 65/100 et: 6s eta: 
7s batches: 140/313(44%) samples: 4480 loss: 0.823342 acc: 0.741741 lr: 0.000429\n", + "INFO:root:epoch: 65/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.819397 acc: 0.742083 lr: 0.000428\n", + "INFO:root:epoch: 65/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.824206 acc: 0.741211 lr: 0.000428\n", + "INFO:root:epoch: 65/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.820364 acc: 0.743566 lr: 0.000427\n", + "INFO:root:epoch: 65/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.825177 acc: 0.742882 lr: 0.000426\n", + "INFO:root:epoch: 65/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.820837 acc: 0.743257 lr: 0.000425\n", + "INFO:root:epoch: 65/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.821903 acc: 0.743125 lr: 0.000425\n", + "INFO:root:epoch: 65/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.824014 acc: 0.741964 lr: 0.000424\n", + "INFO:root:epoch: 65/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.824602 acc: 0.742330 lr: 0.000423\n", + "INFO:root:epoch: 65/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.826400 acc: 0.742391 lr: 0.000422\n", + "INFO:root:epoch: 65/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.825911 acc: 0.742187 lr: 0.000422\n", + "INFO:root:epoch: 65/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.832440 acc: 0.739625 lr: 0.000421\n", + "INFO:root:epoch: 65/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.830574 acc: 0.740625 lr: 0.000420\n", + "INFO:root:epoch: 65/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.828070 acc: 0.742130 lr: 0.000419\n", + "INFO:root:epoch: 65/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.826871 acc: 0.742522 lr: 0.000419\n", + "INFO:root:epoch: 65/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.829686 acc: 0.742457 lr: 0.000418\n", + "INFO:root:epoch: 65/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.827813 acc: 0.742292 lr: 0.000417\n", + "INFO:root:epoch: 65/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.830362 acc: 0.741835 lr: 0.000417\n", + "INFO:root:epoch: 66/100 starts\n", + "INFO:root:epoch: 66/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 0.745997 acc: 0.765625 lr: 0.000416\n", + "INFO:root:epoch: 66/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.756653 acc: 0.767187 lr: 0.000415\n", + "INFO:root:epoch: 66/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.760714 acc: 0.768750 lr: 0.000414\n", + "INFO:root:epoch: 66/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.774805 acc: 0.764844 lr: 0.000413\n", + "INFO:root:epoch: 66/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.775891 acc: 0.766875 lr: 0.000413\n", + "INFO:root:epoch: 66/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.783079 acc: 0.761979 lr: 0.000412\n", + "INFO:root:epoch: 66/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.784908 acc: 0.760268 lr: 0.000411\n", + "INFO:root:epoch: 66/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.784080 acc: 0.760938 lr: 0.000411\n", + "INFO:root:epoch: 66/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.787995 acc: 0.761458 lr: 0.000410\n", + "INFO:root:epoch: 66/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.794286 acc: 0.760312 lr: 0.000409\n", + "INFO:root:epoch: 66/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 
loss: 0.803357 acc: 0.759659 lr: 0.000409\n", + "INFO:root:epoch: 66/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.806682 acc: 0.755729 lr: 0.000408\n", + "INFO:root:epoch: 66/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.811768 acc: 0.755048 lr: 0.000407\n", + "INFO:root:epoch: 66/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.816728 acc: 0.753571 lr: 0.000406\n", + "INFO:root:epoch: 66/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.819195 acc: 0.750000 lr: 0.000406\n", + "INFO:root:epoch: 66/100 et: 7s eta: 6s batches: 160/313(51%) samples: 5120 loss: 0.815099 acc: 0.751172 lr: 0.000405\n", + "INFO:root:epoch: 66/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.815097 acc: 0.750919 lr: 0.000404\n", + "INFO:root:epoch: 66/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.818380 acc: 0.749132 lr: 0.000404\n", + "INFO:root:epoch: 66/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.818319 acc: 0.747533 lr: 0.000403\n", + "INFO:root:epoch: 66/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.817883 acc: 0.747656 lr: 0.000402\n", + "INFO:root:epoch: 66/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.815824 acc: 0.748512 lr: 0.000401\n", + "INFO:root:epoch: 66/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.814059 acc: 0.749006 lr: 0.000401\n", + "INFO:root:epoch: 66/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.817707 acc: 0.747826 lr: 0.000400\n", + "INFO:root:epoch: 66/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.819252 acc: 0.746354 lr: 0.000399\n", + "INFO:root:epoch: 66/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.822368 acc: 0.745875 lr: 0.000399\n", + "INFO:root:epoch: 66/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.819690 acc: 0.745913 lr: 0.000398\n", + "INFO:root:epoch: 66/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.821290 acc: 0.744444 lr: 0.000397\n", + "INFO:root:epoch: 66/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.820228 acc: 0.744978 lr: 0.000397\n", + "INFO:root:epoch: 66/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.817387 acc: 0.745797 lr: 0.000396\n", + "INFO:root:epoch: 66/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.817338 acc: 0.746458 lr: 0.000395\n", + "INFO:root:epoch: 66/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.820686 acc: 0.745766 lr: 0.000395\n", + "INFO:root:epoch: 67/100 starts\n", + "INFO:root:epoch: 67/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.832244 acc: 0.743750 lr: 0.000394\n", + "INFO:root:epoch: 67/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.813995 acc: 0.745313 lr: 0.000393\n", + "INFO:root:epoch: 67/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.843474 acc: 0.736458 lr: 0.000392\n", + "INFO:root:epoch: 67/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.814236 acc: 0.746875 lr: 0.000392\n", + "INFO:root:epoch: 67/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.804321 acc: 0.753750 lr: 0.000391\n", + "INFO:root:epoch: 67/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.818051 acc: 0.746354 lr: 0.000390\n", + "INFO:root:epoch: 67/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.814831 acc: 0.750893 lr: 0.000390\n", + "INFO:root:epoch: 67/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.827012 acc: 0.743750 lr: 
0.000389\n", + "INFO:root:epoch: 67/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.823958 acc: 0.745139 lr: 0.000388\n", + "INFO:root:epoch: 67/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.819067 acc: 0.745938 lr: 0.000388\n", + "INFO:root:epoch: 67/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.811314 acc: 0.749148 lr: 0.000387\n", + "INFO:root:epoch: 67/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.803624 acc: 0.752083 lr: 0.000386\n", + "INFO:root:epoch: 67/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.809094 acc: 0.750240 lr: 0.000386\n", + "INFO:root:epoch: 67/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.818883 acc: 0.747545 lr: 0.000385\n", + "INFO:root:epoch: 67/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.821867 acc: 0.746667 lr: 0.000384\n", + "INFO:root:epoch: 67/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.817816 acc: 0.749219 lr: 0.000384\n", + "INFO:root:epoch: 67/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.811076 acc: 0.752206 lr: 0.000383\n", + "INFO:root:epoch: 67/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.804419 acc: 0.753993 lr: 0.000382\n", + "INFO:root:epoch: 67/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.798674 acc: 0.755921 lr: 0.000382\n", + "INFO:root:epoch: 67/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.799432 acc: 0.756562 lr: 0.000381\n", + "INFO:root:epoch: 67/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.796897 acc: 0.756696 lr: 0.000380\n", + "INFO:root:epoch: 67/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.792756 acc: 0.757955 lr: 0.000380\n", + "INFO:root:epoch: 67/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.791548 acc: 0.757880 lr: 0.000379\n", + "INFO:root:epoch: 67/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.793815 acc: 0.757161 lr: 0.000378\n", + "INFO:root:epoch: 67/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.792313 acc: 0.757250 lr: 0.000378\n", + "INFO:root:epoch: 67/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.790128 acc: 0.757212 lr: 0.000377\n", + "INFO:root:epoch: 67/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.792920 acc: 0.757755 lr: 0.000376\n", + "INFO:root:epoch: 67/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.793132 acc: 0.757478 lr: 0.000376\n", + "INFO:root:epoch: 67/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.796018 acc: 0.756573 lr: 0.000375\n", + "INFO:root:epoch: 67/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.799939 acc: 0.755000 lr: 0.000374\n", + "INFO:root:epoch: 67/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.801079 acc: 0.754536 lr: 0.000374\n", + "INFO:root:epoch: 68/100 starts\n", + "INFO:root:epoch: 68/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 0.805098 acc: 0.771875 lr: 0.000373\n", + "INFO:root:epoch: 68/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.794243 acc: 0.753125 lr: 0.000372\n", + "INFO:root:epoch: 68/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.841131 acc: 0.738542 lr: 0.000372\n", + "INFO:root:epoch: 68/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.868295 acc: 0.736719 lr: 0.000371\n", + "INFO:root:epoch: 68/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.864982 acc: 0.738125 lr: 0.000370\n", + "INFO:root:epoch: 68/100 
et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.843730 acc: 0.742188 lr: 0.000370\n", + "INFO:root:epoch: 68/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.833939 acc: 0.745982 lr: 0.000369\n", + "INFO:root:epoch: 68/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.832561 acc: 0.747656 lr: 0.000368\n", + "INFO:root:epoch: 68/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.831154 acc: 0.747917 lr: 0.000368\n", + "INFO:root:epoch: 68/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.831324 acc: 0.748750 lr: 0.000367\n", + "INFO:root:epoch: 68/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.821556 acc: 0.751420 lr: 0.000367\n", + "INFO:root:epoch: 68/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.820977 acc: 0.749479 lr: 0.000366\n", + "INFO:root:epoch: 68/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.814694 acc: 0.750240 lr: 0.000365\n", + "INFO:root:epoch: 68/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.820349 acc: 0.748661 lr: 0.000365\n", + "INFO:root:epoch: 68/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.825379 acc: 0.747708 lr: 0.000364\n", + "INFO:root:epoch: 68/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.822750 acc: 0.748828 lr: 0.000363\n", + "INFO:root:epoch: 68/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.825821 acc: 0.747426 lr: 0.000363\n", + "INFO:root:epoch: 68/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.823634 acc: 0.748958 lr: 0.000362\n", + "INFO:root:epoch: 68/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.823277 acc: 0.747862 lr: 0.000361\n", + "INFO:root:epoch: 68/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.820041 acc: 0.748437 lr: 0.000361\n", + "INFO:root:epoch: 68/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.820466 acc: 0.748512 lr: 0.000360\n", + "INFO:root:epoch: 68/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.820309 acc: 0.748864 lr: 0.000360\n", + "INFO:root:epoch: 68/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.825311 acc: 0.747962 lr: 0.000359\n", + "INFO:root:epoch: 68/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.827428 acc: 0.747396 lr: 0.000358\n", + "INFO:root:epoch: 68/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.829084 acc: 0.746625 lr: 0.000358\n", + "INFO:root:epoch: 68/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.827185 acc: 0.747236 lr: 0.000357\n", + "INFO:root:epoch: 68/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.822909 acc: 0.748264 lr: 0.000356\n", + "INFO:root:epoch: 68/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.820255 acc: 0.748437 lr: 0.000356\n", + "INFO:root:epoch: 68/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.824639 acc: 0.747737 lr: 0.000355\n", + "INFO:root:epoch: 68/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.827008 acc: 0.746979 lr: 0.000355\n", + "INFO:root:epoch: 68/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.825204 acc: 0.747581 lr: 0.000354\n", + "INFO:root:epoch: 69/100 starts\n", + "INFO:root:epoch: 69/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.838380 acc: 0.721875 lr: 0.000353\n", + "INFO:root:epoch: 69/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.807019 acc: 0.742188 lr: 0.000353\n", + "INFO:root:epoch: 69/100 et: 1s eta: 12s batches: 30/313(9%) 
samples: 960 loss: 0.810526 acc: 0.729167 lr: 0.000352\n",
+ "INFO:root:epoch: 69/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.828244 acc: 0.742641 lr: 0.000335\n",
+ "INFO:root:epoch: 70/100 starts\n",
+ "INFO:root:epoch: 70/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.815202 acc: 0.750302 lr: 0.000318\n",
+ "INFO:root:epoch: 71/100 starts\n",
+ "INFO:root:epoch: 71/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.794765 acc: 0.754738 lr: 0.000301\n",
+ "INFO:root:epoch: 72/100 starts\n",
+ "INFO:root:epoch: 72/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.806513 acc: 0.752016 lr: 0.000285\n",
+ "INFO:root:epoch: 73/100 starts\n",
+ "INFO:root:epoch: 73/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.805572 acc: 0.746169 lr: 0.000270\n",
+ "INFO:root:epoch: 74/100 starts\n",
+ "INFO:root:epoch: 74/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.792609 acc: 0.757661 lr: 0.000256\n",
+ "INFO:root:epoch: 75/100 starts\n",
+ "INFO:root:epoch: 75/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.825898 acc: 0.749395 lr: 0.000242\n",
+ "INFO:root:epoch: 76/100 starts\n",
+ "INFO:root:epoch: 76/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.789158 acc: 0.756754 lr: 0.000229\n",
+ "INFO:root:epoch: 77/100 starts\n",
+ "INFO:root:epoch: 77/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.797562 acc: 0.754435 lr: 0.000217\n",
+ "INFO:root:epoch: 78/100 starts\n",
+ "INFO:root:epoch: 78/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.799783 acc: 0.749194 lr: 0.000206\n",
+ "INFO:root:epoch: 79/100 starts\n",
+ "INFO:root:epoch: 79/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.817498 acc: 0.743347 lr: 0.000195\n",
+ "INFO:root:epoch: 80/100 starts\n",
+ "INFO:root:epoch: 80/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.797460 acc: 0.750907 lr: 0.000185\n",
+ "INFO:root:epoch: 81/100 starts\n",
+ "INFO:root:epoch: 81/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.791139 acc: 0.755847 lr: 0.000175\n",
+ "INFO:root:epoch: 82/100 starts\n",
+ "INFO:root:epoch: 82/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.797615 acc: 0.748286 lr: 0.000166\n",
+ "INFO:root:epoch: 83/100 starts\n",
+ "INFO:root:epoch: 83/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.794402 acc: 0.750806 lr: 0.000157\n",
+ "INFO:root:epoch: 84/100 starts\n",
+ "INFO:root:epoch: 84/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.792698 acc: 0.751915 lr: 0.000149\n",
+ "INFO:root:epoch: 85/100 starts\n",
+ "INFO:root:epoch: 85/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.798361 acc: 0.756048 lr: 0.000141\n",
+ "INFO:root:epoch: 86/100 starts\n",
+ "INFO:root:epoch: 86/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.796616 acc: 0.748488 lr: 0.000133\n",
+ "INFO:root:epoch: 87/100 starts\n",
+ "INFO:root:epoch: 87/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.781761 acc: 0.756552 lr: 0.000126\n",
+ "INFO:root:epoch: 88/100 starts\n",
+ "INFO:root:epoch: 88/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.765169 acc: 0.762109 lr: 0.000125\n",
+ "INFO:root:epoch: 88/100 et: 4s eta: 10s 
batches: 90/313(28%) samples: 2880 loss: 0.775203 acc: 0.760417 lr: 0.000124\n", + "INFO:root:epoch: 88/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.774713 acc: 0.759062 lr: 0.000124\n", + "INFO:root:epoch: 88/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.773177 acc: 0.760795 lr: 0.000124\n", + "INFO:root:epoch: 88/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.773849 acc: 0.760156 lr: 0.000124\n", + "INFO:root:epoch: 88/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.770292 acc: 0.761058 lr: 0.000123\n", + "INFO:root:epoch: 88/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.771836 acc: 0.759375 lr: 0.000123\n", + "INFO:root:epoch: 88/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.770684 acc: 0.760625 lr: 0.000123\n", + "INFO:root:epoch: 88/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.766468 acc: 0.762500 lr: 0.000123\n", + "INFO:root:epoch: 88/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.770944 acc: 0.760662 lr: 0.000123\n", + "INFO:root:epoch: 88/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.769675 acc: 0.760937 lr: 0.000122\n", + "INFO:root:epoch: 88/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.766954 acc: 0.762171 lr: 0.000122\n", + "INFO:root:epoch: 88/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.769263 acc: 0.760937 lr: 0.000122\n", + "INFO:root:epoch: 88/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.777239 acc: 0.757589 lr: 0.000122\n", + "INFO:root:epoch: 88/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.776881 acc: 0.757812 lr: 0.000122\n", + "INFO:root:epoch: 88/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.777944 acc: 0.756522 lr: 0.000121\n", + "INFO:root:epoch: 88/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.777051 acc: 0.756510 lr: 0.000121\n", + "INFO:root:epoch: 88/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.778403 acc: 0.756375 lr: 0.000121\n", + "INFO:root:epoch: 88/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.778209 acc: 0.756971 lr: 0.000121\n", + "INFO:root:epoch: 88/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.775398 acc: 0.757292 lr: 0.000120\n", + "INFO:root:epoch: 88/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.775242 acc: 0.757143 lr: 0.000120\n", + "INFO:root:epoch: 88/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.775968 acc: 0.757543 lr: 0.000120\n", + "INFO:root:epoch: 88/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.775583 acc: 0.757083 lr: 0.000120\n", + "INFO:root:epoch: 88/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.773207 acc: 0.757258 lr: 0.000120\n", + "INFO:root:epoch: 89/100 starts\n", + "INFO:root:epoch: 89/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.734807 acc: 0.753125 lr: 0.000119\n", + "INFO:root:epoch: 89/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.776515 acc: 0.748438 lr: 0.000119\n", + "INFO:root:epoch: 89/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.785510 acc: 0.744792 lr: 0.000119\n", + "INFO:root:epoch: 89/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.772257 acc: 0.761719 lr: 0.000119\n", + "INFO:root:epoch: 89/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.781800 acc: 0.763750 lr: 0.000119\n", + "INFO:root:epoch: 89/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 
0.783931 acc: 0.761458 lr: 0.000118\n", + "INFO:root:epoch: 89/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.777696 acc: 0.765179 lr: 0.000118\n", + "INFO:root:epoch: 89/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.776828 acc: 0.763672 lr: 0.000118\n", + "INFO:root:epoch: 89/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.778528 acc: 0.764931 lr: 0.000118\n", + "INFO:root:epoch: 89/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.782296 acc: 0.764375 lr: 0.000118\n", + "INFO:root:epoch: 89/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.779648 acc: 0.763068 lr: 0.000117\n", + "INFO:root:epoch: 89/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.781904 acc: 0.761719 lr: 0.000117\n", + "INFO:root:epoch: 89/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.783871 acc: 0.761538 lr: 0.000117\n", + "INFO:root:epoch: 89/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.780222 acc: 0.762277 lr: 0.000117\n", + "INFO:root:epoch: 89/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.790571 acc: 0.760000 lr: 0.000117\n", + "INFO:root:epoch: 89/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.788741 acc: 0.760352 lr: 0.000116\n", + "INFO:root:epoch: 89/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.786166 acc: 0.759743 lr: 0.000116\n", + "INFO:root:epoch: 89/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.783236 acc: 0.761806 lr: 0.000116\n", + "INFO:root:epoch: 89/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.789045 acc: 0.760855 lr: 0.000116\n", + "INFO:root:epoch: 89/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.788909 acc: 0.761719 lr: 0.000116\n", + "INFO:root:epoch: 89/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.787145 acc: 0.762798 lr: 0.000115\n", + "INFO:root:epoch: 89/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.786296 acc: 0.762926 lr: 0.000115\n", + "INFO:root:epoch: 89/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.787380 acc: 0.762500 lr: 0.000115\n", + "INFO:root:epoch: 89/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.791102 acc: 0.761198 lr: 0.000115\n", + "INFO:root:epoch: 89/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.787385 acc: 0.762750 lr: 0.000115\n", + "INFO:root:epoch: 89/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.787774 acc: 0.761058 lr: 0.000114\n", + "INFO:root:epoch: 89/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.787381 acc: 0.760301 lr: 0.000114\n", + "INFO:root:epoch: 89/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.784104 acc: 0.761272 lr: 0.000114\n", + "INFO:root:epoch: 89/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.783240 acc: 0.760560 lr: 0.000114\n", + "INFO:root:epoch: 89/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.784704 acc: 0.760313 lr: 0.000114\n", + "INFO:root:epoch: 89/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.783017 acc: 0.761391 lr: 0.000113\n", + "INFO:root:epoch: 90/100 starts\n", + "INFO:root:epoch: 90/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.997723 acc: 0.671875 lr: 0.000113\n", + "INFO:root:epoch: 90/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.891037 acc: 0.723437 lr: 0.000113\n", + "INFO:root:epoch: 90/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.856110 acc: 0.741667 lr: 0.000113\n", 
+ "INFO:root:epoch: 90/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.829870 acc: 0.750781 lr: 0.000112\n", + "INFO:root:epoch: 90/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 0.825318 acc: 0.754375 lr: 0.000112\n", + "INFO:root:epoch: 90/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.819115 acc: 0.751562 lr: 0.000112\n", + "INFO:root:epoch: 90/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.821462 acc: 0.746875 lr: 0.000112\n", + "INFO:root:epoch: 90/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.810009 acc: 0.751563 lr: 0.000112\n", + "INFO:root:epoch: 90/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.805757 acc: 0.753125 lr: 0.000112\n", + "INFO:root:epoch: 90/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.792173 acc: 0.755937 lr: 0.000111\n", + "INFO:root:epoch: 90/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.797606 acc: 0.755398 lr: 0.000111\n", + "INFO:root:epoch: 90/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.788231 acc: 0.758073 lr: 0.000111\n", + "INFO:root:epoch: 90/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.790362 acc: 0.757692 lr: 0.000111\n", + "INFO:root:epoch: 90/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.787044 acc: 0.758482 lr: 0.000111\n", + "INFO:root:epoch: 90/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.785833 acc: 0.758333 lr: 0.000110\n", + "INFO:root:epoch: 90/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.786919 acc: 0.758594 lr: 0.000110\n", + "INFO:root:epoch: 90/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.795069 acc: 0.755882 lr: 0.000110\n", + "INFO:root:epoch: 90/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.801482 acc: 0.752431 lr: 0.000110\n", + "INFO:root:epoch: 90/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.799135 acc: 0.753618 lr: 0.000110\n", + "INFO:root:epoch: 90/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.802517 acc: 0.752344 lr: 0.000109\n", + "INFO:root:epoch: 90/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.802409 acc: 0.753423 lr: 0.000109\n", + "INFO:root:epoch: 90/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.799748 acc: 0.754119 lr: 0.000109\n", + "INFO:root:epoch: 90/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.799636 acc: 0.754348 lr: 0.000109\n", + "INFO:root:epoch: 90/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.797934 acc: 0.754036 lr: 0.000109\n", + "INFO:root:epoch: 90/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.796901 acc: 0.754625 lr: 0.000108\n", + "INFO:root:epoch: 90/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.796197 acc: 0.753966 lr: 0.000108\n", + "INFO:root:epoch: 90/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.795277 acc: 0.754514 lr: 0.000108\n", + "INFO:root:epoch: 90/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.794153 acc: 0.754911 lr: 0.000108\n", + "INFO:root:epoch: 90/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.797511 acc: 0.754741 lr: 0.000108\n", + "INFO:root:epoch: 90/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.798095 acc: 0.755729 lr: 0.000108\n", + "INFO:root:epoch: 90/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.797423 acc: 0.755645 lr: 0.000107\n", + "INFO:root:epoch: 91/100 starts\n", + "INFO:root:epoch: 91/100 et: 0s 
eta: 13s batches: 10/313(3%) samples: 320 loss: 0.880258 acc: 0.734375 lr: 0.000107\n", + "INFO:root:epoch: 91/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.863308 acc: 0.737500 lr: 0.000107\n", + "INFO:root:epoch: 91/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.802331 acc: 0.753125 lr: 0.000107\n", + "INFO:root:epoch: 91/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.790554 acc: 0.753906 lr: 0.000107\n", + "INFO:root:epoch: 91/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.782695 acc: 0.761250 lr: 0.000106\n", + "INFO:root:epoch: 91/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.794129 acc: 0.754687 lr: 0.000106\n", + "INFO:root:epoch: 91/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.807174 acc: 0.752679 lr: 0.000106\n", + "INFO:root:epoch: 91/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.798740 acc: 0.752344 lr: 0.000106\n", + "INFO:root:epoch: 91/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.797484 acc: 0.755208 lr: 0.000106\n", + "INFO:root:epoch: 91/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.804903 acc: 0.753438 lr: 0.000105\n", + "INFO:root:epoch: 91/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.810278 acc: 0.753125 lr: 0.000105\n", + "INFO:root:epoch: 91/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.802937 acc: 0.754688 lr: 0.000105\n", + "INFO:root:epoch: 91/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.805282 acc: 0.751923 lr: 0.000105\n", + "INFO:root:epoch: 91/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.802206 acc: 0.753348 lr: 0.000105\n", + "INFO:root:epoch: 91/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.811627 acc: 0.749583 lr: 0.000105\n", + "INFO:root:epoch: 91/100 et: 7s eta: 6s batches: 160/313(51%) samples: 5120 loss: 0.806510 acc: 0.751367 lr: 0.000104\n", + "INFO:root:epoch: 91/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.804635 acc: 0.752390 lr: 0.000104\n", + "INFO:root:epoch: 91/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.801146 acc: 0.753993 lr: 0.000104\n", + "INFO:root:epoch: 91/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.805238 acc: 0.753454 lr: 0.000104\n", + "INFO:root:epoch: 91/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.803522 acc: 0.752500 lr: 0.000104\n", + "INFO:root:epoch: 91/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.798943 acc: 0.753720 lr: 0.000103\n", + "INFO:root:epoch: 91/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.803901 acc: 0.752131 lr: 0.000103\n", + "INFO:root:epoch: 91/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.803263 acc: 0.752717 lr: 0.000103\n", + "INFO:root:epoch: 91/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.800978 acc: 0.753255 lr: 0.000103\n", + "INFO:root:epoch: 91/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.800123 acc: 0.754125 lr: 0.000103\n", + "INFO:root:epoch: 91/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.799911 acc: 0.754687 lr: 0.000103\n", + "INFO:root:epoch: 91/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.800873 acc: 0.754745 lr: 0.000102\n", + "INFO:root:epoch: 91/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.797758 acc: 0.755246 lr: 0.000102\n", + "INFO:root:epoch: 91/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.795045 acc: 0.755603 lr: 
0.000102\n", + "INFO:root:epoch: 91/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.794688 acc: 0.756354 lr: 0.000102\n", + "INFO:root:epoch: 91/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.795100 acc: 0.755948 lr: 0.000102\n", + "INFO:root:epoch: 92/100 starts\n", + "INFO:root:epoch: 92/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.690164 acc: 0.806250 lr: 0.000101\n", + "INFO:root:epoch: 92/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.703724 acc: 0.804688 lr: 0.000101\n", + "INFO:root:epoch: 92/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.741244 acc: 0.785417 lr: 0.000101\n", + "INFO:root:epoch: 92/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.774958 acc: 0.772656 lr: 0.000101\n", + "INFO:root:epoch: 92/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 0.798213 acc: 0.758750 lr: 0.000101\n", + "INFO:root:epoch: 92/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.797358 acc: 0.758333 lr: 0.000101\n", + "INFO:root:epoch: 92/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.792055 acc: 0.756696 lr: 0.000100\n", + "INFO:root:epoch: 92/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.790635 acc: 0.756250 lr: 0.000100\n", + "INFO:root:epoch: 92/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.781314 acc: 0.757986 lr: 0.000100\n", + "INFO:root:epoch: 92/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.786877 acc: 0.753437 lr: 0.000100\n", + "INFO:root:epoch: 92/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.793163 acc: 0.749432 lr: 0.000100\n", + "INFO:root:epoch: 92/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.791653 acc: 0.748958 lr: 0.000100\n", + "INFO:root:epoch: 92/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.784631 acc: 0.750000 lr: 0.000099\n", + "INFO:root:epoch: 92/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.792362 acc: 0.748661 lr: 0.000099\n", + "INFO:root:epoch: 92/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.794085 acc: 0.748125 lr: 0.000099\n", + "INFO:root:epoch: 92/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.801355 acc: 0.746680 lr: 0.000099\n", + "INFO:root:epoch: 92/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.794676 acc: 0.748346 lr: 0.000099\n", + "INFO:root:epoch: 92/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.794705 acc: 0.748785 lr: 0.000099\n", + "INFO:root:epoch: 92/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.794772 acc: 0.748684 lr: 0.000098\n", + "INFO:root:epoch: 92/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.795025 acc: 0.749063 lr: 0.000098\n", + "INFO:root:epoch: 92/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.792504 acc: 0.749554 lr: 0.000098\n", + "INFO:root:epoch: 92/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.789176 acc: 0.750852 lr: 0.000098\n", + "INFO:root:epoch: 92/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.787420 acc: 0.752717 lr: 0.000098\n", + "INFO:root:epoch: 92/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.790530 acc: 0.751693 lr: 0.000097\n", + "INFO:root:epoch: 92/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.789574 acc: 0.751500 lr: 0.000097\n", + "INFO:root:epoch: 92/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.790968 acc: 0.752404 lr: 0.000097\n", + "INFO:root:epoch: 92/100 et: 
12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.791330 acc: 0.752662 lr: 0.000097\n", + "INFO:root:epoch: 92/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.788709 acc: 0.752902 lr: 0.000097\n", + "INFO:root:epoch: 92/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.788136 acc: 0.753556 lr: 0.000097\n", + "INFO:root:epoch: 92/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.783615 acc: 0.754583 lr: 0.000096\n", + "INFO:root:epoch: 92/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.785232 acc: 0.753831 lr: 0.000096\n", + "INFO:root:epoch: 93/100 starts\n", + "INFO:root:epoch: 93/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 0.731577 acc: 0.753125 lr: 0.000096\n", + "INFO:root:epoch: 93/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.806961 acc: 0.728125 lr: 0.000096\n", + "INFO:root:epoch: 93/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.780273 acc: 0.742708 lr: 0.000096\n", + "INFO:root:epoch: 93/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.794445 acc: 0.740625 lr: 0.000096\n", + "INFO:root:epoch: 93/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.785886 acc: 0.748750 lr: 0.000095\n", + "INFO:root:epoch: 93/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.808928 acc: 0.743750 lr: 0.000095\n", + "INFO:root:epoch: 93/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.807731 acc: 0.746429 lr: 0.000095\n", + "INFO:root:epoch: 93/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.809243 acc: 0.746094 lr: 0.000095\n", + "INFO:root:epoch: 93/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.801031 acc: 0.748264 lr: 0.000095\n", + "INFO:root:epoch: 93/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.806838 acc: 0.747187 lr: 0.000095\n", + "INFO:root:epoch: 93/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.808934 acc: 0.746023 lr: 0.000094\n", + "INFO:root:epoch: 93/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.813034 acc: 0.742187 lr: 0.000094\n", + "INFO:root:epoch: 93/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.805116 acc: 0.744471 lr: 0.000094\n", + "INFO:root:epoch: 93/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.810833 acc: 0.743304 lr: 0.000094\n", + "INFO:root:epoch: 93/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.805423 acc: 0.745000 lr: 0.000094\n", + "INFO:root:epoch: 93/100 et: 7s eta: 6s batches: 160/313(51%) samples: 5120 loss: 0.805258 acc: 0.746484 lr: 0.000094\n", + "INFO:root:epoch: 93/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.799348 acc: 0.748529 lr: 0.000093\n", + "INFO:root:epoch: 93/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.798730 acc: 0.748958 lr: 0.000093\n", + "INFO:root:epoch: 93/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.799672 acc: 0.749507 lr: 0.000093\n", + "INFO:root:epoch: 93/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.808083 acc: 0.748437 lr: 0.000093\n", + "INFO:root:epoch: 93/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.808896 acc: 0.748214 lr: 0.000093\n", + "INFO:root:epoch: 93/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.801298 acc: 0.750284 lr: 0.000093\n", + "INFO:root:epoch: 93/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.797310 acc: 0.752446 lr: 0.000093\n", + "INFO:root:epoch: 93/100 et: 11s eta: 3s batches: 240/313(76%) samples: 
7680 loss: 0.798673 acc: 0.751823 lr: 0.000092\n", + "INFO:root:epoch: 93/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.801596 acc: 0.751750 lr: 0.000092\n", + "INFO:root:epoch: 93/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.799826 acc: 0.752163 lr: 0.000092\n", + "INFO:root:epoch: 93/100 et: 12s eta: 2s batches: 270/313(86%) samples: 8640 loss: 0.796523 acc: 0.751968 lr: 0.000092\n", + "INFO:root:epoch: 93/100 et: 13s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.796352 acc: 0.751228 lr: 0.000092\n", + "INFO:root:epoch: 93/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.799463 acc: 0.750108 lr: 0.000092\n", + "INFO:root:epoch: 93/100 et: 14s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.800424 acc: 0.749479 lr: 0.000091\n", + "INFO:root:epoch: 93/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.797798 acc: 0.750706 lr: 0.000091\n", + "INFO:root:epoch: 94/100 starts\n", + "INFO:root:epoch: 94/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 0.803447 acc: 0.762500 lr: 0.000091\n", + "INFO:root:epoch: 94/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.833519 acc: 0.745312 lr: 0.000091\n", + "INFO:root:epoch: 94/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.840047 acc: 0.743750 lr: 0.000091\n", + "INFO:root:epoch: 94/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.841335 acc: 0.739844 lr: 0.000091\n", + "INFO:root:epoch: 94/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.821914 acc: 0.746250 lr: 0.000090\n", + "INFO:root:epoch: 94/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.825954 acc: 0.744792 lr: 0.000090\n", + "INFO:root:epoch: 94/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.844646 acc: 0.739286 lr: 0.000090\n", + "INFO:root:epoch: 94/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.839375 acc: 0.737109 lr: 0.000090\n", + "INFO:root:epoch: 94/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.827256 acc: 0.740972 lr: 0.000090\n", + "INFO:root:epoch: 94/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.824656 acc: 0.745937 lr: 0.000090\n", + "INFO:root:epoch: 94/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.822932 acc: 0.746023 lr: 0.000089\n", + "INFO:root:epoch: 94/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.821938 acc: 0.747135 lr: 0.000089\n", + "INFO:root:epoch: 94/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.809640 acc: 0.751683 lr: 0.000089\n", + "INFO:root:epoch: 94/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.803667 acc: 0.754018 lr: 0.000089\n", + "INFO:root:epoch: 94/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.806155 acc: 0.752917 lr: 0.000089\n", + "INFO:root:epoch: 94/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.808421 acc: 0.752930 lr: 0.000089\n", + "INFO:root:epoch: 94/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.803617 acc: 0.754779 lr: 0.000089\n", + "INFO:root:epoch: 94/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.801626 acc: 0.754514 lr: 0.000088\n", + "INFO:root:epoch: 94/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.798918 acc: 0.754112 lr: 0.000088\n", + "INFO:root:epoch: 94/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.797969 acc: 0.755000 lr: 0.000088\n", + "INFO:root:epoch: 94/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.797693 acc: 0.754762 lr: 
0.000088\n", + "INFO:root:epoch: 94/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.800783 acc: 0.753267 lr: 0.000088\n", + "INFO:root:epoch: 94/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.798359 acc: 0.754891 lr: 0.000088\n", + "INFO:root:epoch: 94/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.800326 acc: 0.754167 lr: 0.000087\n", + "INFO:root:epoch: 94/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.789442 acc: 0.758000 lr: 0.000087\n", + "INFO:root:epoch: 94/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.787871 acc: 0.758413 lr: 0.000087\n", + "INFO:root:epoch: 94/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.784470 acc: 0.759722 lr: 0.000087\n", + "INFO:root:epoch: 94/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.783457 acc: 0.759710 lr: 0.000087\n", + "INFO:root:epoch: 94/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.782562 acc: 0.759698 lr: 0.000087\n", + "INFO:root:epoch: 94/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.782705 acc: 0.759375 lr: 0.000087\n", + "INFO:root:epoch: 94/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.778999 acc: 0.760282 lr: 0.000086\n", + "INFO:root:epoch: 95/100 starts\n", + "INFO:root:epoch: 95/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.766231 acc: 0.762500 lr: 0.000086\n", + "INFO:root:epoch: 95/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.751542 acc: 0.765625 lr: 0.000086\n", + "INFO:root:epoch: 95/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.773591 acc: 0.756250 lr: 0.000086\n", + "INFO:root:epoch: 95/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.763280 acc: 0.757031 lr: 0.000086\n", + "INFO:root:epoch: 95/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.771654 acc: 0.757500 lr: 0.000086\n", + "INFO:root:epoch: 95/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.772644 acc: 0.754687 lr: 0.000085\n", + "INFO:root:epoch: 95/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.768126 acc: 0.758036 lr: 0.000085\n", + "INFO:root:epoch: 95/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.781850 acc: 0.756641 lr: 0.000085\n", + "INFO:root:epoch: 95/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.798895 acc: 0.751389 lr: 0.000085\n", + "INFO:root:epoch: 95/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.796311 acc: 0.752812 lr: 0.000085\n", + "INFO:root:epoch: 95/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.796165 acc: 0.752841 lr: 0.000085\n", + "INFO:root:epoch: 95/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.786548 acc: 0.754687 lr: 0.000085\n", + "INFO:root:epoch: 95/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.788553 acc: 0.756250 lr: 0.000084\n", + "INFO:root:epoch: 95/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.786640 acc: 0.757589 lr: 0.000084\n", + "INFO:root:epoch: 95/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.790195 acc: 0.756875 lr: 0.000084\n", + "INFO:root:epoch: 95/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.790926 acc: 0.757227 lr: 0.000084\n", + "INFO:root:epoch: 95/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.792668 acc: 0.755699 lr: 0.000084\n", + "INFO:root:epoch: 95/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.796034 acc: 0.754167 lr: 0.000084\n", + "INFO:root:epoch: 95/100 
et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.796901 acc: 0.753289 lr: 0.000084\n", + "INFO:root:epoch: 95/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.793658 acc: 0.754219 lr: 0.000083\n", + "INFO:root:epoch: 95/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.790697 acc: 0.755357 lr: 0.000083\n", + "INFO:root:epoch: 95/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.791174 acc: 0.755540 lr: 0.000083\n", + "INFO:root:epoch: 95/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.790981 acc: 0.755842 lr: 0.000083\n", + "INFO:root:epoch: 95/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.792551 acc: 0.754948 lr: 0.000083\n", + "INFO:root:epoch: 95/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.790318 acc: 0.755750 lr: 0.000083\n", + "INFO:root:epoch: 95/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.788822 acc: 0.756731 lr: 0.000083\n", + "INFO:root:epoch: 95/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.785852 acc: 0.757755 lr: 0.000082\n", + "INFO:root:epoch: 95/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.785526 acc: 0.757701 lr: 0.000082\n", + "INFO:root:epoch: 95/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.788618 acc: 0.757112 lr: 0.000082\n", + "INFO:root:epoch: 95/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.787243 acc: 0.757708 lr: 0.000082\n", + "INFO:root:epoch: 95/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.786788 acc: 0.758165 lr: 0.000082\n", + "INFO:root:epoch: 96/100 starts\n", + "INFO:root:epoch: 96/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.900184 acc: 0.721875 lr: 0.000082\n", + "INFO:root:epoch: 96/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.817050 acc: 0.726562 lr: 0.000082\n", + "INFO:root:epoch: 96/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.787611 acc: 0.742708 lr: 0.000081\n", + "INFO:root:epoch: 96/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.786342 acc: 0.742187 lr: 0.000081\n", + "INFO:root:epoch: 96/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.785539 acc: 0.748125 lr: 0.000081\n", + "INFO:root:epoch: 96/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.809411 acc: 0.743229 lr: 0.000081\n", + "INFO:root:epoch: 96/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.815168 acc: 0.744643 lr: 0.000081\n", + "INFO:root:epoch: 96/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.808906 acc: 0.747656 lr: 0.000081\n", + "INFO:root:epoch: 96/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.799573 acc: 0.748958 lr: 0.000081\n", + "INFO:root:epoch: 96/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.809905 acc: 0.745937 lr: 0.000080\n", + "INFO:root:epoch: 96/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.815977 acc: 0.744886 lr: 0.000080\n", + "INFO:root:epoch: 96/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.809168 acc: 0.747656 lr: 0.000080\n", + "INFO:root:epoch: 96/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.803039 acc: 0.749038 lr: 0.000080\n", + "INFO:root:epoch: 96/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.803802 acc: 0.750670 lr: 0.000080\n", + "INFO:root:epoch: 96/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.800406 acc: 0.751458 lr: 0.000080\n", + "INFO:root:epoch: 96/100 et: 7s eta: 7s batches: 160/313(51%) 
samples: 5120 loss: 0.801153 acc: 0.751367 lr: 0.000080\n", + "INFO:root:epoch: 96/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.795543 acc: 0.753493 lr: 0.000079\n", + "INFO:root:epoch: 96/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.792251 acc: 0.754340 lr: 0.000079\n", + "INFO:root:epoch: 96/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.795225 acc: 0.754276 lr: 0.000079\n", + "INFO:root:epoch: 96/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.794892 acc: 0.755156 lr: 0.000079\n", + "INFO:root:epoch: 96/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.801911 acc: 0.753571 lr: 0.000079\n", + "INFO:root:epoch: 96/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.804700 acc: 0.753125 lr: 0.000079\n", + "INFO:root:epoch: 96/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.806657 acc: 0.752446 lr: 0.000079\n", + "INFO:root:epoch: 96/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.804139 acc: 0.752604 lr: 0.000078\n", + "INFO:root:epoch: 96/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.802955 acc: 0.753250 lr: 0.000078\n", + "INFO:root:epoch: 96/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.800339 acc: 0.753726 lr: 0.000078\n", + "INFO:root:epoch: 96/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.800201 acc: 0.752894 lr: 0.000078\n", + "INFO:root:epoch: 96/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.800504 acc: 0.751563 lr: 0.000078\n", + "INFO:root:epoch: 96/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.798371 acc: 0.752155 lr: 0.000078\n", + "INFO:root:epoch: 96/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.798875 acc: 0.751354 lr: 0.000078\n", + "INFO:root:epoch: 96/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.798440 acc: 0.751915 lr: 0.000078\n", + "INFO:root:epoch: 97/100 starts\n", + "INFO:root:epoch: 97/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.719486 acc: 0.768750 lr: 0.000077\n", + "INFO:root:epoch: 97/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.778803 acc: 0.753125 lr: 0.000077\n", + "INFO:root:epoch: 97/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.755296 acc: 0.759375 lr: 0.000077\n", + "INFO:root:epoch: 97/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.797087 acc: 0.750781 lr: 0.000077\n", + "INFO:root:epoch: 97/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.783176 acc: 0.753750 lr: 0.000077\n", + "INFO:root:epoch: 97/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.789088 acc: 0.750000 lr: 0.000077\n", + "INFO:root:epoch: 97/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.783504 acc: 0.753125 lr: 0.000077\n", + "INFO:root:epoch: 97/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.795049 acc: 0.750391 lr: 0.000076\n", + "INFO:root:epoch: 97/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.797903 acc: 0.750347 lr: 0.000076\n", + "INFO:root:epoch: 97/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.795908 acc: 0.749375 lr: 0.000076\n", + "INFO:root:epoch: 97/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.791749 acc: 0.748295 lr: 0.000076\n", + "INFO:root:epoch: 97/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.793694 acc: 0.749219 lr: 0.000076\n", + "INFO:root:epoch: 97/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.790152 acc: 
0.750962 lr: 0.000076\n", + "INFO:root:epoch: 97/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.781105 acc: 0.752232 lr: 0.000076\n", + "INFO:root:epoch: 97/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.780573 acc: 0.753333 lr: 0.000076\n", + "INFO:root:epoch: 97/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.778776 acc: 0.753125 lr: 0.000075\n", + "INFO:root:epoch: 97/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.777386 acc: 0.753493 lr: 0.000075\n", + "INFO:root:epoch: 97/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.778101 acc: 0.753646 lr: 0.000075\n", + "INFO:root:epoch: 97/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.782027 acc: 0.752467 lr: 0.000075\n", + "INFO:root:epoch: 97/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.775025 acc: 0.755156 lr: 0.000075\n", + "INFO:root:epoch: 97/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.772303 acc: 0.756101 lr: 0.000075\n", + "INFO:root:epoch: 97/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.768858 acc: 0.757670 lr: 0.000075\n", + "INFO:root:epoch: 97/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.770851 acc: 0.756522 lr: 0.000074\n", + "INFO:root:epoch: 97/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.770891 acc: 0.755990 lr: 0.000074\n", + "INFO:root:epoch: 97/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.773270 acc: 0.755750 lr: 0.000074\n", + "INFO:root:epoch: 97/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.771640 acc: 0.756611 lr: 0.000074\n", + "INFO:root:epoch: 97/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.771903 acc: 0.756019 lr: 0.000074\n", + "INFO:root:epoch: 97/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.777557 acc: 0.754911 lr: 0.000074\n", + "INFO:root:epoch: 97/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.780185 acc: 0.753664 lr: 0.000074\n", + "INFO:root:epoch: 97/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.779205 acc: 0.755417 lr: 0.000074\n", + "INFO:root:epoch: 97/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.780044 acc: 0.756048 lr: 0.000073\n", + "INFO:root:epoch: 98/100 starts\n", + "INFO:root:epoch: 98/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.707994 acc: 0.771875 lr: 0.000073\n", + "INFO:root:epoch: 98/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.712420 acc: 0.775000 lr: 0.000073\n", + "INFO:root:epoch: 98/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.744193 acc: 0.769792 lr: 0.000073\n", + "INFO:root:epoch: 98/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.754451 acc: 0.761719 lr: 0.000073\n", + "INFO:root:epoch: 98/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 0.754195 acc: 0.758125 lr: 0.000073\n", + "INFO:root:epoch: 98/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.764942 acc: 0.757813 lr: 0.000073\n", + "INFO:root:epoch: 98/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.758779 acc: 0.757589 lr: 0.000073\n", + "INFO:root:epoch: 98/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.766650 acc: 0.755078 lr: 0.000072\n", + "INFO:root:epoch: 98/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.757546 acc: 0.757639 lr: 0.000072\n", + "INFO:root:epoch: 98/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.761171 acc: 0.758125 lr: 0.000072\n", + 
"INFO:root:epoch: 98/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.762360 acc: 0.759943 lr: 0.000072\n", + "INFO:root:epoch: 98/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.770996 acc: 0.758594 lr: 0.000072\n", + "INFO:root:epoch: 98/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.768122 acc: 0.759615 lr: 0.000072\n", + "INFO:root:epoch: 98/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.772493 acc: 0.756250 lr: 0.000072\n", + "INFO:root:epoch: 98/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.770627 acc: 0.757500 lr: 0.000072\n", + "INFO:root:epoch: 98/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.768439 acc: 0.757227 lr: 0.000071\n", + "INFO:root:epoch: 98/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.781094 acc: 0.754228 lr: 0.000071\n", + "INFO:root:epoch: 98/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.785662 acc: 0.754340 lr: 0.000071\n", + "INFO:root:epoch: 98/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.783487 acc: 0.755099 lr: 0.000071\n", + "INFO:root:epoch: 98/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.780362 acc: 0.755156 lr: 0.000071\n", + "INFO:root:epoch: 98/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.784881 acc: 0.754613 lr: 0.000071\n", + "INFO:root:epoch: 98/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.782945 acc: 0.756108 lr: 0.000071\n", + "INFO:root:epoch: 98/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.784274 acc: 0.756522 lr: 0.000071\n", + "INFO:root:epoch: 98/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.781904 acc: 0.757292 lr: 0.000070\n", + "INFO:root:epoch: 98/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.778922 acc: 0.758000 lr: 0.000070\n", + "INFO:root:epoch: 98/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.776607 acc: 0.758053 lr: 0.000070\n", + "INFO:root:epoch: 98/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.776517 acc: 0.758102 lr: 0.000070\n", + "INFO:root:epoch: 98/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.776057 acc: 0.758147 lr: 0.000070\n", + "INFO:root:epoch: 98/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.775508 acc: 0.757759 lr: 0.000070\n", + "INFO:root:epoch: 98/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.774023 acc: 0.758854 lr: 0.000070\n", + "INFO:root:epoch: 98/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.771974 acc: 0.759173 lr: 0.000070\n", + "INFO:root:epoch: 99/100 starts\n", + "INFO:root:epoch: 99/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.837768 acc: 0.746875 lr: 0.000069\n", + "INFO:root:epoch: 99/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.823928 acc: 0.743750 lr: 0.000069\n", + "INFO:root:epoch: 99/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.777027 acc: 0.758333 lr: 0.000069\n", + "INFO:root:epoch: 99/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.774453 acc: 0.763281 lr: 0.000069\n", + "INFO:root:epoch: 99/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.784160 acc: 0.758125 lr: 0.000069\n", + "INFO:root:epoch: 99/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.773204 acc: 0.759896 lr: 0.000069\n", + "INFO:root:epoch: 99/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.767902 acc: 0.762500 lr: 0.000069\n", + "INFO:root:epoch: 99/100 et: 3s eta: 10s 
batches: 80/313(25%) samples: 2560 loss: 0.773605 acc: 0.759766 lr: 0.000069\n", + "INFO:root:epoch: 99/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.792748 acc: 0.755903 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.793041 acc: 0.755625 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.783605 acc: 0.757955 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.784180 acc: 0.757292 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.787015 acc: 0.759135 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.784868 acc: 0.758929 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.783041 acc: 0.760000 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.787532 acc: 0.759961 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.780208 acc: 0.763235 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.782567 acc: 0.761632 lr: 0.000067\n", + "INFO:root:epoch: 99/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.782641 acc: 0.760197 lr: 0.000067\n", + "INFO:root:epoch: 99/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.780030 acc: 0.760781 lr: 0.000067\n", + "INFO:root:epoch: 99/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.777667 acc: 0.761310 lr: 0.000067\n", + "INFO:root:epoch: 99/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.778561 acc: 0.760938 lr: 0.000067\n", + "INFO:root:epoch: 99/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.784298 acc: 0.758967 lr: 0.000067\n", + "INFO:root:epoch: 99/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.782546 acc: 0.758854 lr: 0.000067\n", + "INFO:root:epoch: 99/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.782698 acc: 0.758625 lr: 0.000067\n", + "INFO:root:epoch: 99/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.781098 acc: 0.758413 lr: 0.000066\n", + "INFO:root:epoch: 99/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.781811 acc: 0.758796 lr: 0.000066\n", + "INFO:root:epoch: 99/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.778903 acc: 0.760156 lr: 0.000066\n", + "INFO:root:epoch: 99/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.774765 acc: 0.761422 lr: 0.000066\n", + "INFO:root:epoch: 99/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.775532 acc: 0.760313 lr: 0.000066\n", + "INFO:root:epoch: 99/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.778129 acc: 0.759173 lr: 0.000066\n", + "INFO:root:epoch: 100/100 starts\n", + "INFO:root:epoch: 100/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.728071 acc: 0.762500 lr: 0.000066\n", + "INFO:root:epoch: 100/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.806044 acc: 0.764063 lr: 0.000066\n", + "INFO:root:epoch: 100/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.840349 acc: 0.759375 lr: 0.000066\n", + "INFO:root:epoch: 100/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.832785 acc: 0.755469 lr: 0.000065\n", + "INFO:root:epoch: 100/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 
loss: 0.830045 acc: 0.754375 lr: 0.000065\n", + "INFO:root:epoch: 100/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.827730 acc: 0.751042 lr: 0.000065\n", + "INFO:root:epoch: 100/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.814045 acc: 0.758482 lr: 0.000065\n", + "INFO:root:epoch: 100/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.798234 acc: 0.760547 lr: 0.000065\n", + "INFO:root:epoch: 100/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.800798 acc: 0.757986 lr: 0.000065\n", + "INFO:root:epoch: 100/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.792712 acc: 0.757500 lr: 0.000065\n", + "INFO:root:epoch: 100/100 et: 4s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.783494 acc: 0.760227 lr: 0.000065\n", + "INFO:root:epoch: 100/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.777760 acc: 0.761719 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.774788 acc: 0.762981 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.766845 acc: 0.765848 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.765367 acc: 0.767292 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 7s eta: 6s batches: 160/313(51%) samples: 5120 loss: 0.762996 acc: 0.768945 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.766414 acc: 0.768382 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.768261 acc: 0.766319 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.768353 acc: 0.765954 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.769359 acc: 0.765156 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.771308 acc: 0.765030 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.771582 acc: 0.764631 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.776534 acc: 0.762500 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.782496 acc: 0.759635 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.782019 acc: 0.760250 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.780009 acc: 0.761058 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.776966 acc: 0.760880 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.776915 acc: 0.761607 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.773110 acc: 0.763901 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.776147 acc: 0.762917 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.777149 acc: 0.762601 lr: 0.000062\n" + ] + } + ], + "source": [ + "logger = logging.getLogger()\n", + "logger.setLevel(0)\n", + "trainer.fit(train_loader, val_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", 
+ "output_type": "stream", + "text": [ + " epoch train_loss train_acc lr val_loss val_acc\n", + "21 1 4.597061 0.0185 0.003120 4.604409 0.03\n", + "22 2 4.576668 0.0248 0.006250 4.550752 0.03\n", + "23 3 4.429581 0.0333 0.009380 4.132248 0.03\n", + "24 4 3.932376 0.0777 0.010000 3.541716 0.11\n", + "25 5 3.252122 0.1659 0.010000 3.084885 0.17\n", + ".. ... ... ... ... ... ...\n", + "116 96 0.799725 0.7515 0.000077 0.671766 0.78\n", + "117 97 0.781655 0.7553 0.000073 0.714923 0.77\n", + "118 98 0.771329 0.7589 0.000070 0.513621 0.84\n", + "119 99 0.778845 0.7587 0.000066 0.385253 0.89\n", + "120 100 0.777107 0.7624 0.000062 0.483041 0.85\n", + "\n", + "[100 rows x 6 columns]\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "train_result = pd.read_csv('./tdnn_xvec/train.log')\n", + "train_result = train_result[train_result.index>20]\n", + "print(train_result)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA68AAALYCAYAAACTyMQkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAACcVUlEQVR4nOzddZiU1RvG8fvdpbthF2nEAgQFlA4BAUVFRAzs7u6un4otJraioihiIIqigIiKhGIhioAg3V275/fHw7DBxszuzLwT3891zTUweXZ3dmfu9zznOZ5zTgAAAAAAxLIUvwcAAAAAAEBhCK8AAAAAgJhHeAUAAAAAxDzCKwAAAAAg5hFeAQAAAAAxj/AKAAAAAIh5hFcAAAAAQMwjvAIAkprneYd5nveq53nzPM/bsvv0p+d5wz3POzzb7Sp7nrfI87xNnuc1yuexnvE8z3med3W2y87cfVlBpzuDGGfD3bf9JCxfOAAAcaaE3wMAAMAPnuelSnpU0uWSdkiaIOkDSZmS9pV0sqTzPM873Tn3hnNuved550v6VNJLnucd4Zxz2R6vp6QLJX0r6fE8nvILSVPzGc7EsHxRAAAkMMIrACBZ/U8WXGdIOsE5tyD7lZ7nVZZ0o6Qqgcucc+M8z3tZ0tmSLpH01O7bVpL0kqStks5yzmXm8XzjnXMPh//LAAAgOVA2DABIOp7n7SvpWkmrJPXNHVwlyTm33jl3k6Thua66WtJiSQ94ntd492WPSaov6Wbn3F8RG3gIPM/r6nne557nrfU8b6vneb96nne953kl87jtIM/zvvE8b6Xneds8z/vP87xPPc/rlet2PT3PG+953lLP87Z7nrfM87yvPc87KXpfGQAgWRFeAQDJ6EzZe+DzzrmVBd3QObc91//XSzpPUnlJr3ied7RsJvYbSU9GZLQh2h0mv5LUXtJ7koZJ8iQ9KGm053lettteIuldSWmS3pEF8S8lNZN0ZLbbHS1pvKTmkj6S9IishLq6pBMi/kUBAJIeZcMAgGTUYff5V0W5s3PuM8/zXpJ0jqTDJG2RlQu7Au7W2/O8Cvlc95xzbllRxpLb7nLn5yVtk3SYc+6P3ZffImmcpKMlnS7ptd13OUfSEkktnXNbcj1W9Wz/PVvSTkmtnHMrCrgdAAARQXgFACSjOrvP/yvGY9wm6SxJpSXd4pybV8jte+0+5WWMpLCEV0nHSaok6clAcJUk59xOz/NulPSjpDOUFV4la1iVkfuBnHOrc120c/epsNsBABB2lA0DAFA0NynrffSY3d2LC3Kdc87L5/RTGMd18O7zSbmvcM5Nl7Qp220kKxVuKOlXz/Pu8TzvCM/zyuXxuO/ISqV/9TzvYc/z+u2e5QUAICoIrwCAZBSY5axblDt7ntdV0qWSvpf0sqx0+KrwDK3YKu0+X57P9cuz3UaShko6X9IGSbfK1ruu8TxvhOd5tQM3cs69I2mgpPmyr3WspFWe533seV7T8H4JAADsjfAKAEhGgf1We4R6R8/zyssC63ZZ2fBVkhZJusfzvGZhG2HRbdh9Xjuf62tnu42cecE5d6ikWpIGyRoznSpr5KRstx3tnOskqZps7exbu8/HeZ5XKqxfBQAAuRBeAQDJ6FVJmZLO9zyvRkE39DyvdK6LHpTUWNLtzrk5zrkNsu7DZWTdh/1+b/1p93mX3Fd4nneIpArZbpODc26lc+4959wxu2/TJa/S4N3bCI11zp0h6UNJTSUdEJbRAwCQD7/fYAEAiLrde7E+LKmmpLGe5zXIfRvP8yp6nnevrKQ2cFl3SRfLyoUfzfZ4n8tmYztIujyyoy/Uh7KZ1XOzzwR7nldC0gO7//t6tsu75n4Az/PKSqosaZd2N3LyPK9z7nW9u4N6IPxvC+PXAADAXryCu/oDAJCYdgexxyRdJuu2+6Wk32Uzsk1knYErSTrNOTdi9zY3v8g6Fbd2zs3J9XiVJf0qK6ltGeg+7HnemZJekfSFssqVc5vjnBtZyHgbytabLpY0IZ+bTXHOveh53smSRsiaM70jaa2koyQdJOkTSccEtvXxPG+dpHWSfpC0UDaD3Fc2m/qUc+6y3bf7affX/q2kBbJ9Y7tLaiXpE+dc/4LGDwBAcRFeAQBJzfO8wyVdKKmzpDRZVdIiSZMlPe+cm7b7ds9IukjS9c65h/J5rL6SPt19327OOZctvBbkQ+fccYWMs6EsvBbkNefcmbtv303WEfkwWSCdJ5txfdQ5t2e7G8/zLpKF1ZaycLpR0p+yvWJHZAu5g2UNmw6VfZ+2SfpHtuXO8865HYWMDQCAYiG8AgAAAABiHmteAQAAAAAxj/AKAAAAAIh5hFcAAAAAQMwjvAIAAAAAYl4JvwcQiho1ariGDRv6OobNmzerfPnyvo4BCOD1iFjC6xGxhNcjYgmvR8SSWH89zpgxY5VzrmZe18VVeG3YsKGmT5/u6xgmTpyobt26+ToGIIDXI2IJr0fEEl6PiCW8HhFLYv316Hnewvyuo2wYAAAAABDzCK8AAAAAgJhHeAUAAAAAxDzCKwAAAAAg
+      "text/plain": [
+       "[matplotlib figure]"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "image/png": "[base64-encoded PNG elided: matplotlib figure]",
d2LLlklXvge8IJEnwFEvg2bGg+W9mqlZwsOnTI2uMAwLvvygnA//s/79vUrSsnbQI58efNrl1AdrYE3gcO2LdfQ1oa0Lmz7+kllQADXyIiMm/PHvmn7OkAhGv5xq4NG4DNm4HkZPP3SUqSwHTxYmuPlZpq7iC/si1p5L6UkaFbNwmoVq/2ff9AGlsZjOc6XH+/6emSvWzfXuZSBjJXMhKB77JlUjo8fnz5uabt2gGjRkmTKytzWPPzgSVLzGUzlZLnzexavjk5Mt5WraTSwt97yG7ugW+NGhJ8WX3NUlOBwYO9Z2DdBTr15tgxyZCfdx7QtKnvbZOS5LPPrpNFrs9JKJZK8nTCuhJi4EtEROb5micU7LwqipyUFLkcM8b8fU49VQ5ErWQ98vLMH+RXxsDXfX4vIEuzdOzoO1g4fFhOXgTS2AoI73NdXCxl20bgCwSW9XUNfMO1fvjUqZJxvfxyz7ePHw/s3l3692TGsmUSPPmb826wsqTRwoUyVeC116SLerizvjt2lHZZNljt7Lx7N7Bxo/nnBwi82eI330inem9NrVwlJcnrtmyZtcfwxnhO4uPt7xh97Jic2GTgy8CXiIgs8BX4MuMbu1JSpHzTU+DlTf36QO/e1g7SrBzkV6bAt7hYfk9vz7+/YCGYxlZAeJ/rzExZ89sodQYCC3yNrGd2dnhOuB06BHzxBXDppTI/2ZNRo+S5nDrV/H6Nv5/Bg81t37GjBL5mgn2HQwKpMWOkEdinn8r85HDZsQNo3lxKhw0JCRKEmW22ZZxYC0fgO2UK0KULcNpp/rc1Xi+7yp3T0uS5GjTI/sB33Tp5vzDwZeBLREQWpKVJCZj72oaAHGzs2hW+7AvZ48ABWZbISpmzITFRSigLCsxtbxwkmjnIb9JEDpgrQ+C7a5eUjfsKfDdu9F5WGUxjKyC8ga+RrXTN+FrtzKy1BL6BNksKxGefyXx2X9nA+HiZ6/vLL/6bkRkcDvk9Gjc2t33HjjKOPXv8b2tMK6hdW8adkxPeJlc7dnhu1qa1BGNmpKZKlr1PH/OPG0ipc1qaZMjHjTO3ZFLjxkCPHvYFqUYpcmKidL0/fNie/Rr7Bhj4goEvEVH0ysmRZiALF0Z6JKXS0kqb7bhr2VLmtoWiMQeFzpw5Ug4ZSOCblCSZmxUr/G+rtaxz2qOHuYP8uDigdevKEfh6W8rIkJAggfHGjZ5vX7FCnqsWLQJ7/Lp1JYsZ7sC3RQtp5mU145udLZ+PY8fKz6EOfI2mVn36+C8nv+46CZzefdf/fpculcDObLdiwHxn52PHZP9GpnTwYJkvHs5yZ2+BL2D+NXM4ZFqFe9M3X+rWBerUsZbx/fBDOdF29dXm75OUJCcNi4rM38eTkhJpapWQEJqlktLS5Hfr1Mm+fcYoBr5ERNFq40aZt2f3WoGBcv3n7Emg5WUUWSkpEoAEMj/UOGA3U+43fbqcxLnxRvP7ryxr+ZoJfAHvwUIwja0M4Xqut2yRrt7t2snJjbZtrQe+RpnzgAHy3g1106YlS4BVqzw3tXLXpo2s6/vBB76bXH3+uQQ5zZsD99xjfixm1/JdulQqMYzAVynJZi5ZAvzzj/nHC4anwLdTJ5lvbCbwzc6W591KmbPB6tSbBQuk03yTJubvk5gomdlgn89t2ySLn5AQmqWS0tLkpEd8vH37jFEMfImIopVRpmWmpC0ctm+X7qsMfCuOggJZe3T0aPMdU101bSpVCf4O0o4cAe68U+YEM/Atzwj8PHV1BmTeYXy852Dh8GE5SRZoYytDuJ7r9HT5PY0MXiBLGrlmja02SwrE1KlSLnzZZea2Hz9ePrdnzCh/W0kJ8Oij0iDr5JMlQDWCWTPat5cg1l/gm5oq27lOK7jqqvA1uTpyRBpFuQe+8fEShJl5zRYskGy7lYy4wUrgW1go8+StnjyycuLPF+PETUKCZKr79rU/8GWZMwAGvkRE0cv4p717d2THYfA3T4iBb+xJTZWS0UDKnA1JSbL8RnGx922eeQbYuRN46y1rWQcjGKvo88YzMoBGjeSg15Nq1YCuXT0HC3/9FVxjK0M4A1+jXBcILvA97jj5PFqzRgLKUDh4EPjySwl6/S3BZRg5UoJ79wDzyBFpMvXcczIXeO5caxlGQJYEat3a/5JGxtzhhg1Lr2vUCLjoImDatMCWkLJi5065dA98AfMnK1JT5QTJySdbf3wrqwysXSvz562ePGrTRt7LwQapxnPRo4dcJiXJCZG8vOD2C0hTtn//ZeDrxMCXiChaRWvga/xzdmfML+SSRrEjJUUyQGecEfg+kpLk4Mrbgey6dbKcyjXXyFw9K9q0kQPS/fsDH18s2L7df0dtb8FCsI2tDG3ayN+ulTVoA+Ep8DU6PVvZR/PmcqIgIUHms5ptJmXVtGmyfzNL3BiqVJHA9tdfpYMxIMH94MGSBZ4wQbLIrt2OrTA6O3tTWCjTCjyVCI8fL1UCX30V2GOb5b6Gr6uEBAnGDh3yvQ+HQ8rZa9a0/vhGxtfMSTOjR0Egf0OJiRKgB3NyLi0N6NCh9MSKnUslrVkjlwx8ATDwJSKKXkYAGU2Bb7t23pfyqFFDMgrM+MYGrYGZM4Fhw6SMM1BGuZ+nrIfWwC23SIDy0kvW911ZljTytoavq4QECXbcl6NZsUKep+bNgxtDmzbyeoXy8+bwYWDfvvKBr9bWXuMtW0r3YTTbC0W5s9HUql8/60HRtddKAPzeexKEDhggwfmsWcAdd5jrHOyNv7V8//pL3ieeAt9TT5WTl6Eud/YX+AKlQZknublyUieQMmdAAt+8PMnY+7N8ufxfC6T5U1KSnJhbv976fQ3upciDBsmlHeXO7OhcBgNfIqJoZQSQ0TLH18w8Ia7lGzvWrpUD8WDKnAE5GdK+veeDtK++AubNA55/XuYDW1UZAl+tzQe+gLxuruxobAWE57k2gjXXOa3GvGYrSxq5Zo27d5fLUAS+ixbJfq1kew2tW8v6uZMnA0OHSmC1eLGUQQerY0f5nPW2Fq4x59RT0Gg0uVq2TALkUDHeR61bl7/NTGfnJUukW3Igja0Aa0sarVgh82oD6XNgjC/QILWgQIJm1/+tjRvLz3asEZyWJicevfUPqGQY+BIRRSsjgNy3L/jlEoJVWCglq/4CXyvzqiiyUlLkcsyY4PeVlFS+3O/wYeDuu+WActy4wPZbGQLfrCzJbpkNfF2DhZwcexpbAeENfN0zvoD5eb4FBVIma+yjTh2Z6xuKwPfTT6Ua4tJLA7v/zTfLazR4sARy3brZMy7jd/dW3u1wSPbSCP7cXXmlVOiEMuu7Y4fMX65Ro/xt7drJ6+arG7fDIYGo1ekRBrM9JwoLpStzoCePjj9epvkEGvhu2iT/393/t9q1VFJammT4AwnqKyA+C0RE0WrXLimV01qC30javFkOOJnxrThSUiQo9VSKaFViolQmbNpUet1TT8l7+O235X0ciObN5b4VOfD1t5SR4bjjZK6ja4BnZOxiLePrGvi2bSuXZgPfjAxpZOWaNQ5VZ+f584EhQ7w3HfPnzDOBlSulc
3qjRvaNy9daviUlchLKV6a0YUPg4ouBzz6TE1Sh4GkpI4NS/l8zhwPo1QuoXz+wxzcb+K5ZE1hjK4NS8lw7HIHN8/VWipyYKA3I/v47sHEBMp7Vq1nm7CLsga9Sqq1S6lul1CGlVI5S6jullKn8u1KqnVLqY6XUdqXUMaXURqXUs0qpICYnERFFoaIiCSSMMr5Iz/M1O0+oVSsJdkLVYZXssW+flHEGW+ZscC/3S0uTBj7XXx9YR1ZDlSrynqoMga+/UsQqVeTzwDVYsKuxFQA0aADUqhXa53rLFgm6XDsNV68umUmzga+n4DkhQcpFrTTI8mfvXtlnoHNMDX36lC7dZBdfa/muXSvr3/ob9/jxElh98YW9YzP4CnwBec1Wr/YcLBYUSFl4oGXOgPlS52AaWxmSkuT3tdqdHJC/5ypVpGu7KzuWStq7V+YfG/PgKbyBr1KqFoDfAZwA4GoAVwLoDGCev+DVefuvAJIAPAbgLADvAbgHwAchHDYRUfjt2SMHBH36lP4cSWlpUip1wgm+t2vZUoL2AwfCMy4KzOzZ8v6yK/Dt0gVo1qy03PmWWyRT88ILwe+7oq/lazbjC5TPkq1YIRnTZs2CH4dS8lz/+2/w+/LGvaOzwcqSRt4C36KishUHwTICjmCCr1Bp0kSy0J4CX+Pkk79xDxwoAdHUqfaPDzAX+B44IMGZuxUrZP5yMM99nTrSJdlfxnf5cvmssrKWsrtg5vmmpQGdO5cvCW/dWsYUTIMrNrYqJ9wZ3xsAdARwjtb6B631DABjAbQH4K9zwCBIkDxea/2x1nqe1vplABMBnO8MqomIKgbjn3XfvnIZDRnfTp38LyvBtXxjQ0qKvFbG+ytYSkmGwuEAPv9cLl94wfoapZ5U9MB3+3b5uzLzXCUkyN9WVpb8bFdjK0Oon2u7Al8jS2ww0yzJqtRUeV3sfH7topQ8j57W8nU4JGg67jj/+xg3ToJMI+tpl7w8yTT6C3wBz6+ZEewNHhzcOMxMvQmmsZWhRw+pYgg08PUWmBpLJQVaQcXAt5xwB75jASzWWm82rtBabwWwAMDZfu5rLHaW43b9QcjvEURfeCKiKGOUZxkZ32gIfM3882TgG/3y84Gff5amVsEsqeIuKQnYtg247Tagf3/guuvs2W/bthKMBbNOZjQzOjqbeS2MksU1a2QN1E2b7GlsZQhl4FtcLO8PT4Fvu3aSaTZzgG8sZeQaqHTtKuWidga+DgdwyimBr7Ubap6WNNJaAqXERHPvpyuukODe7iZXO3fKZTCB7wknBF/J4C/wLSgIrrGVIS5OgnSrZclHj8r72dv/1qQkyYoHulRSWpqcULOjIqSCiA/z4/UAMMPD9WsAXOjnvr8C2ATgJaXUTQC2AxgA4A4A72itc33dmYgophj/rDt1kpKtSJY6Hzsmza0uucT/toEEvunpqGkcKIXKzJm+y6+VAkaNCn4t1Fjwxx8yt8+uMmeDUe538GBwDa3ctWkjB4gHD5adGxqo1FR5P/tQMz6Iw6P8fFm3dehQc9ubWcrIYBwgr14tgSRgf8Y3M1P2bdfrZ9ixQzroeiopbd9enre9e6VDri+essbVq0u5vV2B76FD0lTo8cft2V8odOwIzJkjwa4R5Kany+tntkS4QQP5XP/8c+DVV72v0W6VrzV8Dc2byxJn7q9ZcbF0M7744uDH0bKl9DLwZs0aCX7tOHmUlCSVNLt3+38PG9atk9fPV+ALyIkAo9+HFcYJaztPcMa4cAe+jQBke7g+C4DP/2Za6zyl1GAA0yGBsuE9ALd6u59SahyAcQDQvHlzzJ8/3+KQ7XPkyJGIPj6RK74fo1uHhQvRPi4OjnXr0L9+fRxetQrrIvR61dm0CSeVlGBNXBz2+RmDKijAaQC2LlyIDJNzpnrffju65Odjvqf1Hm1QZ/NmnHTDDX63yxw9GhvvvTckY4gmnSdPRovq1bGgalWU2PmeKi7GwKZNsX/wYGw+ckQ64tqg6aFD6AFg2fffI9dTttCC+CNHcOq55yLOzxIhXbp2xXyj27BFrX74AV0mTsTyKVNwpEsXv9ufunkz9rdsiY1mni+tMahOHez9+WfkrV6N4wEsyMtDoU3PdaujR9GluBgLv/8eBXaUqbto8Ndf6A3g75wcHHQbb+OcHPQEsOK773DY1wG+1hi8aRN2d+yIzW776N6sGeouXYolNjwXjZYswYla4+969cqNNRI8/b9uVVSELnl5WPjddyho3BgA0GLOHJwAYGmNGjhqctz1+vVD3w8/xLrnn8ceO9YYBtBs7lx0B7Bk504c8zGOXm3aIG7hQvzlsk2dzZtx0qFDWNekCfYE+dx3LC5Gmx074Jg3z2Pw13LWLHQFsKSoyOc4zahbpw76AVj75pvYe8YZpu7T/Kef0A3Aktxcz4+vNU5p3BgHv/kG6/z11/Bw38H//IPdI0eW+1sJVkwfP2qtw/YFoADAix6ufxZAkZ/71gAwD8AGAFdAmlzdCyl9nmzm8fv166cjad68eRF9fCJXfD9Gueuv17plS/l+8GCthwyJ3Fg++URrQOt168xt37ix1jfdZG7bwkKta9TQRdWqaV1UFPgYfZkwQca/dKnW27Z5/hoxQuvOnUPz+NGkpETrdu20Hjs2NPs/elTr4mJ797lwobx+s2cHv68vvpB9/fCD9/fCHXfokrg4rQ8fDuwxLrxQHuPxx/1ve+SIbPvss+b3P3iw1omJWl9yibyWdkpJkfEsWWLvfrXW+r33ZN/p6eVvW7VKbvvqK9/72LtXtpswofxtTz2ltVLynAbrwQe1jo/XOjc3+H3ZwOP/6zlz5LlITS297ppr5PPXyt9gcbH8rzn//KDH+Z8XXpCx+fsbuu02revUkc8lw8SJct+MjODH8dprsq8DBzzfPn681vXrl338QBUXa92smfxdmnXvvVpXry7/B725+GKtW7e2PsZt2+R3f+cda/czIRaOHwEs1x5iwXDP8c2G58yut0ywq+sADAFwltZ6mtbaobV+FdLV+UalVC9bR0pEFEmZmaXNW1q0iOwc37Q0mefWqZO57a2s5bt2LZCXhyoFBZ47lNrB4QA6dJB5p+3be/464wyZL+lv6YtYt3q1NFOyu8zZULNmcE1iPLFzfdmUFJnzNmaM9/fCqFFQJSW+SyS90bq0wU1Kiv/tt2+XS7OlzkBpZ2e7G1sBoV3Ld8sWID6+dN1eV8bv76/BlaeOzoaEBHn+160LbpyAvIYnnSTLO0UrT0saORwy19TK32BcnPw9/PyzlJvbYccOKaP2t/5xQoJMuzD+DgCZitC+vf/lvcwwpt54+1xfsUL+huwoBY6LA0aPlvLzwkJz91m9GujWTf4uvElKkjnT27ZZGw8bW3kU7sB3DWSer7vuANb6uW9PANlaa/cWdkudl92CHBsRUfTIzCz9p928eWTn+Kal+f/n7KplS/MBpGs3UTsb0xiMZi/+5rwZtwezZmIsMIKx0aMjOw4rWrSQg8pgg7GiIjkoHT3a9/zVU0+FjosLrEPrpk3yt9qlC/DXX/7HHGjgm50t
85TtbGwFhDbwTU+X39PT50i9ehIoBRv4AsF/jhw7BixbFp3LGLkyGqIZz0lmppxcCGTcyckSgP7xhz1j87eUkcH9NTNOHAW7drLBOHns6URsQQGwapW9J4+Sk2V++J9/mtveTNNI47mw+nlkPKc9PIVdlVe4A9+ZAAYqpf77xFJKdYAsVTTTz313A2iolHJPOZzsvAxxZxQiojByDXxbtJADXbvOxltltqOzwUrGd/ny0qxKKALfDRuAffv8Hwz26QPUrl05At/+/csuBRPtqlaVv4Fgg7EFC+TvyF+2u25dHO7cObD3gnEfY/3iH3/0vb2VNXwNrn+Ldmd8GzeWRlGhCnx9zdFu165s5s/bPgDPS/Ucf7yMPdjPkSVLJGMX7YFvtWqSPTeWNApm3eFhw2QdWTNVCmaYDXyNoMx4zTZulAZndj33vpotpqXZ19jKMHy4vC5mnsfsbMnk+vvfGuhSSWlp8ho0aGDtfhVcuAPfdwFsAzBDKXW2UmospMvzvwD+66WulGqvlCpSSrm20/sIwGEAs5VSVyulhiql7gPwKoAVkCWRiIhiX2GhBGuupc5AZLK+OTlyMGo18N2929zSJCtWACedhGOtWoUm8DUOFvxlEKpWBU49NbAsX6zYswdYujR0Zc6hZMcyOykpclB65pl+Nz3UsyeweLH1k00Oh3SqPfdcCfL8HQBnZEj22ThAN8M1g2N34KtU6JY08hf4mlnLd8sW+Vz0VIJcpYp0vg32c8ThkOdh0KDg9hMOHTuWngxwOKS0uHdv6/upVUume6Sk2LNsmNnAt359Cd6N1yyY4N0TXxlfo9rIzr+hOnXkJMLMmf6fxzXOPr3+/rfGxZWu52uF1RPWlURYA18tSw6dDmAjgE8BfAZgK4DTtdZHXDZVAKq4jk9rvQ3AQAB/Q5phzQZwA4CpAIZrrQNc3ZmIKMrs2SP/NF0zvkBk5vkGMk+oVSspK92/3/d2hYWyhuJJJyG3Q4fQBb7NmwOdO/vfNjFR5lxl+2s5EaOmT5f3VWUOfIcMAerW9bvpoV69JOhdtszaYzgcctCuFDB2LPDbb0Cuj9UWMzLkd7OyfFKTJvKZ0L69fG+3UAS+hw7JcmK+Or2bCXz9Bc/G/OdgOBzAiSfGRqbMdS1fh0NO3gW6FNfYsfL8B/v8FRTI/zAzgS9Q9jVzOGTNWRPd0E2pXVvK6D1NvVm+XF7jIDvFl5OcLCdo/K29a/zOxtrcviQmWutBUVQkc90Z+JYT7owvtNbbtdbna63raa3raq3PcQa1rtts01orrfWTbtev1VpfpLVuq7WuqbXuorW+V2tdQY9SiKhSMs5Ou87xBSKT8Z00ScoH+/c3fx9fZ9ldORtboV8/5B53nJS52V3ObczvNdO8JClJAsMFFbCA6N13gTvukJLuXjHYCzLYYGzjRvkyGfQfMg5GrWRZ/v1XGtAY1QXJyfJ+/vVX7/exsoavq8svB666yvr9zAhF4Otrbq6hfXsJkA8d8r0fX/vo2VPKRwM9eVVYKE3Nor3M2dCxo5wQ3bFDAqlgxj1mjFwGW+5sfO5bCXzXrZNgzZjfa+e6s96m3tjZ2MqV2ecxLU1OwplZNs1qD4otW+Szh4FvOWEPfImIyA/jrK57qXO4M76//w58+SXw4IOlwbcZvuZVuTJKzU46SQLfoiIJTuySkSFl2mYbpQwYIKWwFancuagIuPNOYNw4KcH7/Xf7D/TCoU0bKbvPyQns/sZBqMnAt7B+fSmbtfJecC/TTEyUUk5fB8AZGYF1r331VeDpp63fz4w2bSR4NDNVwSyzgS/gPeubny8Bnr+ML1BaRmrVypXA0aOxFfgCwCefyGUwTaFatpT5rsEGvsZJEyuBb34+MG+evPZ2P/eeAt/8fPsbWxnatpVyczOBb0KCuc9jqz0o2NHZKwa+RETRxj3j26yZXIYz8C0oAG69VZrIPPCAtfuaDXyXL5cz3p06SeAL2FvubAQtZg+kataUzHZFCXwPHpTsw8SJku398cfYKN/0xDiI3hlgH8uUFMkGWsmuJiVJ9r+42Nz2DoeUVZ54ovxctSowcqQ8756CyMJC+X0CyfiGUps28vfvb6qCFXYEvtu2SUWGr3LpYDs7m+0JEC2M5/Ojj+Sk3YABwe0vOVmaewVTXRRI4AsAb78tl3Y/955WGUhLk78/u7uiG5KTgYULpbzfE62tzcGtWhU45RTz/5vS0iSg7sYFb9wx8CUiijaZmdLQwgh4q1cHGjUKb+A7caKUn73xhgSEVhgZan/zkVasAPr2BeLicLRtW5mbZmfgm5oqgZ6Vs95JSTIuX/MyY8GmTcDAgTLHdOpUYMKEwOf+RYNgltnJzpblRazObU5KAg4flnnoZqSmSkMk16WSkpMliPA0VzgzUwLiaAx8AXvLnbdskY7R9et738Zf4GsmeG7TRk4+BPo5kpoq80utVLhEknESYNMm4OSTpTNzMJKTJSibNSvwfVgNfLt1kyBt5syyJ47sYmR8XZtNLV8ul6HI+ALyPJaUALNne759zx4Jiq3+bzLbgyItDejUKbrXoY4QBr5ERNFm1y4JHl0PoMO5lu+OHcBTT0mzE2O+khXVqknTHV8ZX5fGVgCgq1aVA067M77ugYg/SUlSHrx4sX3jCLfffpOD4P37ZX7pDTdEekTBCyYYmzNHsrZjx1q7n5X1M/ftkznr7tUFo0bJ+89T2WMgSxmFQygCX39zcwHphl29uvcljcwEvkoF3uCqpMTcmt/RpFEjCRYBezKlvXvL6x9MufOOHVLJ4+skh6uaNSVIKykBBg+29nltRqtWUsGQlVV63YoVskSQp2Wx7NCvn/wP9/Y8BlKKbKUHBTs6e8XAl4go2mRmll9ntUWL8GV8775bAoUJEwLfh7+1fNeskXlWrmfc7ejIatizR9bwtXoQe+qpkm2P1XLnt98GRoyQ53/pUuC00yI9InsY5fOBBGMzZ8qJIysN2gAJAI47ztx74c8/5dL9/daokZx8YeDrP/CNi5P5zt4yvlu2SJBkVJR4Y3yOWF2WJy1NpgfESpkzIIG+8bzaEbArJSc7f/lFGg8GwuxSRq6MZnKheO6Nzw7XCqRQNbYyxMVJ1vennyTodhdI4DtggJQ8+/s8ysuTCgAGvh4x8CUiCpX582VupVWZmeXX9QxX4Dt3LvDNN8DDDwd3NtzTvCpXLo2t/tOzpxwg21Fm7C0Q8adePcl6WF0z8eef5YRBJD31FHDLLZJlXLjQ/mU6Iql6dSn9txqMFRbKwefo0XIwalVSkrwX/AVRqalSZuppzmBysjTScQ/ojJ8DaW4VSs2aSVm8XYFvUZH8rr7m5hp8LWlkBM/+gpWEBCkjtfp5abUnQLTo2FHe26eeas/+xo6VBl/z5gV2/0ACXyNIC8Vz777KQH6+lAyHqszZkJwsUyU8BappaVLhYExnMqNmTQl+Z80Cjhzxvt369XLimoGvRwx8iYhC5eWXZY6s1aU1PAW+4Sh1zs+XhlbHHw/cd19w+/KX8V2+XIJM14PhYDuyunI45EChb1/r901KkiV
NPJ2p90RraQD2+uuBZ0mCdfAg8NJLwAUXAD/8UFr+WJG0aSNLBlmRmirL4wS6dnFiopSM+1uT0+GQOdXVqpW/zXjsH38se31Ghhz4Wp1DH2pxcUDr1vYFvv/+K8GvmRMxZgJffwJtcJWaKh15oy0D78+4ccAzz5han9qUoUOlg3Cg5c6BBL6XXgrcdJP1qgwz3Jstrl4d2sZWhmHD5GSYp+cx0FLkW2+Vz6LBg71PCWBHZ58Y+BIRhUJuriwdA0jJrVmFhTJf0FOp85Ejvs/0Buv112U5oTffDL5JSqtWknHx1hHXpbHVf4LtyOrK4ZAumJ4CEX+SkiSANRqg+LN0aWkDJG8HI6E2bRpw7JgsPWX3HLloEcj6sikpki0ePjywxzQyUL7KC3NygL/+8p6t6toV6Ny5/AFwoEsZhYOda/mamZtraN9ePjfcTyBpHdrAV2t5jc2u+R1NRoyQCh271Kghfy8//mi9XLyoSCp9rAa+J5wg0zSqVrV2PzPcM76hbmxlqFULOOMM+bt3fR5LSuTkbiCB6SWXyOuydaucJFi0qPw2aWnyPHbuHPjYKzAGvkREoTB3rmRQAWtr0xrleZ5KnYHQZX23b5eswbnnyhIswWrVSoJeT0uiFBRI6af7GffjjpPsV7CB78GDEogGWjY3eLBcmi13njKl9PtIBL5aS+fmfv1CfzAXSVaDMa3loPP00yWDFYhOneRvz9d7YdEiOZj19X5LTpbS0cOHS6/bvj16s4uRDHyB8pn9vXvlZKKZcmmjhNTK58jmzfLZG0vze0MpOVleA7MdzQ27d8vfgtXAN5Rq1ZJGW8bUmxUrZO59hw6hf+zkZAlS164tvS4jQ05gB5qRHTVKmi/WrQsMGVK6hrMhLU1OJITiJEIFwMCXiCgUUlKk3DQ+3lrG130NX4MR+IZqnu9dd0mg8Prr9uzP/Sy7K0+NrQDJVHbvHnzgu3Ch/C6BBr5Nm8oSG2aaGh06BHz5ZWlG0VuZZigtXizle+PHh/+xw6lNG5k2YHYO+Pr10hAp0DJnQLJ/SUnAH394z345HPJ3PnCg9/2MHSsnfObOlZ+1jo3A12rGz5P0dDkINxMMGRlw9xNIVoJnwHqjPOPERqzN7w2V0aPlvW+13NnqUkbh4jr1JtSNrVwZqyLMnFl6nR2lyN26yXrLgwYBV18tU22M6ip2dPaJgS8Rkd1KSqQBxahRcqBmJeNrnJV2L3U21pUMRcb3p5+A774DHn3UvgNx93lVrjw1tjLY0dnZ4ZAD7ZNPDnwfSUnSIMtbqbbBKDF++mkp245E4DtlClCnjsyTq8iMg+mdO81tbxy0B7Ikl6vERDmg9/baOhxyIO0rqzxokCyfYoxp3z5530Rz4HvsmPX+BJ5s2SLZNTMl+N7W8g0k8F2zRj6LzXA4ZAm2E04wt31F17y5NFKqaIFvXl54Glu5Pm6/fmWfR+P/W48ewe27cWNpqnjjjdJP5Nxz5bMxI4OBrw8MfImI7LZsmQSoycmyNm00Z3zz84HbbpNx3nOPffv1FfguXy6lZ57KFhMSJPg/cCDwx3Y4JKiuVSvwfSQmytzN1au9b6O1BJ39+km2r1Wr8Ae+2dnAV18Bl18uwW9FZnWZnZQUoE8faVgUDCML6KncOS9P5nj7yxTGx8uJsFmz5GRKtC5lZLBzSSOzc3ONx/V0AmnLFrk0W56akCCVAWb/Hh0O+ZuPtfm9oZScLP/LfDUpdBetga+xysDq1TIPOdSNrVwlJ0tVzt698nNamnwmmV3n2JeqVYHJk6Uvx+zZpb8XA1+vGPgSEdktJUWyG6NGSWObTZvMZx4yM+W+TZuWvb5pUzkgtDvwffVVmd82aZI0AbKLkaH2tKSRr1KzYDs7Hz0qgXWwJYtmmhoZJcbjxsnPvjrS+pOVJScffvrJ2v0+/VSCr4pe5gxYC8YOHJCS92DKnA0JCUCDBp7fC0uXSgmzmbmhycmS6V2yJHYCXzs6rKenm5ubC8iBvKcTSOnp0mnabAdsKw2uduyQeZgscy5r7Fi5nDXL/H127JDXqGHD0IwpUEbGd9ky+TmcvRDGjpWTpLNny8+hKEW+5RZgzpzSpnAMfL1i4EtEZLeUFGmQ1KiRBDN5eeaXYdm1S7K77mWBRjBsZ+C7bRvw3HPA+ecDZ55p334B6abctGn5bIHR2MrbgUewnZ2XLJHO2ME2qWnbVrJLvgJf9xLjYALfVavkBMmNN0rwboaRce7fXzKbFV3r1nJpJvCdPVtONtkR+MbFyd+zp/eCwyEncIyGaL6MHCmZ35SU6F3D19C3r3SFfeKJ0iZ9gcjOli8ra0p7+juykjUGSstIzXyOcH6vZwkJ8lpYKXc2ljKKtsx5q1byf2HuXCkRDucJp9695TlJSZExrF8fmsB0+HAJ7KdNq1hruNuMgS8RkZ0yMiSIMQ64u3aVS7PzfDMzy8/vNdi9lu+dd8oBil0Nrdx5Wss3LU2CX2+Bb+vWUgIWaOBrBCKDBgV2f1dJSbI/Tw1+XEuMjfUz27WTAz9/84I9MUo5MzKAF14wd5+FC6VbaGXI9gJSut6okbnANyVF/o4CWcfZk6Qk+Rt2P/HkcAA9e5rLcDVoICdkUlKkeVOdOtGXGTNUqyZVIJs2Af/7X+D7sTo3F7An8K1XT/Zj5nPE4ZC/4V69zO+/MlBK/o/9+qvM9zYjkDV8w8GYevPLL+FrbGVQSvoM/PKLVFAUFMhnRih06iT/k8grBr5ERHYyzo67B75m5/lmZpaf32to0cK+jO+sWcCMGcDjjwc/B9IbY16VK1+NrQA5SAimwVVqqhzANmgQ2P1dJSZKaaqnkxbTppUvMW7fvnQdS6vS0yUbePHF0qhk0yb/95kyRQ7YL7nE+uPFqrZt/Qe+BQVSMj5mTNl1ooNhZAP//LP0uqIiOflgpbogOVkOfufPl/dLtGXGXI0YIdUgzz4beCVDIIGv+wmkY8ekaY/VLJbZzxGHQ06UVdT1r4ORnCzP/2+/mds+WgNf42Ty0aORWfItOVmWMHrzTfmZpcgRw8CXiMhOKSlS3tyli/zcvLkEJ9EU+OblAbffLh1M77or+P154ynju3y5BKW+DmKNA1arS6kUFFgPRHzxNs/XW4mxt460ZqSny/1ff13mWt92m+/fPysL+Ppr4IorAl+jNhaZWV/2jz9kvVw7ypwNfftKxtn1vfDXX9JAyUqJrDGmVauid36vq9dfl+A80M+JQDO+hYWln3Xbtsml2XnChoQEKSstLPS+zf79UjXBMmfPTjtNKhPMlDuXlMgJimgMfF3/p4azsZXh9NPl8+OTT+RkHLuHRwwDXyIiuxw+LJkc1wNupSTra6bUuaBADsS8lTq3aCGlzsGurfnyy3JA+uabUtIYKq1aycGra+mvmTUUExKklNhq5nTlSslO2HUQ27mznLhwD3wXLpSsnXuJcTCB75YtEhy0bClLI/38M/DDD963/+QTmXtZWcqcDWYC35QUoEYNYNgw+x63alXglFPKvheM762caOnUqfSgNxYC37ZtpSrk+++leY5VW7bIXH
9jOoAZ7n9HgQTPgHyOFBRI8z5vjAw+A1/PqleXzP+PP/r/v7N3r1RBRGPg6/o/NRIZ3xo1ZA5uYaF8Bpht0hZFdu0C3n1XCmmmTo3dOcQMfImI7PLLL3Kg5Z5pMrukkZHh8Jbxbd5cgp1DhwIfY3q6zCG9+GJ7AwNPWrWSLMC+ffJzfr7vxlaGQBtcGU1q7Mr4KiX7cl/GxigxvvjistcHm/E1DuxvvVXmgN1xh2QU3RkZ55NPrnzzEtu0kffTqlXAunWev1JSgDPOCG45K0+SkuRxDx6Un1NT5SDW24kqb4zPh1gIfAHJ9nbtKlUIRtdYs6zOzQXK/x0Z898DCXwBaWjk7b3y448S3EUiCxgrkpOlcmflSt/bGQ0cwxz45uQAX3wB/N//ySIFHlfCq1lTKo2aNIlYQ7nCkfJ3v7lGAv78Uyqfo5nWcn73+edLV+sbN06uq1OnKNLDC1h8pAdARFRhpKRIsxr3xkpdu8p/5mPHfJ/p9baGr8F1Ld9A57DecYfMJQ2mYY1ZRkCQmSljT0uTM97+Al/XjqxWuk07HHKSwVhKyQ5JScC338pBePv2pSXG115bft3c2rWlY6jVwPfQITlaMw7s4+OBt96Sx37uOTnycPXnn1LC+cEHgf9eseq44+TSX8D/yCP2P3ZiohwNLlggS5WlpgLnnGN9P+ecA7zyilQUxIJq1eT9eMYZMu7HHjN/3/R0yZRb4SnjW7s20KyZtf2ccIKM/Y47fG83dKi9S7lVNGedJeW5H3zg+7M7jGv4ZmUBM2cC06eXnm+uXx/4+GN5e15yCXDzzTIb5T8dO8r/1gjMq9+1C7jh/TH4HvH4aFUfPOdcMvqEE+QpNb769AntcuybN8u/yaIiKWKJjy/9Mn4uKZHCtZkzS8859e8vU/3HjpXzSX/8sR1AbGZ9GfgSEdmhuFgaRo0aJf89XHXpIgfMmzf77uZolPb6KnUGJPANZI5QSopkOF59tXRpmFAyAvjMTJkj6a+xlaFpUwlerWR8i4slELnwwsDG6o1RApmaKgfkn37qu8S4XTvp2GvF1q1y6TqHMTERuOoqea2uvrq0SRog2d569YCLLrL2OBXBBRdIEORriZ3q1eVg3W4nnyxHhw5H6UmQQEpkTz0VWLTI7ag8yg0bJhUOzz8v88qNExC+FBbK34LVLrPuJ5CMrLHFgKWkWg2k3JuKE+tu9T1cq4F5ZdO0qXzevfOOpPy8nXQKceC7Z4/M/pg+HZg3T4K3du0kwD3/fPmzWrMGePtt+Zj+6CP5V3PLLfLWrfntt1JyHGZLlgDnnQccPNgc815ehlvP7YSB6+Xf4fLl0jds2jTZVik51/vyy8CJJwb/2EVFcp7uxx/l37/ZViPVqsmf/H33SWlzOA4XwoWBLxGRHZYskfm5nhrquC5p5Cvw9ZfxNTKZgSxpdOyYNLTq3l0uw8E18AXkv3zDhuYOmnv2tBb4pqVJ5tTuuXoJCZJKSE2VA3h/Jcbt25tfusrgrZTz5Zel8/Ztt8mcX6UkM/ztt8D111euplaG6tUDy7LaoVYtCVZTU2WNZyDw99vAgbYNK2z+9z85gr7zTnlf+rN9u5yQCmRN0fbtS08gpadLSbkFRUXADTcAH300APHxA/D008D991tv3JyaKlP6W7aUeK51a7mMpj+9bdsk42n0UwyJZ58FvvlGokiHo0y39J075aaR83aga7VqUE2aBPwwBQXycm/YIB+jxteGDaX/9jp3Bu69V4Jd93YRPXsCkycDL70kwe/bbwPXXAPcfTdw7bXH4bTT5NyVa7bT+L5qVfmob9PGvmbwH34oS7O3bi3nuk48sTcAYEwnCSgNu3ZJILx4sYy5d28Z9zPPeD8c8CY7W6bj//ijXB48KIHskCHy8o0YIVnlwkL5OykqKvt9cbEcJliZlh9LGPgSEdkhJUX+e44cWf42o6TR3+nWzEw5Mmva1PPtrhlfq154QY6Q5s2T//Dh0Ly5HJUYmWwzja0MCQnA1KlSd2XmKMTu+b2GKlWAwYPlYO/PP2VeoK8S4/btZU6h1uYzVN6a9zRvLgect90maY4LLpBavsrY1CpaJCZKANi0qRzNGgFwjPn6azmH88gj0nDWlNatgSeflDTQjz+WPXL3xEdTqjVrgPffl6TusGHysVlms3btZEkvrWU/FqY85OcDl10GfPcd8PDDspuHH5Yg4NNPzU2tzsiQX/ObbzzfbgRIrVtLwDl+fHhXqPn3X3kNv/oKWLZMPmpuvVUS8iEplW3USE7EXXst8OmnOHL+1fjuO3k+f/tNXqam2IF6NVrj0IY4dOtmfteHDwMTJ8pHW3q6fOQbmjWT53f0aClyGjlSnmd/H6316kmQd/PN0uT97bflMczM8KlRQ/5lG4szuH41bmzuY72wUILtN9+UGQJffin39aZlS/lzGjMGuOce+difNEnud999Euj7el23b5dzUd9/L/+qiovluTv3XNnn8OEVN5C1TGtdab769eunI2nevHkRfXwiV3w/2qxHD61PP9377a1ba33VVb738X//p3WbNt5vLy7WumpVrR980NrYNm3Sulo1rS+7zNr97NCsmdbjxmmdlydjf+ABj5uVez++957WgNabN5t7nAsv1LptW61LSoIbrycvvSRjOfNMrevX1zo31/u2r70m2+7fb37/N96odePGnm8rLNS6d295/+TkaN21q9annGJp+GSd18/HWbPk9VVK60svDeuY7FBUpPVDD8mvUL26XF59tdb79pncQUGB1t27a33ccVofPep723fekQf491+ttdaHD2v9/vvy9gXk46BdO/ke0LpLF61vv13rOXO0LrzlDq3r1NF650658c03TQ3vyBGthw+Xu0yYINeVlGj98cda162rdb16Wk+b5v3+ublaP/641jVqaF2zptZPPaX1gQPyETpvntz3xRe1vvVWrc89V+v+/bWuVUseb/RorR2O0HwEzZs3T2dmav3GG1oPGlT6nPXtKx9Pt90mb8l27bT+6Sf7H19rrYsKinV2t1P0oepNdcua2RrQumNHrZ94QuuNG7Xe2TlJL4hP1FWrynvM18ek1lofO6b1669r3bRp6cfrY49p/emnWi9ZonV2tr3j37NH66VLtV64UF6n337T+uef5U96xgytp0/XesoUre+9V+vkZPmojY8vfa4BGes552j9v/9pvWyZfDx7epykJNn+nns8b2PGli1aX3SR7KdlS/mXWFQkt5WUaL1mjdbPPqt1v36l4+veXeuHH9Z68WI5XAiVWDh+BLBce4gFIx6MhvOLgS9RKb4fbbRli3ycvv66922GDtV64EDf+xkxQo6kfGnTRgJks0pKtB41So76MjPN388uvXtrPWaMHCUAWn/9tcfNyr0fFy+W7X/4wf9jlJRo3by51pdfHvx4PVm0qPTI4pZbfG87fbpst2KF+f2feabv133hQtnn0KFy+eGH5vdNAfH6+XjwoEQYgNaTJ4d1TME6eFCCM0Dr66+Xnx96SA7umzSRgMNU0DZvnuzkiSd8b3f//bqkWjW9eGGxvv56i
WMBrbt1k8Bh7155vA0btJ44UT6matSQbe6LlxNIM2+YKVfMnu13WNnZWp96qtZxcZ7/RNLT5XZAzlm4BlYlJVp/+aWcOwO0vuQSrbdvN/FcaAmMn3mmNIA75RT52LIr8PjmG6179876723Xs6cEPJs2ld1uwQKtTzhBtrnqKmvn3twdO6b1unUSFE6aJB97rVpp3RsrdRHi9O89btV//un2funYUR8791J99dUyhg4dtE5JKb/vggKtp06Vf2WA1mecIYFuNCoslKD+xx/lnObVV0uwb/w7qFNH/m0/+6wE04sWyXuoRg3fJ1isWLiw9GRRQoLWd98tJ4mMMQwcKCdjNmyw5/HMiIXjRwa+DHyJyuD70UYTJmi/2ckbb9S6YUPfR5Y9e2p99tm+H6tfPzlCNOv77/0H5aF01lkyZiP7k57ucbNy78ecHNn+2Wf9P8bGjbLtO+8EP15P8vMl/QNovWqV722XL5ftvvvO/P6PP17riy/2vc2118p+/WWcyRY+Px9795bXIi0tbOMJ1oYNEhTFx2v91ltlP4ZWrZKDZ0CypaaKLC67TFLG770nR/huX8WfTNO7jz9Fp1frqgHJil57rRzE+/oIPHpUMpbvj5ETSP/DXVoD+unL1+uNG73fb/durXv1kizy9OnetysslCC1ShUJUObP13rlSq0TE+X3791bAphA5OZKYrpDh9IA/4MP5OMjEEeOaH3NNbKvNm1y9eOPS5bPl7w8yZrGx0uxzddfe3++i4vlnO3332v99NMS1CUmSnGJa5bTeP3GjtX622+1LrzxVjm74Hpyr6REqoruu09rLc9rt25y33PO0TojQx7v88+17tSpNGD77bfAnptI27FD6y++0PrmmyUYdX2u2ra1dt7TjJISOQHSsaO8tsOHa/3221IQEQmxcPzIwJeBL1EZfD/aaNgw+S/vy+uvy0eur5rCxo21vukm3/sZPVrrPn3Mjy0xUWq2Aq23CtZ110md1vXX+wz8Pb4fO3SQ1Is/Dzwgz20oT3mPHSsZV3/27bN2oqGwUI5kHnrI93Z790pK6d57ze2XguLz8/GRR+S9GcpaQhvNmSPnS5o0kWStJ0VFEhDXrSvZqhdflMycV5mZskP3CMnt6+eGF+mpU7U+dMjioJ0nkHK79NLFULp2fJ5WSkpQf/+97MdIRobWnTtLcPbzz+Z2v2SJBGBKyVeTJlLmapSSBqOwUAK8Xr3kaWjdWs7J+Xw+3axZI2WrSmn96KNa//rrfEtj+Oef0hLYs8+Wc4OLFsk4brpJyqXr1i19qZSSDGxSkhQUPf20nMNYuFBOKpT52M7Olqh64MDSv4G9e2VHEyf+t1l+vtYvvCDnDGvXLg2ETzxRMsGhKAmPlP37pVz6lVek1DlUioqi47xnLBw/MvBl4EtUBt+PNjl4UAKX++/3vZ0xN/DPPz3fnpcntz/9tO/9GIGkGUVFcjR4++3mtg+Fxx6T7MCJJ8ppai88vh/HjJHT6b6sWydpHivl34HIy5P6P39KSuRI7667zO1361Z53d97z/+2OTmRO4FRyfj8fCws9Hn0uXatBCvr19s/LitKSrR++WX58+vVS95q/uzYofV55+n/Smo/+8zHVN5DhySicn4dXrlRP3PVRt1FbdQnN9qoZ7y6UZccywts8MYJJKW0bttW79olc2+NcuJevaSc+Z9/JMNWv76U+lpx+LDWd9wh55KysgIbpi8lJZK9NubkdukiGVN/Ad+HH8pHSLNmWs+dK9cF8v+6sFACMaN83PiqV0/Oh95yi5QcL1kSQDD18ceys/ffl59XrpSfPaTbt26V+dB9+kiWNEbOF5EPsXD86C3wtalhNxFRJfXzz7IGgKdljFy5LmnkidGp2d/aBS1aAHv3lm196c369cDRo9JJOVJatZKxrlplfRwJCfI7FBR4vl1raWVau7asXxFK1aubWwNSKWkba6xB6o+Pzrfl1K1bfo1oCr/4eFnayIPPP5cVj559FujWTdbvXLLE/K43bQIefFCaRQ8bJl2Fvb39fcnNBa68UpbwOf98WcvTTAPq1q2lgfgPP8hHx+WXS8fZm26S7sFau2xcrx7QuTN0p874+q/O6DK6Mx7/tDPOuKkzftrSGWPv6QxVo7r1wQPSArdmTXnAjh3RogXw1FPSvfb996Vr7TXXyKpi+fnA/PmyjqsVdeoAEyYAr7wiq6zZTSlZOiY1FZg5U942F1wgK1n98Uf57Y8ckSW7r7lGtvn7b+kIHKj4eOkGvHq1dDOeMUOWDD94UDr/vvmmLPk0YIDXt7N3V14p3e4feEDWs/axhm+HDtJhe+VK4JJL7FsqiCgQfPsREQUjJUUO0k45xfd27dvLMkLeljTyt4avoXlzOeo7cMD/2FaskMuTTvK/bai0bFn6fSCBb1GRRAOefPONrKXx3HOydkO0sBL4elvDl2JKfr4snXL55bIG519/yVJB8+dLEDNkiCynUyZwdDp6VJaFOe00WTLl1VdlHc30dOCii2Rln0cekdXIfNm/H/joIwm2mzUDPvtMAvCvvrK+7uzZZ8s5ut9/B8aOlaVmBgyQdVJfe03OvQHA5s2yxMzFF8uf+pIlwFtvAQ0aWHu8cowTSECZv40aNWRFnVWrgF9/lTVSU1PlOY9WSsl50VWrZCW0zEx5P4wZI0EpIMuQ9+8v74MnnpAV0Vw/OoPRqZMsrTN2rAShZldZ80kpeaGzs4FHH/UZ+BJFEwa+RESBKioCZs8GzjpL1nv1JT5ejkC8ZXyNtW7NZHwBc2v5rlghR7xGtjkSXH8fqwG4sTBmWlr52w4fBu66C+jTJ/rWtLWa8a1alQeMNsnMlPVpv/kG+OcfCSpDbds2SX5NniwZtnnzJBB75hnJUL72mpzfOOssuf6zz+Sj46+/ZK3RVq2Aq66Ssb/4oqzROnu23GfOHODkk+X6jh1lPdOUFDn3Bcg5oVdfBZKS5JzYNddIZvb//k+yvI88EnigExcHDB0KfPKJfDxNmSJZ0nvukczw8OHyJ7p4MfDGG8DSpRK82cZD4GtQSjLikyfLyYJYUKWKvD4bN8qSuAsWSMb67LPlpEJ2tgTzTz7p/99JVDjxRKm4eecdSSfHx8ubkCiKsWaKiChQixZJmZe/MmdDly7+M77+TvO7Br49e/redvlyOdKO5FGUEfg2alR6IGtW164y9rQ0SSm5euYZec6mT4++o8T27SX9lpvrP9WWni5pmGj7HWJIcTHwyy/A1Kllg0JD27byp9e1q1x26SJZWDvKW1NSJGjVGvj+e+Ccc8reXqeOnJ+55Rbgiy8k4LniCskO5+RIBvOCC4Drr5fg1TVIjYuTbOrIkRIMv/ce8O67krlr21b2vW6dbNurlyTezj5bzgXZktVzUb8+MG6cfK1dC3z4ofzpnXeelNHalZ0sw/i8OP74EOw8cmrWBO67D7juOjmh8cYbcuJk2rTSj/eY8dRTUlLw889SmsDPMYpyDHyJiAI1c6Zk60aMMLd9166SwikuLn+AkJkpZ8ybNPG9D+PIaM8e39sVFckksRtuMDe2
UGneXI7CTzrJ+tF4jRpA587lM75r1wKvvy5HjgMH2jdWuxgH7Nu3y0RPX9LTgY4dsXmzBBU9esiczMTE8B9Dbt8ucw/POUemE4dbcbFkaOPizJXm7tghpaPvvy9jb9ZMMq5XXSXzYjdulPNMGzfK12efAYcOyX2rVAEGDZJzVmPGyJ+mlbdnUZEEmi+9JIHmt9/6rlavVk3mb155JTBrFvDllzIn9bLLzAXgbdtKjPHoo8CPP0oAXFQkc2/HjrV+TikY3bvLvNhXXgnxA/nI+FYEjRrJiZAnnpA5tnafrAiL+vWl5OCKK1i1QjGBgS8RUSD27JGjz1GjpMmLGV26yBF5Rkb5g7nMTEmb+Ov8YZSS+St1jobGVoAE88nJ5rPi7hISJIA3aC3ps7p1gRdesGWItmvXTi7NBL5btiCna3+cfro0nVm0SJrONG0qAej550u5abVq3ndRXCwZwc2b5S3Uo4e14R47JkHMiy/K961ayXmFCy80fzBeUgJ8/bUk4vfskfFWqyY9wdy/LymRZHhurrxFje/z80v3V6+eHEe3bi2Xrt8XFEjGcdYs2dfw4ZJ1HDu27PPkPu9Ta2DfPsmSzp0r2dr77pOvTp0kAB4zRk46uCoslDJU4ysrSwJeh0Oq7CdMMNf3DJA/72D+HKpWBc49V74qvKFD5UXs3j3SIwkpq/Ovo85ll0mps9UPHqIIYOBLRBSIBx6Qo/aXXzZ/H2Ou7YYN5QPfXbvM1QvWrSu1cv4C32hobGWYMSPw+yYkSE3l0aOSFvnyS+kYNHmyRIfRyMhU+Zvn64ykJs3qiCNKAqlOnYCffpJf+Ysv5NxKgwYSKJ13nrz0mzZJkGt8padLcGY44wyZhzlihO/AVWt5ae66S+apXnihZEsff1wqy999V4Jwf1PEf/1V/hxWrpRpf5dcIsFpfr5cun6fny/Z1pYt5YC/Vi25dP2+uBjYuVO+duyQBP+uXWUbmTdvLo953XXmK2GVkqxws2bSSOrZZ+XcxKxZEgRPnixBbL16QJMmJ6GgQF6i3Nzy+6pVSxoRXXGFucemAJxyikyEpuimlJz1IooBDHyJiKz6809pc/rww9YaRxldWDZulEyxq8xMiXr8UUrKnc0EvrVrx07nF28SEiRCW7dOyp7vuUeC+UiXcPvSqpVEd34C3z2Lt6I5gHUFx2OuozRDecEF8pWXJ5nJ6dOlqv7TT0vvW6uWvF0SEiQz3KmTBIBLlgCTJsnbq3t36eZ6+eXlM5Lr1wN33CFzY3v0kObYp58ut40aJf1qHnlEppHfd598777kycqVsvTO3LkS63/6qSR/QrFcSVGRvOV37pRzIIMHS/YzWO3aSbnwTTdJgPvrrxIIr12bh86d66BhQ5T7atRInutoPe9CRESeMfAlIrKiqEhKbdu1k8DXiqZNJX3nqcFVZqZ0tzGjRQv/c3yXL5fJh7HebMS1s/Pnn0v0M2NGdP9e8fFSk+sj8M3MBF68dgveAPDAlI7o6aEivUaN0rLYwkLpAluligS5LVp4zuYOHSrB7pdfSvnv9dfL2/TWWyW4q1YNePppYOJEOS8yYYI0WnINIqtUkbf4BRdI0Pv88zI/duJEKSfeulXmmn7xhQSBr70m+6ge4JKtZhhPaSinEdauLc2hzj4bmD8/DUOGDAndgxERUdhxOSMiIiveeksWZJwwwfrkLKUkQ+y+pFF+vkwcNNsatXlz3xlfo7FVNJQ5B+v44yWi+uoribxuuMHmNVOCt2aNBJhDh0qzpaNH4XNJoz17ZCmWBlnpAICeZ/tv3lO1qqz9mZgobxNfJczVqknJ8t9/SwazXz8pX27bVoLm//1Pbt+4UbK+3jKnzZvLUjZ//CEV9uecI73ETjgB+OEHCajT06VUOpRBLxERkR2Y8SUiMmvXLokgRo4sv26JWV26yEKf7vsF/K/ha2jRQsqtvVm3TroURbqxlR3i43G4XXfUnTMHuTUa4fVaz6PKCxKI1a0rS7rUrStPnZEcDgetJaj83/9kJY+aNSWwvO466Sz8S4v26HVwPtxjyn37JOjdvh24aUQ6sLhpyFooG2udDhtW2gjbeAsPGGB+P0lJUtY8aZJ8XXONdKI1+3YlIiKKBgx8iahyOnZM0l0XXihtYc24/36ZeDlpUuBrT3TtKpMhXdd4NdbwtRL47t8v9a+e0nXR1NgqSNOmAdicgCvwFx5WL+KtSY3LrdNquPtu6TUWyiro/Hwp8X3tNWD1ankpnn0WuPFGKftNTQXefhv45et26KN3YtTwItx4azzGjJGuzWecIVnSWbOAls+nh22plu7dpVlVoKpWlef37rvtGxMREVE4MfAlosrpq68kEnj/fUmF3Xab72DW4ZAo7LHHzDWh8sZoNrVpU2k3I6sZX2NJo337PN9nxQpJhcZwYyutZXnI++8H7up9DS7sXx8T37kOE5ScezhyBDh8uPTLCEbXrZPv69e39nhFRXIuoqhIvgoLy35fUCDB6qRJUmWekCBL6lx6adky36Qk+Tp0UntUua8E2Wk7cc457dGunTSH2rpV1mEdOhTAdVuicx1iIiKiCoiBLxFVTlOmSGB4wgmS+V2zRtZu8ZRBLSyU7j0dOkgb22C4LmlkBL5GxtfsHN8WLeRy927Pga/R2CoU7XXDoKREMosTJwIXXQS88MlQVK8+FACgIGXFNWuW7ao7eLB0IL71VlkFJSXF3DI3hYXyVnjySeDAAf/bn3mmNPQePtz3eZL6J8qSRn9+noGUg+3x9tuyRu8PP0jWF4WFUu982WX+H5SIiIiCxsCXiCqfVauAxYslRXjHHbJWy4svSrefb78FGjcuu/2kSRIYz5hRfk0Xq4xssWuDq8xMCbjdH9cb18DXXVER8M8/Unsbg/LzpfHS118Dd94pc2jNxu833ijnMi64QOawTp8uDaE80RqYPVvm465fLxnYMWPkZYiPL/1y/bl7d/kyxbmWb/yODJx7JXDuuRLQ//e7/PuvLFhrdhFaIiIiCgoDXyKqfKZOlfrUq66SSOSFF2Qx0+uvl4gpJaU0wsnMlE4+o0fLujLBqlVLlkJyXdIoM1OyvWYjPCPw9bSkUQw3tjp0SALEefOAV16RJXutTqU+/XRg6VJ5qYYPlybc48aV3Wb1atn33LmyNPCMGbJ9oNO2PWrXTi5dOjuXeXnTpaNzuOb4EhERVXaxWQdHRBSo3FxpLnXBBWUzrFdcAcyfL7cPHCjpQEBSgoWFwBtv2BcZdelSNuO7a5f5MmegdI6vp4zv8uVyGWONrfbvr4akJGkO9emn8rQH+nR36iQJ/TPOAMaPB26/XRLhe/bIz717y9M0YYIsDzx2rM1BLyC12M2aeV/Ld8sWuWTgS0REFBbM+BJR5fLVV0BOjkRA7gYOBJYtk0goORm49lrplPTkk/YGKEZnZ60l4srMtNaIqmZNoF49z4HvihWyPE7nzvaNN4S0liD01lv7IjdXGkideWbw+61fXxL3998vvcsWL5aS5mPHJBB+7DHpwhxS7drJPF5P0tNlwV2uCURERBQWzPgShduMGVK
PqXWkR1I5TZkCdOsm3ZA8adtW1sg95xzgvfck4L3/fnvH0KWLBN9798rPmZnWA6AWLbxnfKO8sdWhQ8B33wE33CCx4YABQEFBHObPtyfoNcTHyzTu996Tac9Dh8pU7ddfD0PQC8g8X28Z3/R04LjjQrv2EhEREf2HGV+icJs9WyYx5uRYX3OFgvP33zIBdMIE37WttWsD33wDfPAB0L+/ZFjt5NrZuX59IDvbWqkzIIGv+xxfo7HVzTfbM06blJTIsObMAX76CVi4UPo61asn5ciPPw40abIM/foNCsnjX3cdcOWVkmANq/bt5e/dyOy7Sg/fGr5ERETEwJco/IymNvv3M/ANt6lTgRo1JAryJy5Oml2FglHWvHGjZJgB6xnf5s2lO7WrtWtlkdsoaWxVXAx8/rn0Btu6Va7r2xd44AFg5EipLDdWj5o/vzCkYwl70AtI4HvsmKy33KxZ6fVayxzfU06JwKCIiIgqp+ithSOqqIzA18yioWSfI0eAadOACy8MU52rD+3aSVfpDRtK1/C1o9Q5ShpbaS2Z3b59pXF2w4bARx9JD68VK4DnngMSEz0vmVyhOJc0KlfunJ0t9d5cyoiIiChsGPgShVNRUelBMAPf8PryS+DwYc9NrcKtShVpPhVs4HvokGR4DUZjK2Ot4AhYtgwYNgwYNUrONXz5pVx39dWlqzBVGt4CXy5lREREFHYMfInCaft2qf8EGPiG25QpslbvqadGeiTCWNJo1y75OZA5vkDZeb7Ll0uaNQKNrTZtAi66SBpVpaUBkybJksIXXxzVfbZCi4EvERFR1OAcX6JwMg54AZnjW5Hl5gK1aoVggVQnrSWlWLeu/21XrpSg0M61eIPVtSswc6YERVWrll1T2AzXtXzbt5e1hv/5B7j1VtuHWlQEHDwoFbqevjZuBD77TKq3n3gCuOcecy9LhdegAVCnTvkljYw1fI87LuxDIiIiqqwY+BKFk2vgW5Ezvnv3ykH9tGnAuefav/+sLEkvLloEfPIJcP75vrefMkWaWl1xhf1jCVSXLhJRLlwoZc5WA3Ij42vM8127FsjPt9TY6uBBefocDjmHcPSonK/IzS39/ujRstXUntSuLRXkjz1WGo8T5DX1tKRRero8UXXqRGZcRERElRADX6JwSk+X7F6dOhU78F2yRCKmxYvtD3zXrweSkyWL1qULcMEFwNNPA48+6jl4PHxYWgtffLF0WYoWxpJGy5cH1ozKvdTZZGMrrWXO7TvvyPzbY8dkSnCjRpKgb9lSAtlatcpeNmzo/atGDevDrzS8Bb4scyYiIgorBr5E4bRli2RClarYpc4rVsjlhg327vfnnyWArV4d+P13yW6OGycLwa5dK+vuuq+5+8UXks6MhqZWrowljYqKrDe2AkqXxzEyvitWyMK4XjoFHzkiT8U770jld+3asqrT+PEyLZhCpH17qUxwtWULMHhwZMZDRERUSVXWliNEkWFkeho3rtgZXyP7uHGjPfvTGpg4ETjrLAkkli4FBg2SVOPHHwMvvQR89RWQlFTaJdkwZQrQs6csGhtNGjcundcbSOBrzAs2Al8Pja20Bv76C7jlFnmIceNkKvDbb8vTNGUKg96Qa99eJkIfPiw/FxQA//7LjC8REVGYMfAlCqf0dMnIVeTAV+vSjO/mzZLRDEZBgURsd94JjB0LLFhQ2i0XkOz5/fcDP/wgZdD9+0stLyDB4MqVktaMlqZWroysr9WOzgZjLd/CQmDVqv/KnDMygBdeABISJLB9/33gnHNkOvE//wA33STJYQoD987O27cDJSVcw5eIiCjMGPgShUt2tnQTqugZ38xMCcZ695aAbNu2wPe1fz9w5pnAe+8BDz8MTJ/uvSHQ2LES2VWrJpnfr76SlGbNmsDllwc+hlAy5vkGkvEFJPDdswdYswbIz8ev2f2QmAh06CBPV8OGwOTJwM6d0sTqlFOiM/6v0Nq1k0ujszOXMiIiIooIzvElChdjCZOOHWXt1oo6x9fI9l5+OfD331Lu3KmT9f2sWweMGSNR22efAZdd5v8+PXtKGfR55wGXXALEx8tE1gYNrD9+OBgZ3wADX928OY7+vhgfXrMctwK48f2TUK0b8Nxz8nR16GDbSClQ7hlf188BIiIiChtmfInCxTXT07ixrBFz9GhkxxQKK1bIPNOLLpKfA21wddddwKFDwB9/mAt6DU2bAr/+Clxzjfx8882BPX44nHyyBOdGAGzB5s3Adwuk1LnexuU4Vr0+vllxPNaskWwvg94o0bKlzMc2At/0dGnOFmh5OxEREQWEgS9RuLgHvkDFLHdevhzo3h1o21ZqbQNtcPX338DZZ0twaFX16jKxdf/+wJYKCpfTTwf27bMUpebny+pNCQnAX7tboDaO4sp2f6DmqX3Rp69iKXO0iYuTvwXXwLdjxzJNyIiIiCj0+J+XKFzS02UJmjp1gCZN5LqKVu5sNLbq108mk3btGljGd98+mbuakBD4WJQC6tcP/P7hYqEM+9dfpZr7iSekWdU9L8tavmr9+ugO8Cs717V8uYYvERFRRHCOL1G4bNlSesBbUTO+O3dKwNqvn/zcpQvw22/W97NmjVwGE/hGsWPHZGnX338H0tIk4dujhyTKu3eXRLmr3buBu++WdXg7dZLljM88E8AvzUs3Mp5zij7t2wO//CInhrZsARITIz0iIiKiSoeBL1G4pKcDp54q31fUwNdobGVkH7t2lXbCR45478bsSVqaXFaQwLewUFZY+v13+Vq4UEqWq1QBOncG5s4tO927ZcvSQLh+fVnCOC9PMr0PPijLFwOQrs4GZnyjV7t20tBu1y5Zz5dLGREREYUdA1+icCgslOVMrrxSfq7IgW9cHNCrl/xsNG3auFEWlDUrLQ1o1KhsYBdjDh4Evv5alhd2OIDcXLm+d2/glltkem9ioqynW1Iib481a4C1a0sv339f7jd8OPDWWxIkl2E8Pw0asHw2mrVvL9leh0N+5mtFREQUdgx8icIhI0OiG/dS54o2x3f5cklV1qolPxvr1AYS+CYkxNyis8XFMg/3o48k4M3Lk9Lkq6+WQPe000qnd7uKi5Ny5w4dgNGjS68vKZFzI02aeHkqGjeWtHHfvjH3XFUqxpJG8+bJJQNfIiKisGPgSxQOrh2dAVnepF69ipXxNRpbnXVW6XWdOklAZqXBldYS+F5+uf1jDJF164CPPwY+/RTIzJQ5utdeC/zf/0kFcqAxaVycrM7kVZUqwIgRst4xRS/3wPe44yI3FiIiokqKgS9ROBiBr+vcvsaNK1bgu2MHsHdv2SZLNWvK/EYrSxrt3Cnr90b5/F6tpZT5tdeApUslBh01SubjJifLikphMWtWmB6IAta2rVxu2iQTuI2KCCIiIgobBr5E4ZCeLpFQy5al1zVpUrFKnd0bWxmsLmkUA42tdu0CbrwRmDlTGlD973/AZZfF9JRkCiXjb3/XLpY5ExERRQjX8SUKhy1bpLwxzuVPrqJlfFeskLSn0djK0KWLZHy1NrcfI/Dt0cPe8dlAa2DaNBnaL78Ar7
4KrFolSw0x6CWfjHJnBr5EREQRwcCXKBzS08sf8Fa0wNdobFWzZtnru3aVJVx27za3n9WrgVatpKtzFMnMBM4+Wxpzd+sG/P03cM89EusT+dWunVwy8CUiIooIBr5Eoaa1BL7ua3dWpMDXaGzlOr/X4LqkkRlGR+coobUsRdyjh6y3+9prsiqN0bCayBQj48s1fImIiCKCgS9RqGVlATk55TM9TZrI9QUFkRmXnf79F9i3z3Pga0SIZub5FhfLArZREvju3CmNqq6+Woa0ahVw113M8lIAWOpMREQUUQx8iUJtyxa59FTqDEhgHOu8NbYCpKNtjRrmAt/0dFn8NsKBb04O8OSTwAknAL//DkyYAPzxB9C5c0SHRbFs9Gjg0kuBPn0iPRIiIqJKiV2diULNfQ1fgxH4HjgQ+52RjMZWJ55Y/ra4OIkYzZQ6G42teva0d3wm5eUBkycDzz0nL8uFFwIvvMDqVLJBhw7A559HehRERESVFjO+RKFmJvCNdcuXS5bWvbGVoUsXcxnftDRAKekeFUZFRcCHH8ow774b6NsXWLZM1ull0EtEREQU+xj4EoVaerpkdGvVKnt9kyZyGetr+fpqbGXo2lWeh8JC3/tKS5MTBLVr2ztGL7QGvv9eEtXXXisv06+/ylJFnqq2iYiIiCg2MfAlCrUtWzw3tKkoGd/t2yV49xf4FheXZr+9CWNH5z17gEGDgPPOkwB4+nRgyRJg2LCwPDwRERERhREDX6JQ87SUEVBxAl9fja0MZpY0ys+X28MQ+O7bJwHuP/8A770nSwefd55UWRMRERFRxcPAlyiUCgpkqR9PGd9atWRObKyXOq9YAcTHe25sZTACX1/zfDdulMm2IQ58s7KAM8+URHxKCnDddTJ8IiIiIqq4eLhHFEoZGVJH623tzsaNYz/ju3w50KOHLFnkTaNGMqfZV+BrdHQOYeB76BAwYoQsFTxzJnD66SF7KCIiIiKKIsz4EoWStzV8DbEe+BqNrcx0gura1Xepc1qapF6N7LDNDh8GRo0C/v4b+PZbCYCJiIiIqHJgxpcolIxmTt7WxIn1wHf7dhm/r8ZWhi5dgNmzvd+elibBcbVqACSm3rVLHuLff+XS+DJ+rlIFGDcOuOUWoFkz77vOzQXGjAGWLpUlipKTLf6eRERERBTTGPgShVJ6upQAt2jh+fYmTSQFGauWL5dLsxnfDz+UeuP69cvfnpYG9O8PQILeSy8Fvvqq7CZ16gDt2slXv37Azp3A008DL70EXHmlrMHrvgTwsWPA2WcDf/4JfPaZNLEiIiIiosqFgS9RKKWnS5mzt3bBsZ7xNRpb9ezpf9uuXeVy48b/Atz/5ObKc3XNNQCAH3+UoPfGG4HRo0uD3fr1yz+VGzYAr78OfPyxdGgeNQq45x6Zv1tQAJx/PvD778BHHwGXXBL8r0xEREREsYeBL1EoeVvD19C4MZCdLWvcVqkSvnHZZflyaUblq7GVwXVJI/fAd+1auUxIQF4ecMcdQPfuwBtvAFWr+t5t167AO+8Azzwjl2++CZxxBtCrlzy9v/8OTJ0KXHWV9V+PiIiIiCoGNrciChWtva/ha2jSBCgpAQ4eDNuwbGOlsRUgz0NcnOfOzqtXy2VCAl55Bdi61VzQ66ppU+Cxx6SR9nvvAYWFEvS++SZwww3m90NEREREFQ8zvkShsn8/cOSI/4wvIOXOxvexIiNDFsU109gKAKpXBzp08Bz4pqUBNWtimzoOzz8PXHghMGxYYMOqUUPW5r32WmmO1apVYPshIiIiooqDGV+iUDE6OpsNfGONlcZWBm9LGqWlAd274577qyAuDnj11eCHpxSDXiIiIiISDHyJQsVYw9dXqXMsB74rVkgtspnGVoYuXSTwLSkpe31aGnY27onvvgMeeUQaWRERERER2YWBL1GoGBnfDh28b9OkiVzu3x/y4djOaGxVvbr5+3TtChw9CmRmll534ACwaxem/ZWATp2kIzMRERERkZ0Y+BKFSnq61NrWrOl9m1jN+FptbGUwOju7zvNdswYA8Pu+BEycaC2OJiIiIiIyg4EvUagYa/j6Uq+erIMba4Hvtm2yDJPZxlYG17V8nQ7+mQYAaHlGAs46y6bxERERERG5YOBLFCpbtvie3wtIB6bGjWOv1Pm33+RywABr92vdGqhVq0zGd/lHachGAzw2mZ2oiIiIiCg0GPgShUJeHrBzp/+MLyCBb6xlfKdOBbp3B3r3tnY/pUobXAFwOIBqm9KQ0zYBx3dS9o+TiIiIiAgMfIlCIyND5sFWxMD3r7+AZcswIW88tmUEEKx27Qps2ICiIuC2WzVOjEtDqxEJ9o+TiIiIiMiJgS9RKBgdnf2VOgOxF/hOnYqCKjXwVPqVuOkmie8t6dIF2LYNUyflY9/qXWhQko2qvRn4EhEREVHoMPAlCgVjDV8zGd8mTWJnju+RI9CffYZvq1yMqk0b4qefgG++sbiPrl2BkhJ89NgWXNVHGlshgYEvEREREYUOA1+iUEhPlyZOzZr539bI+FpOnUbAF19AHT6MSQXj8eGH0tT5jjuAQ4fM76Kooyxp1LlkA+4/yxn49ugRgsESEREREQkGvkShYCxlpEzMgW3cGCgsBI4cse/x58wpu1auXaZMwba6PZHRYiBGjgSmTAH27gUeftj8Ll77UQLfe8duRKPMNKBFC8l6ExERERGFCANfolBITzc3vxeQwBewr9w5Nxc491zgwQft2Z9hxQpgxQq8njsOV1ypUKWKZHxvuw2YPBlYvNj/LhYtAh5+qT4O1miOPrU2AGlpLHMmIiIiopBj4EtkN61LM75mGNlOuxpczZ0L5OcDqalASYk9+wSAKVNQWLUmPi65AldfXXr1M88ArVoB48dL4tqbnBzg8suBtm2BOn27AuvXA2vWMPAlIiIiopBj4Etkt717JetqNvA1Mr52Bb4zZ5bub906e/Z5+DDw+eeYVfcSdOrXoMyU3Lp1gTffBFatAiZM8L6L226TVZ6mTQPie3QFli4Fjh5l4EtEREREIcfAl8huxlJGkQh8S0qAWbOA/v3l59TU4PcJAJ9/DuTm4oWs8WWyvYZzzgHOPht44glg27byt3/5JfDJJ8CjjwKDBkGWNCoulhsZ+BIRERFRiDHwJbKblTV8gdJSZzvm+C5dKhnnO+6Q+mOHI/h9ag1MmYKdTXvhr/gBuPRSz5tNmgTExQG33FK2QXVGBnDjjcDAgcBjjzmv7Nq1dAN2dCYiIiKiEGPgS2S3LVukm3P79ua2b9hQtrcj45uSAlSpAowaBSQlSeAb7DJJy5cDf/2FSXnjMHqM8tqAuW1bme87ezYwfbpcV1wMXHmlJKI/+wyIj3du3EU6O+O444A6dYIbHxERERGRHwx8ieyWng60bg3UqGFu+ypVgAYN7At8Bw0CGjUCEhOBnTs91x5bMWUKiqrXwuTDl+Oqq3xvetttQJ8+wO23y9q+L74o1dZvveVW+d2xo/zeLHMmIiIiojBg4Etkt61bJZNpRePGwZc6Z2QAq1cDycnyc1KSXAZT7pyTA
3zxBRytLkXVxvUxerTvzePjZW3f3buBiy+WOb+XXAJccYXbhlWrAnffDVx7beBjIyIiIiIyiYEvkd0yMsyXORuaNAk+45uSIpdjx8pl9+5SRh1M4PvZZ8DRo3hsx3hceilQrZr/u/TvD9x6K/Dzz5L4njxZKrnLefll6YpFRERERBRi8f43ISLTiouBHTusB76NGwOZmcE9dkqKzJ015s/GxUm5c6CdnZ1Nrfa17YOF/56EiR66OXvz7LNS6nzLLVLFTUREREQUScz4EtkpM1OC30AC32AyvocPA/Pnl5Y5G5KSgE2bgF27rO9z6VLgn3/wQdXx6N5doV8/83etVw/4+GNgwADrD0tEREREZDcGvkR2ysiQy3btrN2vSZPg5vj+8gtQUFA+8E1MlMtAsr5TpqCkVm08l34prr7aS7kyEREREVEMYOBLZCcj8A0k43v0KJCXF9jjpqTIfN5Bg8pe36cPULu29cD30CHgyy+xostlyI2rV745FRERERFRDGHgS2SnQDO+jRvLZSDlzsXFwKxZsnZvvNu0/apVgVNPtd7gato04NgxPLl7PIYPB1q1sj4sIiIiIqJowcCXyE4ZGUDTpkCtWtbuZwS+gZQ7L14s93MvczYkJsoyR1lZ5vbnbGqV06UfZu/uh6stNLUiIiIiIopGDHyJ7LR9u/UyZ0Dm+AKBZXxTUiTTO3Kk59uTkiSYXbDgv6vefx944YUT8O67wIYNcvN/Fi8GVq/G9MbjUa8eVxwiIiIiotjH5YyI7JSRIevnWhVMqXNKigS33tYNGjBAFuBNTQWSk7FkCTB+PFClSlP88ots0qyZJIaTkoArfp2ChnXq4KF/LsGFlwI1a1ofEhERERFRNGHGl8guWkvga3V+LxB44JueDqxd673MGZDItX9/wOHAkSPAFVcArVsD06cvwoYNwLvvAiNGACtWAE/ckY2aKV/hvaOXY8/RuixzJiIiIqIKgRlfIrscOCCdmQMpdQ50jm9Kilz6CnwBSeW+8goeuj0XW7bUxvz5QElJEbp0Abp0Aa6/XjbLemoaaj6Zh73njse4xsDgwdaGQ0REREQUjZjxJbJLoEsZAUD16kCdOtYzvikpQLduwPHH+94uKQkoKsLaDxfjvvvkx3K0RqNvpgD9++ORb/tgyhSu3UtEREREFQMDXyK7BBP4ApL1tRL4HjoE/PGH/2wvgL2dTkUx4nBRcweeftrLRgsXAmvWyARgIiIiIqIKhIEvkV3sCHytlDr//DNQVOQ38NUauO6uevhH9cYV7RyoXt3LhlOmAHXrApdcYn4MREREREQxIOyBr1KqrVLqW6XUIaVUjlLqO6WU6W5ASqluSqlvlFL7lVLHlFIblFJ3hHLMRKZs3y7lyg0bBnb/Jk2sZXxTUiRYPuUUn5u9+y7w44+ASkpC7dWLgYKC8htlZQFffy2dr2rXtjhwIiIiIqLoFtbAVylVC8DvAE4AcDWAKwF0BjBPKeX3aFspdRKAJQCqA7gewFkA/gegSqjGTGSa0dE50ImxVkqdi4qA2bOB0aOBKt7f/hs3AnfdBQwfDvS6NRHIywOWLy+/4SefAPn5LHMmIiIiogop3F2dbwDQEUBXrfVmAFBKrQKwCcB4AK95u6NSKg7AJwB+01qf63LTvNANl8iCjIzAy5wBa4HvokWSpfVR5lxYCFx5pfTN+vBDIK5aotzgcACnnlq6odbA1KnAyScDvXoFPn4iIiIioigV7lLnsQAWG0EvAGittwJYAOBsP/cdAqAbfATHRBFlR+B78KBkc/1JSQGqVgXOPNPrJs89ByxdKlN3W7cG0LSpdIBOTS274Z9/AuvWMdtLRERERBVWuAPfHgDSPFy/BkB3P/c1VhStoZRarJQqVErtVUq9oZSqaesoiazKzZVsbTCBb5MmcpmV5X/blBRgyBCgXj2PNy9eDDz7LHDVVcCFF7rckJgogW5xcel1U6YA9esDF18c8NCJiIiIiKJZuEudGwHI9nB9FgB/HYFaOS+/AvAmgAcBnATgaQBtAZzr6U5KqXEAxgFA8+bNMX/+fMuDtsuRI0ci+vgUOrW2bcMAAGtzc7E3wNe42Z496A5g6Zw5OOojgK65YwdOXr8em4YPx04Pj7VvXzXcfnsfNGmicOGFyzB/fmmQ26xJE3TPycHyDz7AkZYt8eeMGTj166+ROWYMNi9dGtC4iezAz0eKJnw/UjTh+5GiSSy/H8Md+AbDyE5P01o/7vx+vlKqCoAXlVLdtNbr3O+ktZ4KYCoAnHTSSXrIkCFhGawn8+fPRyQfn0Lop58AAN1HjUJ31/mzVji7LQ/o2FEys968JtX+ne++G507dChz0549wI03SgL6t9+A/v3d9nP88cDzz+Oko0cxv04dDF65EigsRJunnkKbnj0DGzeRDfj5SNGE70eKJnw/UjSJ5fdjuEuds+E5s+stE+zK6Poz1+36X5yXfYIYF1FwjDV825lemas8o9TZX4OrlBQgIQFwC3oPHJDuzf/+Kw2f+/f3cN+2baUcOzW1tKnVKacADHqJiIiIqAILd+C7BjLP1113AGtN3NeXkoBGRGSHjAwgPh5o2TLwfTRuLJe+At/sbAlax44tc/XBg9LnauNGYOZMYPBgz3cHACQlAQ4HGvzzD7BhA5taEREREVGFF+7AdyaAgUqpjsYVSqkOAAY5b/NlDoB8ACPcrh/pvPSwOClRmGRkSDbVx5q6fpkJfH/6SRpTuSxjdPgwcNZZwOrVwHffAcOG+XmcpCRg3z50fOcdoEED4KKLAh8zEREREVEMCHfg+y6AbQBmKKXOVkqNBTADwL8AphgbKaXaK6WKlFLGXF5orQ8AeAHAjUqp55VSZyilHgTwOICPXZdIIgq7YJcyAoDatYFq1YD9+71vk5ICNGsGDBgAADh6VGLgpUuBL7+UANgv5/zhehs2SNvnmmyKTkREREQVW1gDX611LoDTAWwE8CmAzwBsBXC61vqIy6YKQBUP43sawP0ALgIwG8BNAF4BcENoR07khx2Br1Iyz9dbxrewEJgzBxg9GoiLQ34+cO65gMMBfPIJcN55Jh+nSxcJngFg3LjgxkxEREREFAPC3tVZa70dwPl+ttkGCX7dr9cAXnN+EUWHwkIgMzP4wBeQcmdvge+CBTKZNzkZhYVSofzLL8AHHwCXXWbhMZQCLr4Y+1esQJMenqbcExERERFVLLG0nBFRdNq5EygpCa6js6FxY++lzikpQLVqKDhtOK64XJpYvfUWcM01ATzOG28gbf58DAlmrEREREREMSLcc3yJKh5jKSM7Mr7eSp21BmbORP7g0zHs7Dr45hvgf/8Dbr45+IckIiIiIqromPElCpadga+3UucNG4DNm/HMgbuwIk8aWV18cfAPR0RERERUGTDwJQqWEfi2bRv8vho3BrKypHQ6rrQgY/XzKegJYG71MUidC/TrF/xDERERERFVFix1JgpWRgbQogVQo0bw+2rcWNbpPXQIgMS/Tz4JZH+ago21emPGX+0Y9BIRERERWcTAlyhYdixlZGjSRC4PHEBurnRu
nvTUAQxWC9DxjmS0aGHPwxARERERVSYMfImCtX27PR2dAcn4AtiVdgCDBgHffw9Mu2wO4nQJ4s9NtucxiIiIiIgqGQa+RMHQWgJfuzK+zsD3nqv3Y9s2YNYsYFRRipRSs8aZiIiIiCggbG5FFIy9e4G8PNsD37oFB7D4L+CEjgXAxT9JzXMcz1MREREREQWCR9JEwbBzKSMAizfLHN8Lhh7ACScASE0FcnKAZJY5ExEREREFioEvUTBsDHy1Bu59pj6KEYfTEpxr+aakSLfoM84Iev9ERERERJUVA1+iYNgY+M6YASxYFIeCOo1QLWe/RMIzZwLDhgG1agW9fyIiIiKiyoqBL1Ewtm8H6tUD6tcPajdFRcBDDwEnnADUaNMEOHAAWLsW2LoVGDvWpsESEREREVVODHyJgmHTGr4ffACsXw+88AKgGjeWwDclRW4cMybo/RMRERERVWYMfImCYUPgm5sLPPEEcOqpwNlnQzo7798vgW+/fkCrVvaMlYiIiIiokmLgSxQMGwLf118Hdu8GXnkFUAoS+G7bBixaxG7OREREREQ24Dq+RIHKyQEOHgwq8N23D3j5ZeCccyTjCwBo0gQ4fFi+Z+BLRERERBQ0ZnyJAmVDR+dnngGOHpW5vf9p3FguW7cG+vQJfHxERERERASAgS9R4IzAt127gO6+ZQvwzjvAdddJN+f/GIHvmDHO2mciIiIiIgoGA1+iQG3fLpcBZnwfeQSoWhV48km3G1q0kEsuY0REREREZAsGvkSBysgAqlUDmje3fNdly4CvvgLuvhto2dLtxpEjge+/B0aNsmecRERERESVHJtbEQUqI0PKnOOsnT/SGnjgAelhdd99HjaIj5duV0REREREZAsGvkSBCnApozlzgHnzgIkTgXr1QjAuIiIiIiIqg6XORIEKIPDNyQFuugno0gW48cYQjYuIiIiIiMpgxpcoEPn5wK5dljs633MPsGMHsGCBTA8mIiIiIqLQY8aXKBA7dsilhYzvTz8B770H3HsvMHBgiMZFRERERETlMPAlCoSxhq/JwPfgQeD664Hu3YGnngrdsIiIiIiIqDyWOhMFwmLge9ddwO7dskpRjRohHBcREREREZXDjC9RIDIyAKWANm38bvrjj8BHHwEPPgj07x/6oRERERERUVkMfIkCkZEBtGrlt0NVVhZwww3AiScCjz8eprEREREREVEZLHUmCkRGhqmOzrffDuzfD8yezS7ORERERESRwowvUSBMrOH7/ffAZ58Bjz4K9OkTpnEREREREVE5DHyJrCopAf7912fgu38/cOONEvA+/HAYx0ZEREREROWw1JnIqt27gcJCn4HvLbcA2dnAr78CVauGcWxERERERFQOA18iq/wsZfTFF8DXXwPPPQf07BnGcRERERERkUcsdSayykvgqzXw1lvAlVcCp5wC3H9/BMZGRERERETlMPAlssoIfF26OhcWAjffDNx6K3DWWcDPPwPxrKcgIiIiIooKDHyJrMrIABo2BOrWBQAcOACMGAG88w7w4IPSzdl5ExERERERRQHmpIis2r79vzLndeuA5GRp8vzJJ1LmTERERERE0YWBL5FVGRnA8cfjp5+Aiy8GatYE/vgDGDgw0gMjIiIiIiJPWOpMZEV2NvS6dVhyNAGjRwMdOwJLlzLoJSIiIiKKZqYyvkqpgQBGAhgIoBWAmgD2A9gA4A8AP2its0M1SKKo8dNPUMXFuGPuGJxznpQ3164d6UEREREREZEvPjO+SqmrlVKrASwEcBeAWgA2AVgCIBvAyQDeA7BTKfWRUuq4EI+XKKKOfJmCPWiGXtcPwDffMOglIiIiIooFXjO+SqlVAJoC+ATAVQD+1lprD9vVBzAGwOUA1iql/k9r/VWIxksUOYWFqPLLHMzGuXjksTjEcaIAEREREVFM8FXq/D6AKVrrPF870FofAvAZgM+UUr0AtLBxfERRo9ixADXzDmJ7r2TXJXyJiIiIiCjKeQ18tdYTre5Ma/0PgH+CGhFRlPr3rZloiWroec/wSA+FiIiIiIgsCLhYUylVSynFGY5UOWiN6nNTkFr1dIy+qE6kR0NERERERBZYDnyVUu2UUg4AhwHkKKVS2dSKKrrsxRvQ8shmHBycjOrVIz0aIiIiIiKyIpCM7zuQZYx6ARgMoATAVDsHRRRtVj+fAgDo+dCYCI+EiIiIiIis8tXV+Xyt9XQPN50MoLXR9Eop9QyA70I0PqKI0xqo9XsKNtTqja7D2dWKiIiIiCjW+Mr4TlJKzfJQxpwB4EIAUErFATgHwLaQjI4oCqycewB9ji7AkaHJkR4KEREREREFwFfg2xXAZgD/KKUeU0pVc15/LyQo3gcgG8CVzuuIKqS/X5iDKihB13sY+BIRERERxSKvga/W+rDW+g4AiQBGAkhTSg3XWv8O4HgA1wC4HMDxWutfwjJaIjtobXrT3FygwZ8pOFizBeqc1i+EgyIiIiIiolDx29xKa/2P1noQgJcAfKaU+hJAVa31j86v/SEfJZFdli4FGjQAFi82tfn0LwpwRtFPyBs2BogLePUvIiIiIiKKINNH8lrr9yHlzzkA1iml7nTO8SWKHZ9/DuTkADfdBBQX+9185QQH6iMHza9nmTMRERERUazyGrgqpRoopT5QSu1SSmUrpWYDaKa1Hgcpfb4CwEql1MBwDZYoKFoDKSlA8+bA338Dkyf73Hz9euC4NSkojK8BNfyM8IyRiIiIiIhs5ytj+x6AkwDcAeAqAArAbKWU0lovAdDfuc0spdS7IR8pUbDWrQPS04EnnwSGDwcefRTYs8fr5h+8r5GMFBQPGQbUqhW+cRIRERERka18Bb5nALhXa/211joFkuE9DtLYClq8CaA7gGred0MUJVJS5HLMGGDSJODoUeCBBzxuWlgILP5gLTpiK2pcwDJnIiIiIqJY5ivw3QEpaTaMBlAMYLfrRlrrPVrrq0MwNiJ7paQAffoAbdoAXbsC994LfPwx8Oef5TadNQs4NcslUCYiIiIiopjlK/C9E8A1SqmDSqm9AN6FZICPhGVkRHbatw9YuBBIdsnePvII0LYtcMstQFFRmc3ffx84v2oKdN9+QOvWYR4sERERERHZydc6vr9CSpsvAzAOQGet9cRwDYzIVrNnS3OrsWNLr6tdG5gwAVi1Cnjrrf+u3rkTWDprH/oVLoIayzJnIiIiIqJY53M5Iq31Qa31bK31D1rr7eEaFJHtUlKAVq2Avn3LXn/uucDIkcDjjwO7dgGQ6ueRejbioMtmiImIiIiIKCb5Ws6or7fbfNynhlLqhOCGRGSz/Hzg559lrq5SZW9TCnjjDSAvD7j/fpSUAB98AFzbJEVKnPv0icyYiYiIiIjINr4yvg6l1Eyl1EillM/MsFKqnVLqYQBbAbATEEWXP/4Ajhzxnr3t3Bm4/35g2jRMu+EP/LslH6ce8RIoExERERFRzIn3cVtXAM8AmAEgRym1CMA/APYByAfQEEBHAAMAJECC3nu01p+HdMREVqWkADVrAsOGed/moYeQ9+6n6PvBLXj2tJd
Q9Q8fgTIREREREcUUr4Gv1nongGuVUg8CuAbACAB3A6jpstlWAA4ADwL4WWutQzhWIuu0lsD3jDMk+PViz+FaeCB/Ij7COei+4QbZ9vTTwzhQIiIiIiIKFZ8lzACgtd6rtX5Ja3261ro2gEYAWgGorrU+Xmt9jdb6Jwa9FJXS0oCMDJ/Z2+Ji4PLLga+OjUVO4lmI270LOPNMn4EyERERERHFDl+lzh5prQ+GYBxEoTFzplyO8T71/Nlngd9+A957T6HekDeAkxZKJExERERERBWC5cCXKKakpAAnnQS0bOnx5t9+A556CrjySuDaawGo44EDB4A4v8UQREREREQUI3h0TxXXnj3A0qXA2LEeb969WxK7J5wATJ7s0sCZQS8RERERUYXCjC9VXLNmSXMrD/N7i4uByy4DcnIk61u7dgTGR0REREREYcHAlyqulBSgbVugV69yNz31FDBvHvDhh0CPHhEYGxERERERhQ1rOqliyssDfvlFmlr9V8MsfvlFGlr93//JFxERERERVWymAl+l1OdKqcRQD4bINvPmAUePlitz3rsXuOIKoHt34K23IjQ2IiIiIiIKK7MZ34EA5iul1iilbldKNQjhmIiCl5IiE3eHDi1z9bRpwL59wOefA7VqRWhsREREREQUVqYCX611RwBnAdgA4FUAO5VSHyqlBoZycEQB0VoC3+HDgRo1ytw0YwZw4onyRURERERElYPpOb5a65+11ucBaAfgRQBDASxQSv2llLpRKVUnVIMksuTvv4EdO8qVOe/fD/z5J3D22ZEZFhERERERRYbl5lZa691a62cAnAogFUAvAG8DyFRKvaKU4sIwFFkpKdLQavToMlf/+CNQUgKcc05khkVERERERJFhOfBVSp2ulPoawFYAPQG8DgmCJwG4EcAnto6QyKqUFGDAAKB58zJX//CDrG7Up09khkVERERERJFhah1fpVRjANcAGAfgeAArIUHuF1rrPOdmi5VSqwG8H4qBEpmSmQksXw4891yZq48elWWMrruu3OpGRERERERUwZkKfAHsBFAC4CsAl2utl3nZbj2AvXYMjMijyZN9r0N05Ihcus3vnTsXOHaM83uJiIiIiCojs4HvwwA+1Fpn+9pIa/03gOOCHRSRV99/D+zeDQwZ4n2biy4CEhLKXDVjBlC/PnDaaaEdHhERERERRR9Tga/W+rVQD4TIlKws4OSTgW+/NX2X4mKZ9jt6NFC1agjHRkREREREUclUcyul1OtKqU+93PapUuoVe4dF5EVWFtC4saW7LFwoSxmxmzMRERERUeVktqvzWAC/eLntZwDn2DIaIn+ysoBGjSzd5YcfgGrVgJEjQzMkIiIiIiKKbmYD39YAtnu5bYfzdqLQKioCDh2yFPhqLfN7hw0D6tYN4diIiIiIiChqmQ18swF08nJbJwBH7BkOkQ/Zzt5qFgLfNWuALVvYzZmIiIiIqDIzG/j+CuBRpVRz1yudPz8MYK7dAyMqJytLLi0EvjNmyOXYsSEYDxERERERxQSzyxk9BmAZgE1KqR9RWt48BkAegEdDMzwiFwEEvj/8IE2gW7YMzZCIiIiIiCj6mcr4aq23AegP4AcAQwHc6bz8HsAArfXW0AyPyIXFwHfnTmD5cnZzJiIiIiKq7MxmfI3g96rQDYXIDyPwNbmc0cyZcsn5vURERERElZvZOb5EkWcx4/vDD0CXLsAJJ4RuSEREREREFP1MZ3yVUs0AXAqgK4AabjdrrfV1dg6MqJwDBwClgPr1/W566BAwbx5w551yFyIiIiIiqrxMBb5Kqa4AFjm3rw1gP4BGAKpAljo6FKoBEv0nKwto2BCI81+oMGcOUFjI+b1ERERERGS+1PkVSFfn5gAUgFEAagK4HsBRAOeGZHRErrKyTJc5z5gBNGsmHZ2JiIiIiKhyM1vq3B/AjQDynT/Haa2LAHyglGoKYAKkyzNR6JgMfAsKgNmzgYsuAqpUCcO4iIiIiIgoqpnN+NYBkKW1LoGUNTdxuW0ZJDAmCi2Tge/8+UBODrs5ExERERGRMBv4bgPQwvn9BgAXutw2BsBB+4ZE5EVWlqmljH74AahdGxg2LPRDIiIiIiKi6Gc28J0LYLjz+9cAXKOU2qCUWgPgDgAfhGJwRGWYyPiWlMj6vSNGADVrhmlcREREREQU1czO8X0IQHUA0Fp/rZQ6BuBiALUATATwbmiGR+RUXAwcPOg38F2xAti5k2XORERERERUym/gq5SqAuAEAJnGdVrrFAApIRwXUVkHDwJa+w18p00D4uOB0aPDMywiIiIiIop+ZkqdNYDlAPqEeCxE3mVlyaWPwHfPHmDqVODKK01NBSYiIiIiokrCb+Dr7OT8L4DaoR8OkRcmAt///U+WMnrooTCNiYiIiIiIYoLZ5lZTANyplKoWysEQeeUn8N2/H3j7beCSS4DOncM4LiIiIiIiinpmm1vVBXA8gHSl1E8AdkFKoA1aa/2E3YMj+o8R+HqpYZ44ETh6FHjkkTCOiYiIiIiIYoLZwPdhl++v9XC7BsDAl0LHR8b34EHgjTeA888HuncP77CIiIiIiCj6mQp8tdZmS6KJQuPAAbls0KDcTZMmATk5wKOPhndIREREREQUGxjQUmzIypKgt0qVMlcfPgxMmACMHQv06hWRkRERERERUZRj4EuxISvLY5nz22/LTcz2EhERERGRN6ZKnZVSJSjbzKocrXUVX7cTBcVD4JubK0sYjRgB9O8foXEREREREVHUM9vc6mmUD3wbAzgTQHUAH9k4JqLyPAS+774L7NsHPPZYhMZEREREREQxwWxzqyc9Xa+UqgIgBcAhG8dEVF5WFtCx438/5uUBL78MDB0KDBoUwXEREREREVHUC2qOr9a6GMDbAO60ZTRE3rhlfD/4ANi1i9leIiIiIiLyz47mVtUBlO86RGSXkhIgO/u/wLegAHjxRcn0DhkS2aEREREREVH0M9vcqp2Hq6sBSADwIoDldg6KqIxDhyT4dQa+n3wC/PuvzPFVKsJjIyIiIiKiqGe2udU2eO7qrABsAXCLXQMiKicrSy4bNUJREfDCC9LF+cwzIzssIiIiIiKKDWYD32tRPvDNA5ABYJlzri9RaLgEvl98AaSnA6+/zmwvERERERGZY7ar80chHgeRdy6B79SXgO7dgeTkyA6JiIiIiIhih6nmVkqpLkqp07zclqSU6mzvsIhcOAPfw9UaY9Ei4JxzmO0lIiIiIiLzzHZ1ngDAW45tDIDXbRkNkSfOwPeP1Y1QXAyMGBHh8RARERERUUwxG/ieBMDh5TYHgP72DIfIgwMHAACzFjZE3brAKadEeDxERERERBRTzAa+dSHNrDwpBFDfnuEQeZCVBV2vHubMjcewYUDVqpEeEBERERERxRKzgW86gGFebjsdstwRUWhkZaGwbiNkZLDMmYiIiIiIrDMb+H4C4C6l1C1KqeoAoJSqrpS6BcCdAD42+4BKqbZKqW+VUoeUUjlKqe+UUu2sDlwp9aBSSiul/rR6X4oxWVk4qBoBYOBLRERERETWmV3H91
XIPN5JACYqpbIANIIEztMBvGRmJ0qpWgB+B5AP4GrI2sDPApinlDpRa51rcj8dATwKYK/J8VMsy8rCzmON0LkzcNxxkR4MERERERHFGrPr+BYDuEApdTqA4QAaA9gP4Bet9XwLj3cDgI4AumqtNwOAUmoVgE0AxgN4zeR+JgP4DEBXs78Dxa6SA1nYfLAdRlwa6ZEQEREREVEsshQ0aq1/h2RsAzUWwGIj6HXuc6tSagGAs2Ei8FVKXQagL4BLAXwXxFgoRhTtzcK+4kYscyYiIiIiooCYmuOrlBqjlLrVy223KKXOMvl4PQCkebh+DYDuJsbRELJm8P1a6yyTj0mxTGtUOZSFQ3GNMGRIpAdDRERERESxyGxzq8cA1PZyW03n7WY0ApDt4fosAA1N3P8VABsBfGTy8SjW5eSgii5G/eMaoU6dSA+GiIiIiIhikdlS5xMArPRy29+QRlMhpZRKBHAVgL5aa23hfuMAjAOA5s2bY/78+aEZoAlHjhyJ6OPHoqNr9uMsALXaFPG5sxnfjxRN+H6kaML3I0UTvh8pmsTy+9Fs4BsHwFu+rS6Aqib3kw3PmV1vmWBXUwC8D2CHUqqB87p4AFWcPx/TWue730lrPRXAVAA46aST9JAI1svOnz8fkXz8WJQyfwUAIOmcE9CRz52t+H6kaML3I0UTvh8pmvD9SNEklt+PZkud/wFwuZfbLgewyuR+1kDm+brrDmCtn/t2A3AjJEA2vgYBGOj8/iaTY6AYsvoPmcrdoW+jCI+EiIiIiIhildmM7/8ATFdKfQPgXQA7ALSGlBCfC+BCk/uZCeBVpVRHrXU6ACilOkAC2Af93Heoh+smAKgC4DYAmz3cTjGsuBhIXy6Bb1zTxhEeDRERERERxSqz6/h+r5S6A8BzAM5zXq0AHAFwu9ba7LJC7wK4FcAMpdSjADSAZwD8Cylllh0r1R7AFgBP/3979x5nV1XfffzzSyZXCLlAAgrkBkmEgFiTCgIKBJCLCCK2Yq1irUD7aq3ah9by2FpLq9W2j4iXPgLl8V5vBCGh1ZCQRE0MSqIlIcAEQkICBCZkQkLul1nPH/sMHCZnZk4yZ845s8/n/XrNa8/svfae38B6MXxnrb1WSummQg0LOz4sIl4Emg5yL2H1Eb/5DQzYVli8e5QjvpIkSZIOTdn7+KaUvhwR3wDOBI4EXgB+mVLadhDP2B4RM8i2JPo2WXi+H/hYh+cE2UhuuVOxlUNz5sCRbMq+GFnOot+SJEmSdKCygy9ASuklYE7xuYg4B7gmpfShMp+xDriqmzZrycJvd886t5zvqb5pzhz4kzGtsONwGDiw1uVIkiRJ6qMOaUQ1Ik6MiJsiYg2wAPj9ypalRrdlCyxZAicd3eo0Z0mSJEk9UnbwjYjhEXFdRCwGmoFP8spqyq/tpfrUoObPzxa3GjfM4CtJkiSpZ7oMvhHRLyIujYgfABuArwHjgK8WmnwspXRrSmlrL9epBjNnDgwbBiMx+EqSJEnqmU6Db0T8H+AZYDZwGfBj4GJgLPApyngHVzoUKWXBd8YM6Le5FY50KyNJkiRJh66rEd+PA2OA/wbGppTel1K6L6XURrYNkdQrHn8c1q6Fiy4CNm1yxFeSJElSj3QVfO8AXgLeDjRHxFci4k3VKUuNbE5h3fCL3pag1anOkiRJknqm0+CbUroWOAZ4H7AUuB5YEhGPAp/AUV/1kjlz4MQTYeKYbbBvn8FXkiRJUo90ubhVSmlXSul7KaX2d3tvBPYDf0P2ju/nIuIPI2Jw75eqRrB7NyxYUJjm3NqanTT4SpIkSeqBsrczSiltSCn9S0rpFOBNZCs7TwK+Rbbis9RjixbBjh0GX0mSJEmVU3bwLZZSWppS+gjZ/r1XAQsrWZQa15w5MGAAnHceBl9JkiRJFdHUk5tTSnvJtjn6cWXKUaObMwfOOgsOP5xXgq/bGUmSJEnqgUMa8ZV6w5NPwvLlcNllhRObNmVHR3wlSZIk9YDBV3Vj5szseNVVhRPtI74jR9akHkmSJEn5YPBV3Zg5E6ZNg/HjCydaW2HoUBjsouGSJEmSDp3BV3Vh/Xr41a+KRnshC75Oc5YkSZLUQwZf1YW77sqOBl9JkiRJlWbwVV2YORNOOQUmTy46afCVJEmSVAEGX9Xcc8/BokUdRnshC75uZSRJkiSphwy+qrm774aUSgTfTZsc8ZUkSZLUYwZf1dzMmTBpUjbV+WUpOdVZkiRJUkUYfFVTmzbBggXZaG9E0YUdO2DPHoOvJEmSpB4z+KqmZs2C/fvh3e/ucKG1NTsafCVJkiT1kMFXNXXnnTB+PLzxjR0uGHwlSZIkVYjBVzWzZQvMnQvveleHac5g8JUkSZJUMQZf1cy998LevSVWc4ZXgq/bGUmSJEnqIYOvKuuFF8puOnMmvPa1cMYZJS5u2pQdHfGVJEmS1EMGX1XO/PkwZgwsWtRt0+3b4ac/hSuvhH6leqFTnSVJkiRViMFXlfPDH2b77371q902/clPYOfOTqY5QxZ8Bw+GIUMqW6MkSZKkhmPwVWWklL20GwF33QUbN3bZfOZMOOooeMtbOmnQ2uporyRJkqSKMPiqMn77W3jmGbjhBtizB775zU6b7tqVZeQrr4Smpk4aGXwlSZIkVYjBV5Uxe3Y22nvDDXDWWXDbbdkocAn33QfbtnUxzRkMvpIkSZIqxuCrypg9O1ueecwYuP56ePxxWLCgZNOZM2HECDjvvC6e19rqVkaSJEmSKsLgq5575hlYtgze8Y7s63e/G0aOhFtvPaDpnj0waxZcfjkMHNjFMzdtcsRXkiRJUkUYfNVz996bHduD75AhcM018OMfQ0vLq5ouWAAvvtjNNGdwqrMkSZKkijH4qudmz4bx42Hq1FfOXXcd7N0L3/jGq5rOnAmHHw5ve1sXz9u5M1sBy+ArSZIkqQIMvuqZHTvg/vuz0d6IV86fdFK2V9Ftt0FbGwD798Pdd8Pb355t0dup1tbsaPCVJEmSVAEGX/XMvHnZ6Gz7NOdi118Pq1fD/PkAPPhgtr3vFVd080yDryRJkqQKMviqZ2bPhmHD4JxzDrx21VXZysyFRa7mzctOX3BBN880+EqSJEmqIIOvDl1bW7aw1cUXl16iefDgbJGru++G559n7lz4nd+B0aO7eW578HU7I0mSJEkVYPDVoVu2DJ57rvQ053bXXgv79rH7a19nyZIyRnsh28oIHPGVJEmSVBEGXx262bOhXz+49NLO27zudXDOOez72u3s29vGhReW8VynOkuSJEmqIIOvDt2sWXDmmd1PSb7+eg577kkuHTCPs88u47mtrdnU6aFDK1KmJEmSpMZm8NWhWbcOHnqo62nO7d71Llr7H8UnRtzKkCFlPLu1NRvtLd4eSZIkSZIOkcFXh+bee7NjGcH3uc2DuGP/Bzlz0yzYsKH7Z7cHX0mSJEmqAIOvDs3s2XDCCdk7vN2YNw9u51r6t+2Dr3+9+2cbfCVJkiRVkMFXB2/bNpg/PxvtLWM68rx5sGnUZNJ558Htt2fbIHWltdWtjCRJkiRVj
MFXB2/uXNizBy6/vNumKWXNzz8f4vrrYe1a+MlPur5p0yZHfCVJkiRVjMFXB2/2bBg+nHKWaH70UXj2WbJtjK68EiZMgL/6K9i7t/ObnOosSZIkqYIMvjo4bW3wX/8Fl1wCAwZ023zevOx4wQVkWxTdckuWhm+5pfQNu3bBjh0GX0mSJEkVY/DVwfn1r6GlpbxtjMimOZ9wQjbQC2T3veMd8OlPw9NPH3jD5s3Z0eArSZIkqUIMvjo4s2ZB//7ZiG839u6FhQsL05yL3XIL7N8PN9xw4E2trdnR4CtJkiSpQgy+OjizZ2fv9o4c2W3TX/0qWwD6ggs6XJgwAW68EX7wA7j//ldfM/hKkiRJqjCDr8q3di08/HDZ05znzct2O5oxo8TFv/5rmDgR/uzPshWi27UHX7czkiRJklQhBl+Vr30booN4v3f69E4GhwcPhi9/GZqb4eabXzm/aVN2dMRXkiRJUoUYfFW+NWuywDppUrdNt27Npjof8H5vsUsvhXe+E266Cdavz8451VmSJElShRl8Vb7nn4cxY7L5y91YuDBbv+qA93s7uvlmSAk+/vHs69ZWaGqCww/vcbmSJEmSBAZfHYyWliz4lmHePBgyBM48s5uG48fDJz8JM2fCnDlZ8B01qqxwLUmSJEnlMPiqfAcRfOfOhbe+FQYNKqPxDTdk06c/8hHYsMFpzpIkSZIqyuCr8pUZfJ9+Gh57rJv3e4sNGpQtdPX449l2SQZfSZIkSRVk8FV5Uio7+M6blx3LDr4AF10EV12VfR+3MpIkSZJUQQZflWfr1my/3TKC79y5WbNTTjnI7/GFL8DQoXDMMYdWoyRJkiSV0FTrAtRHtLRkx26Cb0rZiO8FF0C/g/2zytix8OCDjvhKkiRJqiiDr8pTZvBdsSJr2u02Rp05+eRDvFGSJEmSSnOqs8pTZvBtf7/3kIOvJEmSJFWYwVflKTP4zp0LU6bA8cdXoSZJkiRJKoPBV+VpD76jR3faZPdu+PnPD3I1Z0mSJEnqZQZflaelBUaMgIEDO23ywAOwY4fTnCVJkiTVF4OvylPGHr6/+EV2fMtbqlCPJEmSJJXJ4KvylBF8Fy+GqVNh1Kgq1SRJkiRJZTD4qjzdBN+2NliyBM46q4o1SZIkSVIZDL4qTzfBd+VK2LLF4CtJkiSp/hh81b19+2DTpi6D76JF2fHss6tUkyRJkiSVyeCr7m3aBCl1GXwXL4ZjjoEJE6pYlyRJkiSVweCr7rXv4dtN8D3rLIioUk2SJEmSVCaDr7rXTfB95hlYu9ZpzpIkSZLqk8FX3WsPvkcfXfLy4sXZ0YWtJEmSJNUjg6+6182I7+LFMHQovOEN1StJkiRJkspl8FX3WlqgqQlGjCh5edEiOP10GDCgumVJkiRJUjkMvupeSwuMHg39Duwu27bBQw85zVmSJElS/TL4qnstLZ1Oc/7Vr2D/foOvJEmSpPpl8FX3ugi+ixdnWxi9+c1VrkmSJEmSymTwVfe6CL6LFsGpp8Lw4VWuSZIkSZLKZPBV9zoJvvv3wwMPOM1ZkiRJUn0z+KprO3ZkK1iVCL4rVsBLL8HZZ9egLkmSJEkqk8FXXdu4MTuWCL6LFmVHR3wlSZIk1TODr7rW0pIdSwTfxYvh2GNh7Ngq1yRJkiRJB8Hgq649/3x27CT4nn12tqqzJEmSJNUrg6+61smI77p1sH6905wlSZIk1T+Dr7rWHnxHj37V6cWLs6PBV5IkSVK9M/iqay0tcNhh2UeRxYvh8MPh9a+vUV2SJEmSVCaDr7rWyR6+ixfDGWdAU1MNapIkSZKkg2DwVddKBN+tW2H5cqc5S5IkSeobDL7qWong+8AD0NaWregsSZIkSfXO4KuulQi+ixdDv35w+uk1qkmSJEmSDoLBV51ra4ONG0sG39NOg2HDalSXJEmSJB0Eg6869+KLsG/fq4Lvvn3ZVGff75UkSZLUVxh81bn2PXyLgu9DD8H27b7fK0mSJKnvMPiqcyWC7+LF2dERX0mSJEl9hcFXnSsRfBctgrFj4bjjalSTJEmSJB0kg6861yH4ppSN+DrNWZIkSVJfYvBV59qD71FHAfDUU/Dss05zliRJktS3GHzVuZYWOPJIaGoCsmnOYPCVJEmS1LcYfNW5lpZXvd/729/CkCFwyik1rEmSJEmSDpLBV53rEHxXroSTToL+/WtYkyRJkiQdJIOvOtch+D7yCJx8cg3rkSRJkqRDYPBV54qC79atsH69wVeSJElS32PwVWl79sDmzS8H38cey05PnVrDmiRJkiTpEBh8VdoLL2THQvB95JHsS0d8JUmSJPU1Bl+V1r6H79FHA9nCVoMGwYQJNaxJkiRJkg6BwVeltQffohHf173OFZ0lSZIk9T0GX5VWIvg6zVmSJElSX2TwVWlFwXf7dli71uArSZIkqW8y+Kq0lhYYOBCOOMIVnSVJkiT1aQZflda+h28EK1dmpxzxlSRJktQXGXxVWnvwJXu/d8AAOOGEGtckSZIkSYfA4KvSOgTfKVOgqanGNUmSJEnSITD4qrQOwddpzpIkSZL6KoOvDpQSPP88jBnDzp3w5JMubCVJkiSp7zL46kDbtsGuXTBmDI89luVgR3wlSZIk9VUGXx2oaA/fRx7JPjX4SpIkSeqrDL46UIfg29QEJ55Y25IkSZIk6VAZfHWgDsF30iQYOLC2JUmSJEnSoTL46kBFwXflShe2kiRJktS3VT34RsTxEXFnRGyJiK0RcVdEjC3jvukRcVtEPBYROyJiXUR8NyImVKPuhlIIvruGjWb1at/vlSRJktS3VTX4RsRQYD7wOuAa4P3AJGBBRBzWze1XA1OBLwGXAH8DvBFYGhHH91rRjailBY44glXrBtPWZvCVJEmS1Lc1Vfn7XQtMBKaklJ4AiIjlwOPA9cAXurj38ymljcUnImIxsKbw3E/1SsWNqKXFFZ0lSZIk5Ua1pzpfDjzQHnoBUkprgMXAFV3d2DH0Fs49BWwEjq1wnY2tKPj26weTJ9e6IEmSJEk6dNUOvlOBh0ucXwkc9LhiRJwEjAEe7WFdKlYIvitXZtsYDRpU64IkSZIk6dBVO/iOAjaXON8KjDyYB0VEE/A1shHfO3peml5WNOLris6SJEmS+rpqv+NbSV8BzgTenlIqFaYBiIjrgOsAjj76aBYuXFid6krYtm1bTb9/Wfbv55wXXmDNtp2sWpWYPn0dCxeuqXVV6gV9oj+qYdgfVU/sj6on9kfVk77cH6sdfDdTemS3s5HgkiLic2Rh9pqU0n1dtU0p3QbcBjB9+vR07rnnll1spS1cuJBafv+ybNwIbW0MGfe7tLUFl1wyjnPPHVfrqtQL+kR/VMOwP6qe2B9VT+yPqid9uT9WO/iuJHvPt6OTgUfKeUBEfBL4BPCRlNK3K1ib4OU9fNfuHAO4orMkSZKkvq/a7/jOAs6IiIntJyJiPHBW4VqXIuIvgH8CPplS+kpvFdnQCsG3uXUM/frBlCk1rkeSJEmSeqjawfd2YC1wT0RcERGXA/cA64Fb2xtFxLiI2BcRnyo6
      [... base64 "image/png" payloads elided: matplotlib "display_data"
       outputs holding the training-curve figures rendered by the plotting
       cell below ...]
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "logger.setLevel(3)\n", + "matplotlib.rcParams.update({'font.size': 16})\n", + "plt.figure(figsize=(16,12))\n", + "plt.plot(train_result['epoch'], train_result['train_loss'], color='b', label='train')\n", + "plt.plot(train_result['epoch'], train_result['val_loss'], color='r', label='va')\n", + "plt.title('CXE Loss')\n", + "plt.ylabel('CXE')\n", + "plt.xlabel('epoch')\n", + "plt.grid(True)\n", + "plt.show()\n", + "\n", + "plt.figure(figsize=(16,12))\n", + "plt.plot(train_result['epoch'], train_result['train_acc'], color='b', label='train')\n", + "plt.plot(train_result['epoch'], train_result['val_acc'], color='r', label='va')\n", + "plt.title('Classification Accuracy')\n", + "plt.ylabel('Accuracy (%)')\n", + "plt.xlabel('epoch')\n", + "plt.grid(True)\n", + "plt.show()\n", + "\n", + "plt.figure(figsize=(16,12))\n", + "plt.plot(train_result['epoch'], train_result['lr'], color='b')\n", + "plt.title('Learning Rate')\n", + "plt.ylabel('lr')\n", + "plt.xlabel('epoch')\n", + "plt.grid(True)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = TDNNXVec(16, 2, 32, 16, 100)\n", + "state_dict=torch.load(\"./tdnn_xvec/model_ep0099.pth\")\n", + "model.load_state_dict(state_dict['model_state_dict'])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the Trials dataset with different speakers than the train data.\n", + "#trial_data = IVDataset(num_spks=20, seed=4321)\n", + "trial_loader = DataLoader(train_data, batch_size=100, shuffle=True)\n", + "# sample enrollment data and compute x-vectors\n", + "x_e, y_e = next(iter(trial_loader))\n", + "z_e = model(x_e, infer=True).detach().cpu().numpy()\n", + "# sample test data and compute x-vectors\n", + "x_t, y_t = next(iter(trial_loader))\n", + "z_t = model(x_t, infer=True).detach().cpu().numpy()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.12987226757966816\n" + ] + } + ], + "source": [ + "from hyperion.utils.math import cosine_scoring\n", + "from hyperion.np.metrics import compute_eer\n", + "scores = cosine_scoring(z_e, z_t)\n", + "key = (y_e[:, None] - y_t[None,:])==0\n", + "tar_scores = scores[key==1]\n", + "non_scores = scores[key==0]\n", + "eer = compute_eer(tar_scores, non_scores)\n", + "print(eer)" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "488a239b304e646027d6710c3377746db4487e56624448f35f81edd765904a6d" + }, + "kernelspec": { + "display_name": "Python 3.8.12 ('py38_pt101_cu112')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/requirements.txt b/requirements.txt index 7a1ae1b3..1e1aea9b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,10 +12,15 @@ memory_profiler gdown fairscale==0.4.4 tensorboard>=2.5.0 -yapf jsonargparse>=3.5.0 wandb>=0.10.30 librosa>=0.8.1 +isort black 
diff --git a/requirements.txt b/requirements.txt
index 7a1ae1b3..1e1aea9b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,10 +12,15 @@ memory_profiler
 gdown
 fairscale==0.4.4
 tensorboard>=2.5.0
-yapf
 jsonargparse>=3.5.0
 wandb>=0.10.30
 librosa>=0.8.1
+isort
 black
 twine
 wheel
+transformers>=4.16.2
+sentencepiece>=0.1.97
+loralib
+lhotse
+
diff --git a/setup.py b/setup.py
index 9780586d..e1fb35cc 100644
--- a/setup.py
+++ b/setup.py
@@ -15,15 +15,26 @@
 # limitations under the License.
 #
 
-import setuptools
 from pathlib import Path
 
+import setuptools
+
 project_root = Path(__file__).parent
 
-with open(project_root / "apps.txt") as f:
-    apps = f.read().splitlines()
+# with open(project_root / "apps.txt") as f:
+#     apps = f.read().splitlines()
 
-apps = [str(project_root / "hyperion" / "bin" / app) for app in apps]
+# apps = [str(project_root / "hyperion" / "bin" / app) for app in apps]
+binaries = (project_root / "hyperion" / "bin").glob("*.py")
+console_scripts = []
+for binary in binaries:
+    stem = binary.stem
+    script_name = stem.replace("hyperion_", "").replace("_", "-")
+    if script_name[0] == "-":
+        continue
+    module = f"hyperion.bin.{stem}:main"
+    console_script = f"hyperion-{script_name} = {module}"
+    console_scripts.append(console_script)
 
 with open(project_root / "requirements.txt") as f:
     requirements = f.read().splitlines()
@@ -77,10 +88,22 @@
         "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
         "License :: OSI Approved :: Apache Software License",
         "Operating System :: OS Independent",
     ],
     python_requires=">=3.7",
     install_requires=requirements,
-    scripts=apps,
+    entry_points={
+        "console_scripts": console_scripts,
+    }
+    # entry_points={
+    #     "console_scripts": [
+    #         "hyperion-prepare-data = hyperion.bin.prepare_data:main",
+    #         "hyperion-train-wav2xvector = hyperion.bin.train_wav2xvector:main",
+    #     ]
+    # },
+    # scripts=apps,
 )
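The new setup.py derives console-script entry points from the file names in `hyperion/bin` instead of installing the scripts directly. A standalone sketch of that name mangling follows; `to_entry_point` is a hypothetical helper that just mirrors the loop in the diff, applied to one illustrative file name:

```python
from pathlib import Path

def to_entry_point(path):
    """Mirror setup.py's mangling: strip .py, turn underscores into dashes."""
    stem = Path(path).stem
    script_name = stem.replace("hyperion_", "").replace("_", "-")
    return f"hyperion-{script_name} = hyperion.bin.{stem}:main"

print(to_entry_point("hyperion/bin/train_xvector_from_wav.py"))
# hyperion-train-xvector-from-wav = hyperion.bin.train_xvector_from_wav:main
```

After `pip install -e .`, each generated entry point lands on `$PATH` as a `hyperion-*` executable, replacing the old `scripts=apps` mechanism.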
diff --git a/tools/install_k2_from_src.sh b/tools/install_k2_from_src.sh
new file mode 100755
index 00000000..0bd2e972
--- /dev/null
+++ b/tools/install_k2_from_src.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+if [ $# -ne 2 ]; then
+    echo "Usage: $0 <env_name> <cuda_root>"
+    echo " e.g.: $0 hyperion /usr/local/cuda"
+    exit 1
+fi
+
+env_name=$1
+CUDA_ROOT=$2
+
+eval "$(conda shell.bash hook)"
+conda activate $env_name
+
+#module load cuda10.2/toolkit
+#module load gcc
+
+#conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch
+
+CUDA_VERSION=$(echo "import torch; print(torch.version.cuda)" | python)
+CUDNN_VERSION=$(echo "import torch; print(torch.__config__.show())" | python | awk '/CuDNN/ { print $NF}')
+
+# Install cmake
+echo "Installing CMAKE"
+conda install -c anaconda cmake
+echo "Installing NVIDIA CUDA=$CUDA_VERSION CUDNN=$CUDNN_VERSION"
+conda install -c nvidia cudnn=$CUDNN_VERSION cudatoolkit=$CUDA_VERSION
+
+#conda install -c k2-fsa -c conda-forge kaldilm
+
+echo "Download k2"
+git clone https://github.com/k2-fsa/k2.git
+cd k2
+
+ENV_PATH=$(which python | sed 's@/bin/python$@@')
+NVCC=$CUDA_ROOT/bin/nvcc
+CUDNN_LIBRARY_PATH=${ENV_PATH}/lib
+CUDNN_INCLUDE_PATH=${ENV_PATH}/include
+CUDA_TOOLKIT_DIR=$ENV_PATH
+export PATH=$CUDA_ROOT/bin:$PATH
+
+export K2_CMAKE_ARGS="\
+-DCMAKE_BUILD_TYPE=Release \
+-DCMAKE_CUDA_COMPILER=$NVCC \
+-DPYTHON_EXECUTABLE=$(which python) \
+-DCUDNN_LIBRARY_PATH=$CUDNN_LIBRARY_PATH/libcudnn.so \
+-DCUDNN_INCLUDE_PATH=$CUDNN_INCLUDE_PATH \
+-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_ROOT"
+
+export K2_MAKE_ARGS="-j6"
+
+echo "Compile k2 with CMAKE_ARGS=$K2_CMAKE_ARGS"
+python setup.py install
+cd -
+
+
+# pip install lhotse
+
+# export OT_CMAKE_ARGS=$K2_CMAKE_ARGS
+# git clone https://github.com/csukuangfj/optimized_transducer
+# cd optimized_transducer
+# python setup.py install
+# cd -
+
+
+# git clone https://github.com/k2-fsa/icefall
+# cd icefall
+# pip install -r requirements.txt
+# export PYTHONPATH=./icefall:$PYTHONPATH
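Once the script has built k2, a quick sanity check (a suggested step, not part of the script) is to confirm the extension imports under the same interpreter and CUDA build as PyTorch:

```python
# Post-install sanity check: run inside the conda env the script installed into.
import torch
import k2  # the import itself fails if the build did not match the active torch/CUDA

print("torch:", torch.__version__, "cuda:", torch.version.cuda)
print("k2 loaded from:", k2.__file__)
```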