From 48b1e4e6523af9af5c63509d803d148d49edcf36 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Mon, 23 Jan 2023 17:25:18 -0500 Subject: [PATCH 01/89] commonvoice speech recognition recipe --- egs/commonvoice/v1/cmd.sh | 28 ++ egs/commonvoice/v1/conf/clsp.conf | 11 + egs/commonvoice/v1/conf/coe_gpu_bigmem.conf | 11 + egs/commonvoice/v1/conf/coe_gpu_long.conf | 13 + egs/commonvoice/v1/conf/coe_gpu_rtx.conf | 11 + egs/commonvoice/v1/conf/coe_gpu_short.conf | 11 + egs/commonvoice/v1/conf/coe_gpu_v100.conf | 11 + egs/commonvoice/v1/conf/reverb_noise_aug.yaml | 35 ++ ...v2vec2xlsr300m_transducer_stage1_v1.0.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage1_v2.0.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage1_v3.0.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage1_v3.1.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage1_v3.2.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage1_v3.3.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage1_v4.3.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage1_v4.4.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage2_v1.0.yaml | 56 +++ ...v2vec2xlsr300m_transducer_stage2_v3.2.yaml | 61 +++ ...v2vec2xlsr300m_transducer_stage2_v3.3.yaml | 61 +++ .../v1/conf/wav2vec2xlsr300m_transducer.yaml | 14 + .../conf/wav2vec2xlsr300m_transducer_do.yaml | 13 + .../wav2vec2xlsr300m_transducer_do0.2.yaml | 13 + .../wav2vec2xlsr300m_transducer_do0.3.yaml | 13 + .../wav2vec2xlsr300m_transducer_do0.4.yaml | 13 + .../wav2vec2xlsr300m_transducer_enclast.yaml | 11 + egs/commonvoice/v1/datapath.sh | 22 + egs/commonvoice/v1/default_config.sh | 1 + egs/commonvoice/v1/feats | 1 + .../v1/global_conf/config_transducer_v3.3.sh | 39 ++ .../global_conf/config_transducer_v3.3_it.sh | 41 ++ egs/commonvoice/v1/hyp_utils | 1 + egs/commonvoice/v1/local/data_prep.sh | 33 ++ egs/commonvoice/v1/local/make_musan.py | 189 ++++++++ egs/commonvoice/v1/local/make_musan.sh | 48 ++ egs/commonvoice/v1/local/make_rirs_data.sh | 29 ++ egs/commonvoice/v1/local/prepare_lang.py | 410 ++++++++++++++++++ egs/commonvoice/v1/local/prepare_lang_bpe.py | 259 +++++++++++ egs/commonvoice/v1/local/train_bpe_model.py | 97 +++++ .../v1/local/validate_bpe_lexicon.py | 77 ++++ egs/commonvoice/v1/path.sh | 5 + egs/commonvoice/v1/run_001_prepare_data.sh | 50 +++ .../v1/run_003_prepare_noises_rirs.sh | 67 +++ egs/commonvoice/v1/run_004_compute_bpe.sh | 105 +++++ egs/commonvoice/v1/run_011_train_asr.sh | 119 +++++ egs/commonvoice/v1/run_030_inference.sh | 47 ++ egs/commonvoice/v1/steps | 1 + egs/commonvoice/v1/steps_be | 1 + egs/commonvoice/v1/steps_pyfe | 1 + egs/commonvoice/v1/steps_transducer | 1 + egs/commonvoice/v1/steps_xvec | 1 + egs/commonvoice/v1/utils | 1 + egs/commonvoice/v1/xvectors | 1 + .../decode_wav2vec2transducer.sh | 80 ++++ .../preprocess_audios_for_nnet_train.sh | 112 +++++ hyperion/bin/preprocess_audio_files.py | 8 + 55 files changed, 2673 insertions(+) create mode 100755 egs/commonvoice/v1/cmd.sh create mode 100644 egs/commonvoice/v1/conf/clsp.conf create mode 100644 egs/commonvoice/v1/conf/coe_gpu_bigmem.conf create mode 100644 egs/commonvoice/v1/conf/coe_gpu_long.conf create mode 100644 egs/commonvoice/v1/conf/coe_gpu_rtx.conf create mode 100644 egs/commonvoice/v1/conf/coe_gpu_short.conf create mode 100644 egs/commonvoice/v1/conf/coe_gpu_v100.conf create mode 100644 egs/commonvoice/v1/conf/reverb_noise_aug.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml create mode 100644 
egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v1.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml create mode 100644 egs/commonvoice/v1/datapath.sh create mode 120000 egs/commonvoice/v1/default_config.sh create mode 120000 egs/commonvoice/v1/feats create mode 100644 egs/commonvoice/v1/global_conf/config_transducer_v3.3.sh create mode 100644 egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh create mode 120000 egs/commonvoice/v1/hyp_utils create mode 100755 egs/commonvoice/v1/local/data_prep.sh create mode 100755 egs/commonvoice/v1/local/make_musan.py create mode 100755 egs/commonvoice/v1/local/make_musan.sh create mode 100755 egs/commonvoice/v1/local/make_rirs_data.sh create mode 100755 egs/commonvoice/v1/local/prepare_lang.py create mode 100755 egs/commonvoice/v1/local/prepare_lang_bpe.py create mode 100755 egs/commonvoice/v1/local/train_bpe_model.py create mode 100755 egs/commonvoice/v1/local/validate_bpe_lexicon.py create mode 100755 egs/commonvoice/v1/path.sh create mode 100755 egs/commonvoice/v1/run_001_prepare_data.sh create mode 100755 egs/commonvoice/v1/run_003_prepare_noises_rirs.sh create mode 100755 egs/commonvoice/v1/run_004_compute_bpe.sh create mode 100755 egs/commonvoice/v1/run_011_train_asr.sh create mode 100755 egs/commonvoice/v1/run_030_inference.sh create mode 120000 egs/commonvoice/v1/steps create mode 120000 egs/commonvoice/v1/steps_be create mode 120000 egs/commonvoice/v1/steps_pyfe create mode 120000 egs/commonvoice/v1/steps_transducer create mode 120000 egs/commonvoice/v1/steps_xvec create mode 120000 egs/commonvoice/v1/utils create mode 120000 egs/commonvoice/v1/xvectors create mode 100755 hyp_utils/steps_transducer/decode_wav2vec2transducer.sh create mode 100755 hyp_utils/steps_transducer/preprocess_audios_for_nnet_train.sh diff --git a/egs/commonvoice/v1/cmd.sh b/egs/commonvoice/v1/cmd.sh new file mode 100755 index 00000000..89dbb7d8 --- /dev/null +++ b/egs/commonvoice/v1/cmd.sh @@ -0,0 +1,28 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +if [ "$(hostname -d)" == "cm.gemini" ];then + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 10G" + #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --config conf/clsp.conf --mem 4G " + export cuda_cmd="queue.pl --config conf/clsp.conf --mem 20G" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/commonvoice/v1/conf/clsp.conf b/egs/commonvoice/v1/conf/clsp.conf new file mode 100644 index 00000000..959c62a7 --- /dev/null +++ b/egs/commonvoice/v1/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[1345679]*|c2[12357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/commonvoice/v1/conf/coe_gpu_bigmem.conf b/egs/commonvoice/v1/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/commonvoice/v1/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/commonvoice/v1/conf/coe_gpu_long.conf b/egs/commonvoice/v1/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/commonvoice/v1/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/commonvoice/v1/conf/coe_gpu_rtx.conf b/egs/commonvoice/v1/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/commonvoice/v1/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not 
add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/commonvoice/v1/conf/coe_gpu_short.conf b/egs/commonvoice/v1/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ b/egs/commonvoice/v1/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/commonvoice/v1/conf/coe_gpu_v100.conf b/egs/commonvoice/v1/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/commonvoice/v1/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/commonvoice/v1/conf/reverb_noise_aug.yaml b/egs/commonvoice/v1/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/commonvoice/v1/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml new file mode 100644 index 00000000..edc0af5e --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml new file mode 100644 index 00000000..aefddc7e --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_enclast.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml new file mode 100644 index 00000000..49077fd6 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml new file mode 100644 index 00000000..9f070bbe --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml new file mode 100644 index 00000000..d787a373 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml new file mode 100644 index 00000000..564ea8c7 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml new file mode 100644 index 00000000..35b2b47c --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml new file mode 100644 index 00000000..855bfc98 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 42000 + hold_steps: 15000 + min_lr: 4e-5 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 1200 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v1.0.yaml new file mode 100644 index 00000000..0f328e08 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v1.0.yaml @@ -0,0 +1,56 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: 'seg_sampler' + # sampler_type: 'bucketing_seg_sampler' + min_batch_size: 4 + batch_size: 4 + iters_per_epoch: 6 + drop_last: true + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: 'seg_sampler' + # sampler_type: 'bucketing_seg_sampler' + min_batch_size: 2 + batch_size: 2 + iters_per_epoch: 6 + drop_last: true + data_loader: + num_workers: 8 +model: {} +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: full + + \ No newline at end of file diff --git 
a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml new file mode 100644 index 00000000..69c489b0 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml @@ -0,0 +1,61 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + transducer: + decoder: + override_dropouts: true + embedding_dropout_rate: 0.3 + rnn_dropout_rate: 0.3 + +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml new file mode 100644 index 00000000..8017f9b3 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml @@ -0,0 +1,61 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + transducer: + decoder: + override_dropouts: true + embedding_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer.yaml new file mode 100644 index 00000000..a7071b8c --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer.yaml @@ -0,0 +1,14 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + #embedding_dim: 128 + #num_layers: 1 + #hidden_dim: 64 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml new file mode 100644 index 00000000..c7fc2df7 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.1 + rnn_dropout_rate: 0.1 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml new file mode 100644 index 00000000..1ee4ec72 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.2 + rnn_dropout_rate: 0.2 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml new file mode 100644 index 00000000..ca7c1995 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.3 + rnn_dropout_rate: 0.3 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml new file mode 
100644 index 00000000..9fed09e7 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml new file mode 100644 index 00000000..1d46c33c --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml @@ -0,0 +1,11 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + joiner: + num_layers: 1 +feat_fusion_method: last + diff --git a/egs/commonvoice/v1/datapath.sh b/egs/commonvoice/v1/datapath.sh new file mode 100644 index 00000000..4c7987ef --- /dev/null +++ b/egs/commonvoice/v1/datapath.sh @@ -0,0 +1,22 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + librispeech_root=/export/corpora5/LibriSpeech + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 + # voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 + # voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + # musan_root=/expscratch/dgromero/corpora-open/musan + echo "Put your database paths here" + exit 1 +else + echo "Put your database paths here" + exit 1 +fi + + diff --git a/egs/commonvoice/v1/default_config.sh b/egs/commonvoice/v1/default_config.sh new file mode 120000 index 00000000..2b6239b6 --- /dev/null +++ b/egs/commonvoice/v1/default_config.sh @@ -0,0 +1 @@ +global_conf/config_transducer_v1.sh \ No newline at end of file diff --git a/egs/commonvoice/v1/feats b/egs/commonvoice/v1/feats new file mode 120000 index 00000000..7b9d122a --- /dev/null +++ b/egs/commonvoice/v1/feats @@ -0,0 +1 @@ +hyp_utils/feats \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_transducer_v3.3.sh b/egs/commonvoice/v1/global_conf/config_transducer_v3.3.sh new file mode 100644 index 00000000..4800e6fe --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_transducer_v3.3.sh @@ -0,0 +1,39 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=sv_train_proc_audio +dev_data=sv_dev_proc_audio +test_data=sv_test_proc_audio + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.3 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0040.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0120.pth + 
+nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh new file mode 100644 index 00000000..c0fbe9dc --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh @@ -0,0 +1,41 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=it_train_proc_audio +dev_data=it_dev_proc_audio +test_data=it_test_proc_audio + +language=it + +bpe_model=data/it_lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.3_it +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0015.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/hyp_utils b/egs/commonvoice/v1/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/commonvoice/v1/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/commonvoice/v1/local/data_prep.sh b/egs/commonvoice/v1/local/data_prep.sh new file mode 100755 index 00000000..d68c2368 --- /dev/null +++ b/egs/commonvoice/v1/local/data_prep.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +. ./cmd.sh +. ./path.sh + +if [ "$#" -ne 3 ]; then + echo "Usage: $0 <language> <src-dir> <dst-dir>" + echo "e.g.: $0 ${language} /export/c06/ylu125/GSP/corpora/CommonVoice data/" + exit 1 +fi + +language=$1 +src=$2 +dst=$3 + +if [ !
-d $src/cv-corpus-12.0-2022-12-07/${language} ]; then + wget https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/cv-corpus-12.0-2022-12-07/cv-corpus-12.0-2022-12-07-${language}.tar.gz + tar -xvzf cv-corpus-12.0-2022-12-07-${language}.tar.gz -C $src + rm cv-corpus-12.0-2022-12-07-${language}.tar.gz +fi + + +lhotse prepare commonvoice -l ${language} $src/cv-corpus-12.0-2022-12-07/ ${dst}/${language} + + +for part in dev test train +do + lhotse kaldi export ${dst}/${language}/cv-${language}_recordings_${part}.jsonl.gz ${dst}/${language}/cv-${language}_supervisions_${part}.jsonl.gz ${dst}/${language}_${part} + utils/utt2spk_to_spk2utt.pl ${dst}/${language}_${part}/utt2spk > ${dst}/${language}_${part}/spk2utt + utils/fix_data_dir.sh ${dst}/${language}_${part} + steps_xvec/audio_to_duration.sh --cmd "$train_cmd" ${dst}/${language}_${part} +done + diff --git a/egs/commonvoice/v1/local/make_musan.py b/egs/commonvoice/v1/local/make_musan.py new file mode 100755 index 00000000..b0ae6846 --- /dev/null +++ b/egs/commonvoice/v1/local/make_musan.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +# Copyright 2015 David Snyder +# Copyright 2019 Johns Hopkins University (Jesus Villalba) (added fs support) +# Apache 2.0. +# +# This file is meant to be invoked by make_musan.sh. + +import os, sys + + +def process_music_annotations(path): + utt2spk = {} + utt2vocals = {} + lines = open(path, "r").readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musician ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals + + +def prepare_music(root_dir, fs, use_vocals): + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In music directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def prepare_speech(root_dir, fs): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt +
" sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In speech directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def prepare_noise(root_dir, fs): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In noise directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def main(): + in_dir = sys.argv[1] + fs = int(sys.argv[2]) + out_dir = sys.argv[3] + use_vocals = sys.argv[4] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), "w") + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), "w") + utt2spk_fi.write(utt2spk) + + +if __name__ == "__main__": + main() diff --git a/egs/commonvoice/v1/local/make_musan.sh b/egs/commonvoice/v1/local/make_musan.sh new file mode 100755 index 00000000..4a6d30f9 --- /dev/null +++ b/egs/commonvoice/v1/local/make_musan.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Copyright 2019 Johns Hopkins University (Jesus Villalba) (added fs support) +# Apache 2.0. +# +# This script, called by ../run.sh, creates the MUSAN +# data directory. The required dataset is freely available at +# http://www.openslr.org/17/ + +set -e +use_vocals='Y' + +. parse_options.sh || exit 1; + +if [ $# -ne 3 ];then + echo "Usage: $0 [options] "; + echo "e.g.: $0 /export/corpora/JHU/musan 8 data" + exit 1; +fi + +in_dir=$1 +fs=$2 +data_dir=$3 + +mkdir -p $data_dir/musan.tmp + +echo "Preparing ${data_dir}/musan..." 
+mkdir -p ${data_dir}/musan +local/make_musan.py ${in_dir} $fs ${data_dir}/musan ${use_vocals} + +utils/fix_data_dir.sh ${data_dir}/musan + +grep "music" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_music +grep "speech" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_speech +grep "noise" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_noise +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_music \ + ${data_dir}/musan ${data_dir}/musan_music +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_speech \ + ${data_dir}/musan ${data_dir}/musan_speech +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_noise \ + ${data_dir}/musan ${data_dir}/musan_noise + +utils/fix_data_dir.sh ${data_dir}/musan_music +utils/fix_data_dir.sh ${data_dir}/musan_speech +utils/fix_data_dir.sh ${data_dir}/musan_noise + +rm -rf $data_dir/musan.tmp + diff --git a/egs/commonvoice/v1/local/make_rirs_data.sh b/egs/commonvoice/v1/local/make_rirs_data.sh new file mode 100755 index 00000000..c6652eda --- /dev/null +++ b/egs/commonvoice/v1/local/make_rirs_data.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# +# Copyright 2020 Johns Hopkins University (Jesus Villalba) +# +# Apache 2.0. +set -e + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom" +fi + +rir_dir=$1 +fs=$2 +data_dir=$3 + +mkdir -p $data_dir + +rir_list=$rir_dir/rir_list +if [ "$fs" -eq 16 ];then + awk '{ key=$5; sub(/.*\//,"",key); print key,$5 }' $rir_list > $data_dir/wav.scp +else + awk '{ +key=$5; sub(/.*\//,"",key); +print key,"sox "$5" -r 8000 -t wav -b 16 -e signed-integer - |" }' \ + $rir_list > $data_dir/wav.scp +fi +awk '{ key=$5; sub(/.*\//,"",key); print key,$4 }' $rir_list > $data_dir/rir2room + diff --git a/egs/commonvoice/v1/local/prepare_lang.py b/egs/commonvoice/v1/local/prepare_lang.py new file mode 100755 index 00000000..39d76146 --- /dev/null +++ b/egs/commonvoice/v1/local/prepare_lang.py @@ -0,0 +1,410 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script takes as input a lexicon file "data/lang_phone/lexicon.txt" +consisting of words and tokens (i.e., phones) and does the following: + +1. Add disambiguation symbols to the lexicon and generate lexicon_disambig.txt + +2. Generate tokens.txt, the token table mapping a token to a unique integer. + +3. Generate words.txt, the word table mapping a word to a unique integer. + +4. Generate L.pt, in k2 format. It can be loaded by + + d = torch.load("L.pt") + lexicon = k2.Fsa.from_dict(d) + +5. Generate L_disambig.pt, in k2 format. 
+""" +import argparse +import math +from collections import defaultdict +from pathlib import Path +from typing import Any, Dict, List, Tuple + +import k2 +import torch + +from hyperion.utils.lexicon import read_lexicon, write_lexicon + +Lexicon = List[Tuple[str, List[str]]] + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lang-dir", + type=str, + help="""Input and output directory. + It should contain a file lexicon.txt. + Generated files by this script are saved into this directory. + """, + ) + + parser.add_argument( + "--debug", + default=False, + action="store_true", + help="""True for debugging, which will generate + a visualization of the lexicon FST. + + Caution: If your lexicon contains hundreds of thousands + of lines, please set it to False! + """, + ) + + return parser.parse_args() + + +def write_mapping(filename: str, sym2id: Dict[str, int]) -> None: + """Write a symbol to ID mapping to a file. + + Note: + No need to implement `read_mapping` as it can be done + through :func:`k2.SymbolTable.from_file`. + + Args: + filename: + Filename to save the mapping. + sym2id: + A dict mapping symbols to IDs. + Returns: + Return None. + """ + with open(filename, "w", encoding="utf-8") as f: + for sym, i in sym2id.items(): + f.write(f"{sym} {i}\n") + + +def get_tokens(lexicon: Lexicon) -> List[str]: + """Get tokens from a lexicon. + + Args: + lexicon: + It is the return value of :func:`read_lexicon`. + Returns: + Return a list of unique tokens. + """ + ans = set() + for _, tokens in lexicon: + ans.update(tokens) + sorted_ans = sorted(list(ans)) + return sorted_ans + + +def get_words(lexicon: Lexicon) -> List[str]: + """Get words from a lexicon. + + Args: + lexicon: + It is the return value of :func:`read_lexicon`. + Returns: + Return a list of unique words. + """ + ans = set() + for word, _ in lexicon: + ans.add(word) + sorted_ans = sorted(list(ans)) + return sorted_ans + + +def add_disambig_symbols(lexicon: Lexicon) -> Tuple[Lexicon, int]: + """It adds pseudo-token disambiguation symbols #1, #2 and so on + at the ends of tokens to ensure that all pronunciations are different, + and that none is a prefix of another. + + See also add_lex_disambig.pl from kaldi. + + Args: + lexicon: + It is returned by :func:`read_lexicon`. + Returns: + Return a tuple with two elements: + + - The output lexicon with disambiguation symbols + - The ID of the max disambiguation symbol that appears + in the lexicon + """ + + # (1) Work out the count of each token-sequence in the + # lexicon. + count = defaultdict(int) + for _, tokens in lexicon: + count[" ".join(tokens)] += 1 + + # (2) For each left sub-sequence of each token-sequence, note down + # that it exists (for identifying prefixes of longer strings). + issubseq = defaultdict(int) + for _, tokens in lexicon: + tokens = tokens.copy() + tokens.pop() + while tokens: + issubseq[" ".join(tokens)] = 1 + tokens.pop() + + # (3) For each entry in the lexicon: + # if the token sequence is unique and is not a + # prefix of another word, no disambig symbol. + # Else output #1, or #2, #3, ... if the same token-seq + # has already been assigned a disambig symbol. 
+ ans = [] + + # We start with #1 since #0 has its own purpose + first_allowed_disambig = 1 + max_disambig = first_allowed_disambig - 1 + last_used_disambig_symbol_of = defaultdict(int) + + for word, tokens in lexicon: + tokenseq = " ".join(tokens) + assert tokenseq != "" + if issubseq[tokenseq] == 0 and count[tokenseq] == 1: + ans.append((word, tokens)) + continue + + cur_disambig = last_used_disambig_symbol_of[tokenseq] + if cur_disambig == 0: + cur_disambig = first_allowed_disambig + else: + cur_disambig += 1 + + if cur_disambig > max_disambig: + max_disambig = cur_disambig + last_used_disambig_symbol_of[tokenseq] = cur_disambig + tokenseq += f" #{cur_disambig}" + ans.append((word, tokenseq.split())) + return ans, max_disambig + + +def generate_id_map(symbols: List[str]) -> Dict[str, int]: + """Generate ID maps, i.e., map a symbol to a unique ID. + + Args: + symbols: + A list of unique symbols. + Returns: + A dict containing the mapping between symbols and IDs. + """ + return {sym: i for i, sym in enumerate(symbols)} + + +def add_self_loops(arcs: List[List[Any]], disambig_token: int, + disambig_word: int) -> List[List[Any]]: + """Adds self-loops to states of an FST to propagate disambiguation symbols + through it. They are added on each state with non-epsilon output symbols + on at least one arc out of the state. + + See also fstaddselfloops.pl from Kaldi. One difference is that + Kaldi uses OpenFst style FSTs and it has multiple final states. + This function uses k2 style FSTs and it does not need to add self-loops + to the final state. + + The input label of a self-loop is `disambig_token`, while the output + label is `disambig_word`. + + Args: + arcs: + A list-of-list. The sublist contains + `[src_state, dest_state, label, aux_label, score]` + disambig_token: + It is the token ID of the symbol `#0`. + disambig_word: + It is the word ID of the symbol `#0`. + + Return: + Return new `arcs` containing self-loops. + """ + states_needs_self_loops = set() + for arc in arcs: + src, dst, ilabel, olabel, score = arc + if olabel != 0: + states_needs_self_loops.add(src) + + ans = [] + for s in states_needs_self_loops: + ans.append([s, s, disambig_token, disambig_word, 0]) + + return arcs + ans + + +def lexicon_to_fst( + lexicon: Lexicon, + token2id: Dict[str, int], + word2id: Dict[str, int], + sil_token: str = "SIL", + sil_prob: float = 0.5, + need_self_loops: bool = False, +) -> k2.Fsa: + """Convert a lexicon to an FST (in k2 format) with optional silence at + the beginning and end of each word. + + Args: + lexicon: + The input lexicon. See also :func:`read_lexicon` + token2id: + A dict mapping tokens to IDs. + word2id: + A dict mapping words to IDs. + sil_token: + The silence token. + sil_prob: + The probability for adding a silence at the beginning and end + of the word. + need_self_loops: + If True, add self-loop to states with non-epsilon output symbols + on at least one arc out of the state. The input label for this + self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. + Returns: + Return an instance of `k2.Fsa` representing the given lexicon. + """ + assert sil_prob > 0.0 and sil_prob < 1.0 + # CAUTION: we use score, i.e, negative cost. + sil_score = math.log(sil_prob) + no_sil_score = math.log(1.0 - sil_prob) + + start_state = 0 + loop_state = 1 # words enter and leave from here + sil_state = 2 # words terminate here when followed by silence; this state + # has a silence transition to loop_state. 
+ next_state = 3 # the next un-allocated state, will be incremented as we go. + arcs = [] + + assert token2id["<eps>"] == 0 + assert word2id["<eps>"] == 0 + + eps = 0 + + sil_token = token2id[sil_token] + + arcs.append([start_state, loop_state, eps, eps, no_sil_score]) + arcs.append([start_state, sil_state, eps, eps, sil_score]) + arcs.append([sil_state, loop_state, sil_token, eps, 0]) + + for word, tokens in lexicon: + assert len(tokens) > 0, f"{word} has no pronunciations" + cur_state = loop_state + + word = word2id[word] + tokens = [token2id[i] for i in tokens] + + for i in range(len(tokens) - 1): + w = word if i == 0 else eps + arcs.append([cur_state, next_state, tokens[i], w, 0]) + + cur_state = next_state + next_state += 1 + + # now for the last token of this word + # It has two out-going arcs, one to the loop state, + # the other one to the sil_state. + i = len(tokens) - 1 + w = word if i == 0 else eps + arcs.append([cur_state, loop_state, tokens[i], w, no_sil_score]) + arcs.append([cur_state, sil_state, tokens[i], w, sil_score]) + + if need_self_loops: + disambig_token = token2id["#0"] + disambig_word = word2id["#0"] + arcs = add_self_loops( + arcs, + disambig_token=disambig_token, + disambig_word=disambig_word, + ) + + final_state = next_state + arcs.append([loop_state, final_state, -1, -1, 0]) + arcs.append([final_state]) + + arcs = sorted(arcs, key=lambda arc: arc[0]) + arcs = [[str(i) for i in arc] for arc in arcs] + arcs = [" ".join(arc) for arc in arcs] + arcs = "\n".join(arcs) + + fsa = k2.Fsa.from_str(arcs, acceptor=False) + return fsa + + +def main(): + args = get_args() + lang_dir = Path(args.lang_dir) + lexicon_filename = lang_dir / "lexicon.txt" + sil_token = "SIL" + sil_prob = 0.5 + + lexicon = read_lexicon(lexicon_filename) + tokens = get_tokens(lexicon) + words = get_words(lexicon) + + lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) + + for i in range(max_disambig + 1): + disambig = f"#{i}" + assert disambig not in tokens + tokens.append(f"#{i}") + + assert "<eps>" not in tokens + tokens = ["<eps>"] + tokens + + assert "<eps>" not in words + assert "#0" not in words + assert "<s>" not in words + assert "</s>" not in words + + words = ["<eps>"] + words + ["#0", "<s>", "</s>"] + + token2id = generate_id_map(tokens) + word2id = generate_id_map(words) + + write_mapping(lang_dir / "tokens.txt", token2id) + write_mapping(lang_dir / "words.txt", word2id) + write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig) + + L = lexicon_to_fst( + lexicon, + token2id=token2id, + word2id=word2id, + sil_token=sil_token, + sil_prob=sil_prob, + ) + + L_disambig = lexicon_to_fst( + lexicon_disambig, + token2id=token2id, + word2id=word2id, + sil_token=sil_token, + sil_prob=sil_prob, + need_self_loops=True, + ) + torch.save(L.as_dict(), lang_dir / "L.pt") + torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt") + + if args.debug: + labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt") + aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt") + + L.labels_sym = labels_sym + L.aux_labels_sym = aux_labels_sym + L.draw(f"{lang_dir / 'L.svg'}", title="L.pt") + + L_disambig.labels_sym = labels_sym + L_disambig.aux_labels_sym = aux_labels_sym + L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", + title="L_disambig.pt") + + +if __name__ == "__main__": + main() diff --git a/egs/commonvoice/v1/local/prepare_lang_bpe.py b/egs/commonvoice/v1/local/prepare_lang_bpe.py new file mode 100755 index 00000000..7838b6a0 --- /dev/null +++ b/egs/commonvoice/v1/local/prepare_lang_bpe.py @@ -0,0 +1,259 @@
+#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang) +""" + +This script takes as input `lang_dir`, which should contain:: + + - lang_dir/bpe.model, + - lang_dir/words.txt + +and generates the following files in the directory `lang_dir`: + + - lexicon.txt + - lexicon_disambig.txt + - L.pt + - L_disambig.pt + - tokens.txt +""" + +import argparse +from pathlib import Path +from typing import Dict, List, Tuple + +import k2 +import sentencepiece as spm +import torch +from prepare_lang import ( + Lexicon, + add_disambig_symbols, + add_self_loops, + write_lexicon, + write_mapping, +) + + +def lexicon_to_fst_no_sil( + lexicon: Lexicon, + token2id: Dict[str, int], + word2id: Dict[str, int], + need_self_loops: bool = False, +) -> k2.Fsa: + """Convert a lexicon to an FST (in k2 format). + + Args: + lexicon: + The input lexicon. See also :func:`read_lexicon` + token2id: + A dict mapping tokens to IDs. + word2id: + A dict mapping words to IDs. + need_self_loops: + If True, add self-loop to states with non-epsilon output symbols + on at least one arc out of the state. The input label for this + self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. + Returns: + Return an instance of `k2.Fsa` representing the given lexicon. + """ + loop_state = 0 # words enter and leave from here + next_state = 1 # the next un-allocated state, will be incremented as we go + + arcs = [] + + # The blank symbol <blk> is defined in local/train_bpe_model.py + assert token2id["<blk>"] == 0 + assert word2id["<eps>"] == 0 + + eps = 0 + + for word, pieces in lexicon: + assert len(pieces) > 0, f"{word} has no pronunciations" + cur_state = loop_state + + word = word2id[word] + pieces = [token2id[i] for i in pieces] + + for i in range(len(pieces) - 1): + w = word if i == 0 else eps + arcs.append([cur_state, next_state, pieces[i], w, 0]) + + cur_state = next_state + next_state += 1 + + # now for the last piece of this word + i = len(pieces) - 1 + w = word if i == 0 else eps + arcs.append([cur_state, loop_state, pieces[i], w, 0]) + + if need_self_loops: + disambig_token = token2id["#0"] + disambig_word = word2id["#0"] + arcs = add_self_loops( + arcs, + disambig_token=disambig_token, + disambig_word=disambig_word, + ) + + final_state = next_state + arcs.append([loop_state, final_state, -1, -1, 0]) + arcs.append([final_state]) + + arcs = sorted(arcs, key=lambda arc: arc[0]) + arcs = [[str(i) for i in arc] for arc in arcs] + arcs = [" ".join(arc) for arc in arcs] + arcs = "\n".join(arcs) + + fsa = k2.Fsa.from_str(arcs, acceptor=False) + return fsa + + +def generate_lexicon(model_file: str, + words: List[str]) -> Tuple[Lexicon, Dict[str, int]]: + """Generate a lexicon from a BPE model. + + Args: + model_file: + Path to a sentencepiece model. + words: + A list of strings representing words.
+ Returns: + Return a tuple with two elements: + - A dict whose keys are words and values are the corresponding + word pieces. + - A dict representing the token symbol, mapping from tokens to IDs. + """ + sp = spm.SentencePieceProcessor() + sp.load(str(model_file)) + + # Convert word to word piece IDs instead of word piece strings + # to avoid OOV tokens. + words_pieces_ids: List[List[int]] = sp.encode(words, out_type=int) + + # Now convert word piece IDs back to word piece strings. + words_pieces: List[List[str]] = [ + sp.id_to_piece(ids) for ids in words_pieces_ids + ] + + lexicon = [] + for word, pieces in zip(words, words_pieces): + lexicon.append((word, pieces)) + + # The OOV word is + lexicon.append(("", [sp.id_to_piece(sp.unk_id())])) + + token2id: Dict[str, int] = dict() + for i in range(sp.vocab_size()): + token2id[sp.id_to_piece(i)] = i + + return lexicon, token2id + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lang-dir", + type=str, + help="""Input and output directory. + It should contain the bpe.model and words.txt + """, + ) + + parser.add_argument( + "--debug", + default=False, + action="store_true", + help="""True for debugging, which will generate + a visualization of the lexicon FST. + + Caution: If your lexicon contains hundreds of thousands + of lines, please set it to False! + + See "test/test_bpe_lexicon.py" for usage. + """, + ) + + return parser.parse_args() + + +def main(): + args = get_args() + lang_dir = Path(args.lang_dir) + model_file = lang_dir / "bpe.model" + + word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt") + + words = word_sym_table.symbols + + excluded = [ + "", "!SIL", "", "", "#0", "", "" + ] + for w in excluded: + if w in words: + words.remove(w) + + lexicon, token_sym_table = generate_lexicon(model_file, words) + + lexicon_disambig, max_disambig = add_disambig_symbols(lexicon) + + next_token_id = max(token_sym_table.values()) + 1 + for i in range(max_disambig + 1): + disambig = f"#{i}" + assert disambig not in token_sym_table + token_sym_table[disambig] = next_token_id + next_token_id += 1 + + word_sym_table.add("#0") + word_sym_table.add("") + word_sym_table.add("") + + write_mapping(lang_dir / "tokens.txt", token_sym_table) + + write_lexicon(lang_dir / "lexicon.txt", lexicon) + write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig) + + L = lexicon_to_fst_no_sil( + lexicon, + token2id=token_sym_table, + word2id=word_sym_table, + ) + + L_disambig = lexicon_to_fst_no_sil( + lexicon_disambig, + token2id=token_sym_table, + word2id=word_sym_table, + need_self_loops=True, + ) + torch.save(L.as_dict(), lang_dir / "L.pt") + torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt") + + if args.debug: + labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt") + aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt") + + L.labels_sym = labels_sym + L.aux_labels_sym = aux_labels_sym + L.draw(f"{lang_dir / 'L.svg'}", title="L.pt") + + L_disambig.labels_sym = labels_sym + L_disambig.aux_labels_sym = aux_labels_sym + L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", + title="L_disambig.pt") + + +if __name__ == "__main__": + main() diff --git a/egs/commonvoice/v1/local/train_bpe_model.py b/egs/commonvoice/v1/local/train_bpe_model.py new file mode 100755 index 00000000..42aba957 --- /dev/null +++ b/egs/commonvoice/v1/local/train_bpe_model.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. 
(authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# You can install sentencepiece via: +# +# pip install sentencepiece +# +# Due to an issue reported in +# https://github.com/google/sentencepiece/pull/642#issuecomment-857972030 +# +# Please install a version >=0.1.96 + +import argparse +import shutil +from pathlib import Path + +import sentencepiece as spm + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lang-dir", + type=str, + help="""Input and output directory. + The generated bpe.model is saved to this directory. + """, + ) + + parser.add_argument( + "--transcript", + type=str, + help="Training transcript.", + ) + + parser.add_argument( + "--vocab-size", + type=int, + help="Vocabulary size for BPE training", + ) + + return parser.parse_args() + + +def main(): + args = get_args() + vocab_size = args.vocab_size + lang_dir = Path(args.lang_dir) + + model_type = "unigram" + + model_prefix = f"{lang_dir}/{model_type}_{vocab_size}" + train_text = args.transcript + character_coverage = 1.0 + input_sentence_size = 100000000 + + user_defined_symbols = ["", ""] + unk_id = len(user_defined_symbols) + # Note: unk_id is fixed to 2. + # If you change it, you should also change other + # places that are using it. + + model_file = Path(model_prefix + ".model") + if not model_file.is_file(): + spm.SentencePieceTrainer.train( + input=train_text, + vocab_size=vocab_size, + model_type=model_type, + model_prefix=model_prefix, + input_sentence_size=input_sentence_size, + character_coverage=character_coverage, + user_defined_symbols=user_defined_symbols, + unk_id=unk_id, + bos_id=-1, + eos_id=-1, + ) + + shutil.copyfile(model_file, f"{lang_dir}/bpe.model") + + +if __name__ == "__main__": + main() diff --git a/egs/commonvoice/v1/local/validate_bpe_lexicon.py b/egs/commonvoice/v1/local/validate_bpe_lexicon.py new file mode 100755 index 00000000..36962933 --- /dev/null +++ b/egs/commonvoice/v1/local/validate_bpe_lexicon.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script checks that there are no OOV tokens in the BPE-based lexicon. 
+ +Usage example: + + python3 ./local/validate_bpe_lexicon.py \ + --lexicon /path/to/lexicon.txt \ + --bpe-model /path/to/bpe.model +""" + +import argparse +from pathlib import Path +from typing import List, Tuple + +import sentencepiece as spm + +from hyperion.utils.lexicon import read_lexicon + +# Map word to word pieces +Lexicon = List[Tuple[str, List[str]]] + + +def get_args(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--lexicon", + required=True, + type=Path, + help="Path to lexicon.txt", + ) + + parser.add_argument( + "--bpe-model", + required=True, + type=Path, + help="Path to bpe.model", + ) + + return parser.parse_args() + + +def main(): + args = get_args() + assert args.lexicon.is_file(), args.lexicon + assert args.bpe_model.is_file(), args.bpe_model + + lexicon = read_lexicon(args.lexicon) + + sp = spm.SentencePieceProcessor() + sp.load(str(args.bpe_model)) + + word_pieces = set(sp.id_to_piece(list(range(sp.vocab_size())))) + for word, pieces in lexicon: + for p in pieces: + if p not in word_pieces: + raise ValueError(f"The word {word} contains an OOV token {p}") + + +if __name__ == "__main__": + main() diff --git a/egs/commonvoice/v1/path.sh b/egs/commonvoice/v1/path.sh new file mode 100755 index 00000000..6994fdab --- /dev/null +++ b/egs/commonvoice/v1/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. $TOOLS_ROOT/path.sh diff --git a/egs/commonvoice/v1/run_001_prepare_data.sh b/egs/commonvoice/v1/run_001_prepare_data.sh new file mode 100755 index 00000000..d839fac6 --- /dev/null +++ b/egs/commonvoice/v1/run_001_prepare_data.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. ./datapath.sh +. $config_file + + +nj=6 + +mkdir -p data + +commonvoice=/export/c06/ylu125/GSP/corpora/CommonVoice + + +if [ ${stage} -le 1 ]; then + ### Task dependent. You have to make data the following preparation part by yourself. + ### But you can utilize Kaldi recipes in most cases + echo "stage 1: Data preparation" + for lan in it #sv-SE + do + # use underscore-separated names in data directories. + local/data_prep.sh ${lan} $commonvoice data/ + done +fi + +if [ ${stage} -le 2 ]; then + echo "stage 2: Data conversion" + # for part in $test_data $dev_data $nnet_data + for lan in it #sv-SE + do + for part in ${lan}_test ${lan}_dev ${lan}_train + do + echo ${part} + steps_transducer/preprocess_audios_for_nnet_train.sh --nj 20 --cmd "$train_cmd" \ + --storage_name commonvoice-v1-$(date +'%m_%d_%H_%M') --use-bin-vad false \ + --osr 16000 data/${part} data/${part}_proc_audio exp/${part}_proc_audio + utils/fix_data_dir.sh data/${part}_proc_audio || true + done + done +fi diff --git a/egs/commonvoice/v1/run_003_prepare_noises_rirs.sh b/egs/commonvoice/v1/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..6bdcb4f2 --- /dev/null +++ b/egs/commonvoice/v1/run_003_prepare_noises_rirs.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +# We prepare the noise files and RIR for online speech augmentation + +if [ $stage -le 1 ]; then + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root 16 data + + for name in musan_noise musan_music + do + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ + --storage_name librispeech-v1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_proc_audio exp/${name}_proc_audio + utils/fix_data_dir.sh data/${name}_proc_audio + done + +fi + +if [ $stage -le 2 ]; then + + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ + --storage_name librispeech-v1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_babble exp/${name}_babble + # utils/fix_data_dir.sh data/${name}_babble + done +fi + +if [ $stage -le 3 ]; then + if [ ! -d "RIRS_NOISES" ]; then + if [ -d ../../sre19-cmn2/v1/RIRS_NOISES ];then + ln -s ../../sre19-cmn2/v1/RIRS_NOISES + else + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + fi + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 16 data/rirs_mediumroom + local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 16 data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + #pack all rirs in h5 files + steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs + done + +fi + + diff --git a/egs/commonvoice/v1/run_004_compute_bpe.sh b/egs/commonvoice/v1/run_004_compute_bpe.sh new file mode 100755 index 00000000..617f03ae --- /dev/null +++ b/egs/commonvoice/v1/run_004_compute_bpe.sh @@ -0,0 +1,105 @@ +#!/bin/bash + + +. ./cmd.sh +. ./path.sh +set -e + +vocab_sizes=( + # 5000 + 2000 + 1000 + 500 +) + +dl_dir=$PWD/download + +stage=1 +stop_stage=4 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. ./datapath.sh +. $config_file + + +if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then + echo "Stage 1: Dump transcripts for LM training" + mkdir -p data/lm + gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ + | jq '.text' \ + | sed 's:"::g' \ + > data/lm/${language}_transcript_words.txt +fi + +if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then + echo "Stage 2: Prepare BPE based lang" + + for vocab_size in ${vocab_sizes[@]}; do + lang_dir=data/${language}_lang_bpe_${vocab_size} + mkdir -p $lang_dir + + # Add special words to words.txt + echo " 0" > $lang_dir/words.txt + echo "!SIL 1" >> $lang_dir/words.txt + echo " 2" >> $lang_dir/words.txt + + # Add regular words to words.txt + gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ + | jq '.text' \ + | sed 's:"::g' \ + | sed 's: :\n:g' \ + | sort \ + | uniq \ + | sed '/^$/d' \ + | awk '{print $0,NR+2}' \ + >> $lang_dir/words.txt + + # Add remaining special word symbols expected by LM scripts. 
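+  # Illustration (not part of the original script): the finished words.txt is a
+  # plain "symbol integer-id" table. The angle-bracket names below follow the
+  # usual icefall convention and are assumptions, not taken from this patch:
+  #   <eps> 0
+  #   !SIL 1
+  #   <UNK> 2
+  #   ... one line per regular word, with ids 3, 4, 5, ...
+  #   <s>, </s> and #0 appended at the end with the next free ids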
+ num_words=$(cat $lang_dir/words.txt | wc -l) + echo " ${num_words}" >> $lang_dir/words.txt + num_words=$(cat $lang_dir/words.txt | wc -l) + echo " ${num_words}" >> $lang_dir/words.txt + num_words=$(cat $lang_dir/words.txt | wc -l) + echo "#0 ${num_words}" >> $lang_dir/words.txt + + ./local/train_bpe_model.py \ + --lang-dir $lang_dir \ + --vocab-size $vocab_size \ + --transcript data/lm/${language}_transcript_words.txt + + if [ ! -f $lang_dir/L_disambig.pt ]; then + ./local/prepare_lang_bpe.py --lang-dir $lang_dir + fi + done +fi + +# if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then +# echo "Stage 3: Train LM" +# lm_dir=data/lm + +# if [ ! -f $lm_dir/G.arpa ]; then +# ./shared/make_kn_lm.py \ +# -ngram-order 3 \ +# -text $lm_dir/transcript_words.txt \ +# -lm $lm_dir/G.arpa +# fi + +# if [ ! -f $lm_dir/G_3_gram.fst.txt ]; then +# python3 -m kaldilm \ +# --read-symbol-table="data/lang_phone/words.txt" \ +# --disambig-symbol='#0' \ +# --max-order=3 \ +# $lm_dir/G.arpa > $lm_dir/G_3_gram.fst.txt +# fi +# fi + +# if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then +# echo "Stage 4: Compile HLG" +# ./local/compile_hlg.py --lang-dir data/lang_phone + +# for vocab_size in ${vocab_sizes[@]}; do +# lang_dir=data/lang_bpe_${vocab_size} +# ./local/compile_hlg.py --lang-dir $lang_dir +# done +# fi \ No newline at end of file diff --git a/egs/commonvoice/v1/run_011_train_asr.sh b/egs/commonvoice/v1/run_011_train_asr.sh new file mode 100755 index 00000000..1b402133 --- /dev/null +++ b/egs/commonvoice/v1/run_011_train_asr.sh @@ -0,0 +1,119 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=1 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_dir=data/${nnet_data}/ +val_dir=data/${dev_data}/ + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" + extra_args="--data.val.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ + train_wav2vec2transducer.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s2_dir $args \ + --in-model-file $nnet_s1 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s3_dir $args \ + --in-model-file $nnet_s2 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu +fi + diff --git a/egs/commonvoice/v1/run_030_inference.sh b/egs/commonvoice/v1/run_030_inference.sh new file mode 
100755 index 00000000..86dccf0a --- /dev/null +++ b/egs/commonvoice/v1/run_030_inference.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +config_file=default_config.sh +use_gpu=false +nnet_stage=1 +. parse_options.sh || exit 1; +. $config_file + +if [ "$use_gpu" == "true" ];then + transducer_args="--use-gpu true" + transducer_cmd="$cuda_eval_cmd --mem 6G" +else + transducer_cmd="$train_cmd --mem 12G" +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +fi + +transducer_dir=exp/transducer/$nnet_name + + + + +# Extracts x-vectors for evaluation +for name in $dev_data $test_data + do + nj=16 + steps_transducer/decode_wav2vec2transducer.sh --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ + $nnet data/$name \ + $transducer_dir/$name $bpe_model + done +exit diff --git a/egs/commonvoice/v1/steps b/egs/commonvoice/v1/steps new file mode 120000 index 00000000..aede39fe --- /dev/null +++ b/egs/commonvoice/v1/steps @@ -0,0 +1 @@ +hyp_utils/kaldi/steps \ No newline at end of file diff --git a/egs/commonvoice/v1/steps_be b/egs/commonvoice/v1/steps_be new file mode 120000 index 00000000..b2098c2a --- /dev/null +++ b/egs/commonvoice/v1/steps_be @@ -0,0 +1 @@ +../v1/steps_be \ No newline at end of file diff --git a/egs/commonvoice/v1/steps_pyfe b/egs/commonvoice/v1/steps_pyfe new file mode 120000 index 00000000..7b9d122a --- /dev/null +++ b/egs/commonvoice/v1/steps_pyfe @@ -0,0 +1 @@ +hyp_utils/feats \ No newline at end of file diff --git a/egs/commonvoice/v1/steps_transducer b/egs/commonvoice/v1/steps_transducer new file mode 120000 index 00000000..c9fd1392 --- /dev/null +++ b/egs/commonvoice/v1/steps_transducer @@ -0,0 +1 @@ +hyp_utils/steps_transducer \ No newline at end of file diff --git a/egs/commonvoice/v1/steps_xvec b/egs/commonvoice/v1/steps_xvec new file mode 120000 index 00000000..289276b7 --- /dev/null +++ b/egs/commonvoice/v1/steps_xvec @@ -0,0 +1 @@ +hyp_utils/xvectors/ \ No newline at end of file diff --git a/egs/commonvoice/v1/utils b/egs/commonvoice/v1/utils new file mode 120000 index 00000000..3d590a1d --- /dev/null +++ b/egs/commonvoice/v1/utils @@ -0,0 +1 @@ +hyp_utils/kaldi/utils \ No newline at end of file diff --git a/egs/commonvoice/v1/xvectors b/egs/commonvoice/v1/xvectors new file mode 120000 index 00000000..af66a94d --- /dev/null +++ b/egs/commonvoice/v1/xvectors @@ -0,0 +1 @@ +hyp_utils/xvectors \ No newline at end of file diff --git a/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh b/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh new file mode 100755 index 00000000..143087a5 --- /dev/null +++ b/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +nj=30 +cmd="run.pl" + +use_gpu=false +write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +num_augs=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; + +if [ $# != 3 ] && [ $# != 4 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --use-gpu # If true, use GPU." + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + echo " --use-bin-vad # If true, uses binary VAD from vad.scp" + echo " --write-utt2num-frames # If true, write utt2num_frames file." + echo " --chunk-length # If provided, applies encoder with specified chunk-length and " + echo " # concatenates the chunks outputs before pooling" + echo " --feat-config # feature/mvn config file" + echo " --aug-config # augmentation config file" + echo " --random-utt-length # If true, extracts a random chunk from the utterance between " + echo " # min_utt_length and max_utt_length" + echo " --min-utt-length # " + echo " --max-utt-length # " + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +bpe_model=$4 + +for f in $data_dir/wav.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +args="" +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + args="--use-gpu" +fi + +if [ "$write_utt2num_frames" == "true" ];then + write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +fi + +if [ $stage -le 0 ];then + set +e + $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + decode_wav2transducer.py \ + --part-idx JOB --num-parts $nj \ + --input $data_dir/wav.scp \ + --model-path $nnet_file \ + --bpe-model $bpe_model \ + --output $output_dir/transducer.JOB.text + set -e +fi + +if [ $stage -le 1 ];then + echo "compute wer" + cat $output_dir/transducer.*.text > $output_dir/transducer.text + compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text +fi diff --git a/hyp_utils/steps_transducer/preprocess_audios_for_nnet_train.sh b/hyp_utils/steps_transducer/preprocess_audios_for_nnet_train.sh new file mode 100755 index 00000000..ef54ceed --- /dev/null +++ b/hyp_utils/steps_transducer/preprocess_audios_for_nnet_train.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# +# 2020 Johns Hopkins University (Jesus Villalba) +# Apache 2.0. +set -e +nj=40 +cmd="run.pl" +stage=0 +file_format=flac +nodes=b1 +storage_name=$(date +'%m_%d_%H_%M') +proc_opts="--remove-dc-offset" +use_bin_vad=false +osr=16000 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --file-format # Output format supported by soundfile (flac,ogg,wav,...)" + echo " --proc-opts # Extra arguments for proc-audio-files.py" + echo " --use-bin-vad # Removes silence using binary vad" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/wav.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +mkdir -p $data_out +output_dir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $output_dir/storage ]; then + dir_name=$USER/hyp-data/$storage_name/xvector_audio/storage + if [ "$nodes" == "b0" ];then + utils/create_split_dir.pl \ + utils/create_split_dir.pl \ + /export/b{04,05,06,07}/$dir_name $output_dir/storage + elif [ "$nodes" == "b1" ];then + utils/create_split_dir.pl \ + /export/b{14,15,16,17,18}/$dir_name $output_dir/storage + elif [ "$nodes" == "s01" ];then + utils/create_split_dir.pl \ + /export/s01/$dir_name $output_dir/storage + elif [ "$nodes" == "c0" ];then + utils/create_split_dir.pl \ + /export/c{01,06,07,08,09}/$dir_name $output_dir/storage + elif [ "$nodes" == "fs05" ];then + utils/create_split_dir.pl \ + utils/create_split_dir.pl \ + /export/fs05/$dir_name $output_dir/storage + fi + + for f in $(awk '{ print $1}' $data_in/wav.scp); do + # the next command does nothing unless $output_dir/storage/ exists, see + # utils/create_data_link.pl for more info. + utils/create_data_link.pl $output_dir/$f.$file_format + done +fi + + +for f in reco2dur segments spk2utt text utt2dur utt2gender utt2lang utt2spk wav.scp spk2gender +do + if [ -f $data_in/$f ];then + cp $data_in/$f $data_out/$f + fi +done + +args="" +if [ "$use_bin_vad" == "true" ];then + args="${args} --vad scp:$data_in/vad.scp" +else + f=vad.scp + if [ -f $data_in/$f ];then + cp $data_in/$f $data_out/$f + fi +fi + +$cmd JOB=1:$nj $dir/log/preproc_audios_${name}.JOB.log \ + hyp_utils/conda_env.sh \ + preprocess_audio_files.py ${args} --output-audio-format $file_format $args $proc_opts \ + --write-time-durs $output_dir/utt2dur.${name}.JOB \ + --part-idx JOB --num-parts $nj \ + --output-sampling-rate $osr \ + --input $data_in/wav.scp \ + --output-path $output_dir \ + --output-script $output_dir/wav.${name}.JOB.scp + +for n in $(seq $nj); do + cat $output_dir/wav.${name}.$n.scp || exit 1; +done > ${data_out}/wav.scp || exit 1 + +for n in $(seq $nj); do + cat $output_dir/utt2dur.${name}.$n || exit 1; +done > ${data_out}/utt2dur || exit 1 + +echo "$0: Succeeded processing audios for $name" diff --git a/hyperion/bin/preprocess_audio_files.py b/hyperion/bin/preprocess_audio_files.py index 67b1cf61..2698e61f 100755 --- a/hyperion/bin/preprocess_audio_files.py +++ b/hyperion/bin/preprocess_audio_files.py @@ -45,6 +45,7 @@ def process_audio_files( write_time_durs_spec, vad_spec, vad_path_prefix, + output_sampling_rate, vad_fs=100, vad_dilation=0, vad_erosion=0, @@ -74,6 +75,10 @@ def process_audio_files( logging.info("Processing audio %s" % (key)) t2 = time.time() + if output_sampling_rate is not None: + x = signal.resample(x, int(x.shape[0]*output_sampling_rate/fs)) + fs = output_sampling_rate + tot_samples = x.shape[0] if vad_spec is not None: num_vad_frames = int(round(tot_samples * vad_fs / fs)) @@ -95,6 +100,7 @@ def process_audio_files( ) ) + if x.shape[0] > 0: if remove_dc_offset: x -= np.mean(x) @@ -148,6 +154,8 @@ def process_audio_files( parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) + parser.add_argument( + "--output-sampling-rate", default=None, type=int, help=("resample output audio")) parser.add_argument( "--vad-fs", default=100, type=float, help=("vad sampling frequency") From beab75c01ca9dc44bc0143437d29f96f439f6b7e Mon Sep 17 00:00:00 2001 From: ylu125 Date: Wed, 1 Feb 2023 00:20:52 -0500 Subject: [PATCH 02/89] update slurm configuration for rockfish --- egs/commonvoice/v1/cmd.sh | 6 ++++-- egs/commonvoice/v1/conf/slurm.conf | 15 +++++++++++++++ .../v1/conf/wav2vec2xlsr300m_transducer_do.yaml | 3 ++- 
.../conf/wav2vec2xlsr300m_transducer_do0.2.yaml | 3 ++- .../conf/wav2vec2xlsr300m_transducer_do0.3.yaml | 3 ++- .../conf/wav2vec2xlsr300m_transducer_do0.4.yaml | 4 +++- egs/commonvoice/v1/datapath.sh | 11 ++++++----- egs/commonvoice/v1/default_config.sh | 2 +- egs/commonvoice/v1/run_001_prepare_data.sh | 3 +-- hyp_utils/conda_env.sh | 2 +- 10 files changed, 37 insertions(+), 15 deletions(-) create mode 100644 egs/commonvoice/v1/conf/slurm.conf diff --git a/egs/commonvoice/v1/cmd.sh b/egs/commonvoice/v1/cmd.sh index 89dbb7d8..6606a180 100755 --- a/egs/commonvoice/v1/cmd.sh +++ b/egs/commonvoice/v1/cmd.sh @@ -18,11 +18,13 @@ if [ "$(hostname -d)" == "cm.gemini" ];then export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 10G" #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +elif [ "$(hostname -d)" == "rockfish.cluster" ];then + export train_cmd="slurm.pl --config conf/slurm.conf --mem 4G" + export cuda_cmd="slurm.pl --config conf/slurm.conf --mem 20G" + export cuda_eval_cmd="$train_cmd" else export train_cmd="queue.pl --config conf/clsp.conf --mem 4G " export cuda_cmd="queue.pl --config conf/clsp.conf --mem 20G" export cuda_eval_cmd="$train_cmd" fi - - diff --git a/egs/commonvoice/v1/conf/slurm.conf b/egs/commonvoice/v1/conf/slurm.conf new file mode 100644 index 00000000..11bf450f --- /dev/null +++ b/egs/commonvoice/v1/conf/slurm.conf @@ -0,0 +1,15 @@ +# Default configuration +command sbatch --export=PATH +option name=* --job-name $0 +default time=48:00:00 +option time=* --time $0 +option mem=* --mem-per-cpu $0 +option mem=0 +option num_threads=* --cpus-per-task $0 +option num_threads=4 --cpus-per-task 4 +option num_nodes=* --nodes $0 +default gpu=0 +option gpu=0 +option gpu=* -p a100 -A jvillal7_gpu --ntasks-per-node 4 --gres=gpu:$0 -c $0 # Recommend allocating more CPU than, or equal to the number of GPU +# note: the --max-jobs-run option is supported as a special case +# by slurm.pl and you don't have to handle it in the config file. 
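Stepping back to the --output-sampling-rate option added to hyperion/bin/preprocess_audio_files.py in the previous patch (and exposed as --osr 16000 by steps_transducer/preprocess_audios_for_nnet_train.sh): the new sample count is computed from the ratio of target to source rate, and fs is then overwritten. A standalone sketch of that step (illustrative; assumes numpy and scipy, with a made-up input signal):

    import numpy as np
    from scipy import signal

    fs = 48000                    # e.g. a 48 kHz source recording
    output_sampling_rate = 16000  # value passed via --osr in the recipe
    x = np.random.randn(2 * fs)   # stand-in for 2 s of decoded audio

    # FFT-based resampling to the target rate, as in process_audio_files()
    x = signal.resample(x, int(x.shape[0] * output_sampling_rate / fs))
    fs = output_sampling_rate     # x.shape[0] is now 32000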
diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml index c7fc2df7..19aaac2c 100644 --- a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do.yaml @@ -1,5 +1,6 @@ hf_feats: - pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus + pretrained_model_path: facebook/wav2vec2-xls-r-300m + #facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus transducer: decoder: embedding_dim: 1024 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml index 1ee4ec72..baa6cde3 100644 --- a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml @@ -1,5 +1,6 @@ hf_feats: - pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus + pretrained_model_path: facebook/wav2vec2-xls-r-300m + #facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus transducer: decoder: embedding_dim: 1024 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml index ca7c1995..3a5ff1f5 100644 --- a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml @@ -1,5 +1,6 @@ hf_feats: - pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus + pretrained_model_path: facebook/wav2vec2-xls-r-300m + #facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus transducer: decoder: embedding_dim: 1024 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml index 9fed09e7..9c07f5e7 100644 --- a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml @@ -1,5 +1,7 @@ hf_feats: - pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus + pretrained_model_path: facebook/wav2vec2-xls-r-300m + #pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus + #facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus transducer: decoder: embedding_dim: 1024 diff --git a/egs/commonvoice/v1/datapath.sh b/egs/commonvoice/v1/datapath.sh index 4c7987ef..e844d6cd 100644 --- a/egs/commonvoice/v1/datapath.sh +++ b/egs/commonvoice/v1/datapath.sh @@ -5,13 +5,14 @@ if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then - librispeech_root=/export/corpora5/LibriSpeech + commonvoice_root= musan_root=/export/corpora5/JHU/musan + echo "Put your database paths here" + exit 1 +elif [ "$(hostname --domain)" == "rockfish.cluster" ];then + commonvoice_root=/data/jvillal7/corpora/commonvoice + musan_root=/data/jvillal7/corpora/musan elif [ "$(hostname --domain)" == "cm.gemini" ];then - # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 - # voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 - # 
voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 - # musan_root=/expscratch/dgromero/corpora-open/musan echo "Put your database paths here" exit 1 else diff --git a/egs/commonvoice/v1/default_config.sh b/egs/commonvoice/v1/default_config.sh index 2b6239b6..6f5a2dfb 120000 --- a/egs/commonvoice/v1/default_config.sh +++ b/egs/commonvoice/v1/default_config.sh @@ -1 +1 @@ -global_conf/config_transducer_v1.sh \ No newline at end of file +global_conf/config_transducer_v3.3_it.sh \ No newline at end of file diff --git a/egs/commonvoice/v1/run_001_prepare_data.sh b/egs/commonvoice/v1/run_001_prepare_data.sh index d839fac6..6a5a6e10 100755 --- a/egs/commonvoice/v1/run_001_prepare_data.sh +++ b/egs/commonvoice/v1/run_001_prepare_data.sh @@ -19,7 +19,6 @@ nj=6 mkdir -p data -commonvoice=/export/c06/ylu125/GSP/corpora/CommonVoice if [ ${stage} -le 1 ]; then @@ -29,7 +28,7 @@ if [ ${stage} -le 1 ]; then for lan in it #sv-SE do # use underscore-separated names in data directories. - local/data_prep.sh ${lan} $commonvoice data/ + local/data_prep.sh ${lan} $commonvoice_root data/ done fi diff --git a/hyp_utils/conda_env.sh b/hyp_utils/conda_env.sh index ceee4e93..11b509bb 100755 --- a/hyp_utils/conda_env.sh +++ b/hyp_utils/conda_env.sh @@ -65,7 +65,7 @@ if [ $num_gpus -gt 0 ];then free_gpu=$(which hyp_utils/free-gpu) fi - if [ ! -z "$free_gpu" ];then + if [ ! -z "$free_gpu" ] && [ "$(hostname --domain)" != "rockfish.cluster" ];then # if free-gpu found set env var, otherwise we assume that you can use any gpu export CUDA_VISIBLE_DEVICES=$($free_gpu -n $num_gpus) fi From 046b5f7e88be73acc67ae8f58069397205d74d50 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Wed, 1 Feb 2023 13:32:17 -0500 Subject: [PATCH 03/89] update data preparation for different languge --- egs/commonvoice/v1/run_001_prepare_data.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/commonvoice/v1/run_001_prepare_data.sh b/egs/commonvoice/v1/run_001_prepare_data.sh index 6a5a6e10..4c0d0297 100755 --- a/egs/commonvoice/v1/run_001_prepare_data.sh +++ b/egs/commonvoice/v1/run_001_prepare_data.sh @@ -25,7 +25,7 @@ if [ ${stage} -le 1 ]; then ### Task dependent. You have to make data the following preparation part by yourself. ### But you can utilize Kaldi recipes in most cases echo "stage 1: Data preparation" - for lan in it #sv-SE + for lan in $language #it sv-SE do # use underscore-separated names in data directories. 
local/data_prep.sh ${lan} $commonvoice_root data/ @@ -35,7 +35,7 @@ fi if [ ${stage} -le 2 ]; then echo "stage 2: Data conversion" # for part in $test_data $dev_data $nnet_data - for lan in it #sv-SE + for lan in $language #it sv-SE do for part in ${lan}_test ${lan}_dev ${lan}_train do From beb2ed5405f71d5cfe29ed62c343a8f6f825196f Mon Sep 17 00:00:00 2001 From: ylu125 Date: Wed, 15 Feb 2023 17:11:43 -0500 Subject: [PATCH 04/89] update config and add cer scripts --- egs/commonvoice/v1/conf/slurm.conf | 4 +- ...v2vec2xlsr300m_transducer_stage1_v3.3.yaml | 4 +- ...v2vec2xlsr300m_transducer_stage2_v3.3.yaml | 6 +-- .../config_transducer_v3.3_en_fr_it.sh | 41 +++++++++++++++++++ egs/commonvoice/v1/run_030_inference.sh | 6 ++- .../decode_wav2vec2transducer.sh | 9 +++- .../models/wav2transducer/beam_search.py | 5 ++- 7 files changed, 64 insertions(+), 11 deletions(-) create mode 100644 egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh diff --git a/egs/commonvoice/v1/conf/slurm.conf b/egs/commonvoice/v1/conf/slurm.conf index 11bf450f..262344ea 100644 --- a/egs/commonvoice/v1/conf/slurm.conf +++ b/egs/commonvoice/v1/conf/slurm.conf @@ -6,10 +6,10 @@ option time=* --time $0 option mem=* --mem-per-cpu $0 option mem=0 option num_threads=* --cpus-per-task $0 -option num_threads=4 --cpus-per-task 4 +option num_threads=1 --cpus-per-task 1 option num_nodes=* --nodes $0 default gpu=0 option gpu=0 -option gpu=* -p a100 -A jvillal7_gpu --ntasks-per-node 4 --gres=gpu:$0 -c $0 # Recommend allocating more CPU than, or equal to the number of GPU +option gpu=* -p a100 -A jvillal7_gpu --ntasks-per-node 1 --gres=gpu:$0 -c $0 # Recommend allocating more CPU than, or equal to the number of GPU # note: the --max-jobs-run option is supported as a special case # by slurm.pl and you don't have to handle it in the config file. 
diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml index 564ea8c7..e9fe0b05 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml @@ -13,7 +13,7 @@ data: min_batch_size: 1 drop_last: false data_loader: - num_workers: 4 + num_workers: 2 val: dataset: aug_cfgs: @@ -28,7 +28,7 @@ data: min_batch_size: 1 drop_last: true data_loader: - num_workers: 4 + num_workers: 2 model: wav2vec2xlsr300m_transducer_do0.4.yaml trainer: optim: diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml index 8017f9b3..686f9133 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml @@ -13,7 +13,7 @@ data: min_batch_size: 1 drop_last: false data_loader: - num_workers: 4 + num_workers: 2 val: dataset: aug_cfgs: @@ -28,7 +28,7 @@ data: min_batch_size: 1 drop_last: true data_loader: - num_workers: 4 + num_workers: 2 model: transducer: decoder: @@ -56,6 +56,6 @@ trainer: epochs: 120 # eff_batch_size: 1024 eff_batch_size: 128 - train_mode: hf-feats-frozen-nograd + train_mode: full diff --git a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh new file mode 100644 index 00000000..fcb675b8 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh @@ -0,0 +1,41 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=en_fr_it_train_proc_audio +dev_data=en_fr_it_dev_proc_audio +test_data="en_test_proc_audio fr_test_proc_audio it_test_proc_audio" + +language=en_fr_it + +bpe_model=data/en_fr_it_lang_bpe_2000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.3_en_fr_it +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0022.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/run_030_inference.sh b/egs/commonvoice/v1/run_030_inference.sh index 86dccf0a..cf2c8fb2 100755 --- a/egs/commonvoice/v1/run_030_inference.sh +++ b/egs/commonvoice/v1/run_030_inference.sh @@ -7,6 +7,8 @@ . 
./path.sh set -e +stage=0 + config_file=default_config.sh use_gpu=false nnet_stage=1 @@ -37,10 +39,10 @@ transducer_dir=exp/transducer/$nnet_name # Extracts x-vectors for evaluation -for name in $dev_data $test_data +for name in $test_data # $dev_data $test_data do nj=16 - steps_transducer/decode_wav2vec2transducer.sh --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ + steps_transducer/decode_wav2vec2transducer.sh --cmd "$transducer_cmd --mem 12G" --nj $nj --stage $stage ${transducer_args} \ $nnet data/$name \ $transducer_dir/$name $bpe_model done diff --git a/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh b/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh index 143087a5..4a23d9fa 100755 --- a/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh +++ b/hyp_utils/steps_transducer/decode_wav2vec2transducer.sh @@ -74,7 +74,14 @@ if [ $stage -le 0 ];then fi if [ $stage -le 1 ];then - echo "compute wer" + echo "compute wer, cer" + cat $output_dir/transducer.*.text > $output_dir/transducer.text + + python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text + python steps_transducer/word2char.py $data_dir/text $data_dir/text_char + compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text + compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text + fi diff --git a/hyperion/torch/models/wav2transducer/beam_search.py b/hyperion/torch/models/wav2transducer/beam_search.py index b23a0769..2550ab3c 100644 --- a/hyperion/torch/models/wav2transducer/beam_search.py +++ b/hyperion/torch/models/wav2transducer/beam_search.py @@ -227,6 +227,9 @@ def beam_search( B = B[:beam] break t += 1 - best_hyp = max(B, key=lambda hyp: hyp.log_prob / len(hyp.ys[1:])) + try: + best_hyp = max(B, key=lambda hyp: hyp.log_prob / len(hyp.ys[1:])) + except: + return "" ys = best_hyp.ys[1:] # [1:] to remove the blank return ys From ff0fd554f5a0d4908b473cbb59a4bb607c7a7aba Mon Sep 17 00:00:00 2001 From: ylu125 Date: Wed, 15 Feb 2023 17:16:51 -0500 Subject: [PATCH 05/89] temporal remove data preparation for duration --- egs/commonvoice/v1/local/data_prep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/commonvoice/v1/local/data_prep.sh b/egs/commonvoice/v1/local/data_prep.sh index d68c2368..f21fea8d 100755 --- a/egs/commonvoice/v1/local/data_prep.sh +++ b/egs/commonvoice/v1/local/data_prep.sh @@ -28,6 +28,6 @@ do lhotse kaldi export ${dst}/${language}/cv-${language}_recordings_${part}.jsonl.gz ${dst}/${language}/cv-${language}_supervisions_${part}.jsonl.gz ${dst}/${language}_${part} utils/utt2spk_to_spk2utt.pl ${dst}/${language}_${part}/utt2spk > ${dst}/${language}_${part}/spk2utt utils/fix_data_dir.sh ${dst}/${language}_${part} - steps_xvec/audio_to_duration.sh --cmd "$train_cmd" ${dst}/${part//-/_} + # steps_xvec/audio_to_duration.sh --cmd "$train_cmd" ${dst}/${part//-/_} done From f179db41f27cd393b58adc3352406fdb6cc09dcc Mon Sep 17 00:00:00 2001 From: ylu125 Date: Wed, 15 Feb 2023 17:40:10 -0500 Subject: [PATCH 06/89] Add combination for multiple languages --- .../config_transducer_v3.3_en_fr_it.sh | 1 + egs/commonvoice/v1/run_001_prepare_data.sh | 23 +++++++++++++++--- hyp_utils/steps_transducer/word2char.py | 24 +++++++++++++++++++ 3 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 hyp_utils/steps_transducer/word2char.py diff --git a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh 
b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh index fcb675b8..3c8efca9 100644 --- a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh +++ b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_en_fr_it.sh @@ -11,6 +11,7 @@ nnet_data=en_fr_it_train_proc_audio dev_data=en_fr_it_dev_proc_audio test_data="en_test_proc_audio fr_test_proc_audio it_test_proc_audio" +lans="en fr it" language=en_fr_it bpe_model=data/en_fr_it_lang_bpe_2000/bpe.model diff --git a/egs/commonvoice/v1/run_001_prepare_data.sh b/egs/commonvoice/v1/run_001_prepare_data.sh index 4c0d0297..7d05ba2c 100755 --- a/egs/commonvoice/v1/run_001_prepare_data.sh +++ b/egs/commonvoice/v1/run_001_prepare_data.sh @@ -25,7 +25,7 @@ if [ ${stage} -le 1 ]; then ### Task dependent. You have to make data the following preparation part by yourself. ### But you can utilize Kaldi recipes in most cases echo "stage 1: Data preparation" - for lan in $language #it sv-SE + for lan in $lans do # use underscore-separated names in data directories. local/data_prep.sh ${lan} $commonvoice_root data/ @@ -35,15 +35,32 @@ fi if [ ${stage} -le 2 ]; then echo "stage 2: Data conversion" # for part in $test_data $dev_data $nnet_data - for lan in $language #it sv-SE + for lan in $lans do for part in ${lan}_test ${lan}_dev ${lan}_train do echo ${part} - steps_transducer/preprocess_audios_for_nnet_train.sh --nj 20 --cmd "$train_cmd" \ + steps_transducer/preprocess_audios_for_nnet_train.sh --nj 16 --cmd "$train_cmd" \ --storage_name commonvoice-v1-$(date +'%m_%d_%H_%M') --use-bin-vad false \ --osr 16000 data/${part} data/${part}_proc_audio exp/${part}_proc_audio utils/fix_data_dir.sh data/${part}_proc_audio || true done done fi + +if [ ${stage} -le 3 ]; then + echo "stage 3: Combine Multilingual Data" + + dev_folders="" + train_folders="" + for lan in $lans + do + dev_folders+="data/${lan}_dev_proc_audio " + train_folders+="data/${lan}_train_proc_audio " + done + + combine_data.sh data/dev_data/ $dev_folders + combine_data.sh data/nnet_data/ $train_folders + + +fi \ No newline at end of file diff --git a/hyp_utils/steps_transducer/word2char.py b/hyp_utils/steps_transducer/word2char.py new file mode 100644 index 00000000..062832c4 --- /dev/null +++ b/hyp_utils/steps_transducer/word2char.py @@ -0,0 +1,24 @@ +import os +import sys + +word_file = sys.argv[1] # "data/it_test_proc_audio/text" +char_file = sys.argv[2] # "data/it_test_proc_audio/text_char" + + +# word_file = "exp/transducer/wav2vec2xlsr300m_transducer_v3.3_it.s1/it_test_proc_audio/transducer.text" +# char_file = "exp/transducer/wav2vec2xlsr300m_transducer_v3.3_it.s1/it_test_proc_audio/transducer_char.text" + +output_chars = [] +with open(word_file, "r") as fi: + for line in fi.readlines(): + words = line.split(" ") + chars = [words[0]] + for wrd in words[1:]: + for c in wrd: + chars.append(c) + output_chars.append(chars) + +with open(char_file, "w") as fo: + for chars in output_chars: + fo.writelines(" ".join(chars)) + From f816ed366bf6b6bddf9342976bd714b17eb960f8 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sun, 19 Feb 2023 22:10:28 -0500 Subject: [PATCH 07/89] Add language identification task for commonvoice --- ...c2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml | 55 +++ ...v2vec2xlsr300m_transducer_stage1_v3.3.yaml | 4 +- ...v2vec2xlsr300m_transducer_stage2_v3.3.yaml | 4 +- .../conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml | 40 ++ .../global_conf/config_lid_v3.3_en_fr_it.sh | 42 ++ egs/commonvoice/v1/run_012_train_lid.sh | 136 ++++++ hyperion/bin/train_wav2vec2languageid.py 
| 261 ++++++++++++ hyperion/bin/train_wav2vec2transducer.py | 6 +- hyperion/torch/data/audio_dataset.py | 20 + hyperion/torch/models/__init__.py | 1 + .../torch/models/wav2languageid/__init__.py | 7 + .../wav2languageid/hf_wav2languageid.py | 391 ++++++++++++++++++ .../hf_wav2vec2resnet1d_languageid.py | 99 +++++ hyperion/torch/trainers/__init__.py | 2 + hyperion/torch/trainers/languageid_trainer.py | 208 ++++++++++ 15 files changed, 1268 insertions(+), 8 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v3.3_en_fr_it.sh create mode 100755 egs/commonvoice/v1/run_012_train_lid.sh create mode 100755 hyperion/bin/train_wav2vec2languageid.py create mode 100644 hyperion/torch/models/wav2languageid/__init__.py create mode 100644 hyperion/torch/models/wav2languageid/hf_wav2languageid.py create mode 100644 hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py create mode 100644 hyperion/torch/trainers/languageid_trainer.py diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..afe885a3 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 2 + drop_last: false + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 2 + drop_last: true + data_loader: + num_workers: 1 +model: wav2vec2xlsr300m_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml index e9fe0b05..96e0c4aa 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml @@ -13,7 +13,7 @@ data: min_batch_size: 1 drop_last: false data_loader: - num_workers: 2 + num_workers: 1 val: dataset: aug_cfgs: @@ -28,7 +28,7 @@ data: min_batch_size: 1 drop_last: true data_loader: - num_workers: 2 + num_workers: 1 model: wav2vec2xlsr300m_transducer_do0.4.yaml trainer: optim: diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml index 686f9133..88073958 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml @@ -13,7 +13,7 @@ data: 
min_batch_size: 1 drop_last: false data_loader: - num_workers: 2 + num_workers: 1 val: dataset: aug_cfgs: @@ -28,7 +28,7 @@ data: min_batch_size: 1 drop_last: true data_loader: - num_workers: 2 + num_workers: 1 model: transducer: decoder: diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml new file mode 100644 index 00000000..2e7574c2 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml @@ -0,0 +1,40 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/global_conf/config_lid_v3.3_en_fr_it.sh b/egs/commonvoice/v1/global_conf/config_lid_v3.3_en_fr_it.sh new file mode 100644 index 00000000..08a9f950 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v3.3_en_fr_it.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=en_fr_it_train_proc_audio +dev_data=en_fr_it_dev_proc_audio +test_data="en_test_proc_audio fr_test_proc_audio it_test_proc_audio" + +lans="en fr it" +language=en_fr_it + +bpe_model=data/en_fr_it_lang_bpe_2000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v3.3_en_fr_it +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0022.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/run_012_train_lid.sh b/egs/commonvoice/v1/run_012_train_lid.sh new file mode 100755 index 00000000..80948243 --- /dev/null +++ b/egs/commonvoice/v1/run_012_train_lid.sh @@ -0,0 +1,136 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=1 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_dir=data/${nnet_data}/ +val_dir=data/${dev_data}/ + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" + extra_args="--data.val.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ + train_wav2vec2languageid.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.language-id-file $train_dir/utt2lang \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $val_dir/langs \ + --data.val.dataset.language-id-file $val_dir/utt2lang \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2languageid.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.language-id-file $train_dir/utt2lang \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $val_dir/langs \ + --data.val.dataset.language-id-file $val_dir/utt2lang \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s2_dir $args \ + --in-model-file $nnet_s1 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2languageid.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.audio-file 
$train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.language-id-file $train_dir/utt2lang \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $val_dir/langs \ + --data.val.dataset.language-id-file $val_dir/utt2lang \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s3_dir $args \ + --in-model-file $nnet_s2 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu +fi + diff --git a/hyperion/bin/train_wav2vec2languageid.py b/hyperion/bin/train_wav2vec2languageid.py new file mode 100755 index 00000000..093042f6 --- /dev/null +++ b/hyperion/bin/train_wav2vec2languageid.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import sys +import os +from pathlib import Path +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging +import multiprocessing + +import numpy as np + +import torch +import torch.nn as nn + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.utils import ddp +from hyperion.torch.trainers import LanguageIDTrainer as Trainer +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import HFWav2Vec2ResNet1dLanguageID +from torch.nn.utils.rnn import pad_sequence + +model_dict = { + "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dLanguageID, + # "hf_hubert2resnet1d": HFHubert2ResNet1LanguageID, + # "hf_wavlm2resnet1d": HFWavLM2ResNet1dLanguageID, +} + + +def Language_collate(batch): + audio = [] + audio_length = [] + language = [] + for record in batch: + wav = torch.as_tensor(record[0]) + audio.append(wav) + audio_length.append(wav.shape[0]) + language.append(record[1]) + audio = pad_sequence(audio) + audio_length = torch.as_tensor(audio_length) + language = torch.as_tensor(language) + + return torch.transpose(audio, 0, 1), audio_length, language + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + 
batch_sampler=sampler, + **largs, + collate_fn=Language_collate) + return data_loader + + +def init_model(num_classes, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + model_args["languageid"]["num_classes"] = num_classes + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + + model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument("--data.train.dataset.language_id_file", type=str) + parser.add_argument("--data.val.dataset.language_id_file", type=str) + + + parser.add_argument( + "--data.train.dataset.class_files", + type=str, + ) + + + parser.add_argument( + "--data.dev.dataset.class_files", + type=str, + ) + + parser.add_argument( + "--data.train.dataset.class_names", + type=str, + ) + + parser.add_argument( + "--data.dev.dataset.class_names", + type=str, + ) + + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 
2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Train Wav2Vec2Language model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index ee60080a..cb96c0f6 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -107,9 +107,6 @@ def train_model(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) set_float_cpu("float32") - #torch.backends.cudnn.deterministic = True - #torch.backends.cudnn.benchmark = False - torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) @@ -215,7 +212,8 @@ def make_parser(model_class): parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() - + print("cuda available:", torch.cuda.is_available()) + logging.info("cuda available: {}".format(torch.cuda.is_available())) for k, v in model_dict.items(): parser_k = make_parser(v) subcommands.add_subcommand(k, parser_k) diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 721c7a1f..3bfa328b 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -467,6 +467,7 @@ def __init__( bpe_model=None, text_file=None, time_durs_file=None, + language_id_file=None, aug_cfgs=None, num_augs=1, return_segment_info=None, @@ -512,7 +513,15 @@ def __init__( else: assert "duration" in self.seg_set + if language_id_file is not None: + if rank == 0: + logging.info("loading language id file %s" % language_id_file) + + language_ids = SegmentSet.load(language_id_file) + self.seg_set["language"] = language_ids.loc[self.seg_set["id"]].class_id + logging.info("loading class-info files") + self._load_class_infos(class_names, class_files, is_val) @@ -523,6 +532,8 @@ def __init__( if text_file is not None: logging.info("loading text files") self._load_text_infos(text_file, is_val) + + self.return_segment_info = ( [] if return_segment_info is None else return_segment_info ) @@ -764,6 +775,7 @@ def filter_args(**kwargs): "return_segment_info", "return_orig", "time_durs_file", + "language_id_file", "target_sample_freq", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) @@ -807,6 +819,14 @@ def add_class_args(parser, prefix=None, skip={}): ), ) + parser.add_argument( + "--language-id-file", + default=None, + help=( + "file with language ids for each utterance" + ), + ) + parser.add_argument( "--bpe-model", default=None, diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index 44ff171d..21fe7e6f 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -20,6 +20,7 @@ from 
.wav2transducer import HFWav2Vec2Transducer +from .wav2languageid import HFWav2Vec2ResNet1dLanguageID from .vae.vae import VAE from .vae.vq_vae import VQVAE diff --git a/hyperion/torch/models/wav2languageid/__init__.py b/hyperion/torch/models/wav2languageid/__init__.py new file mode 100644 index 00000000..849a30a6 --- /dev/null +++ b/hyperion/torch/models/wav2languageid/__init__.py @@ -0,0 +1,7 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" + +from .hf_wav2vec2resnet1d_languageid import HFWav2Vec2ResNet1dLanguageID \ No newline at end of file diff --git a/hyperion/torch/models/wav2languageid/hf_wav2languageid.py b/hyperion/torch/models/wav2languageid/hf_wav2languageid.py new file mode 100644 index 00000000..22974afe --- /dev/null +++ b/hyperion/torch/models/wav2languageid/hf_wav2languageid.py @@ -0,0 +1,391 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import contextlib +from jsonargparse import ArgumentParser, ActionParser + +import torch +import torch.nn as nn + +# import torch.nn.functional as nnf + +from ...torch_model import TorchModel +from ...utils import remove_silence + + +class HFWav2LanguageID(TorchModel): + """Abstract Base class for language identification models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + languageid: language identification model object. + feat_fusion_start: the input to language identification model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__( + self, hf_feats, languageid, feat_fusion_start=0, feat_fusion_method="weighted-avg" + ): + + super().__init__() + self.hf_feats = hf_feats + self.languageid = languageid + self.feat_fusion_start = feat_fusion_start + self.feat_fusion_method = feat_fusion_method + self._hf_context = contextlib.nullcontext() + self._make_fuser() + + def _make_fuser(self): + if self.feat_fusion_method == "last": + self.feat_fuser = None + return + + num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + layer_dim = self.hf_feats.hidden_size + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif self.feat_fusion_method == "linear": + self.feat_fuser = nn.Linear(num_layers, 1, bias=False) + self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers + elif self.feat_fusion_method == "cat": + self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) + + def _fuse_hid_feats(self, hid_feats): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. 
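
[Editor's note] As a side note on the fusion logic above: the snippet below is a minimal, standalone sketch of the "weighted-avg" option (a learnable weight per hidden layer, softmax-normalized, applied to the stacked wav2vec hidden states). Tensor shapes and variable names are made up for the illustration and are not the hyperion API.

import torch
import torch.nn as nn

# Toy weighted-average fusion over 4 hidden layers of shape (batch, time, dim).
num_layers, batch, time, dim = 4, 2, 50, 1024
hid_feats = [torch.randn(batch, time, dim) for _ in range(num_layers)]
feat_fuser = nn.Parameter(torch.zeros(num_layers))       # zeros -> uniform average at init

stacked = torch.stack(hid_feats, dim=-1)                  # (batch, time, dim, num_layers)
norm_weights = nn.functional.softmax(feat_fuser, dim=-1)  # one weight per layer, sums to 1
fused = torch.sum(stacked * norm_weights, dim=-1)         # (batch, time, dim)
print(fused.shape)                                        # torch.Size([2, 50, 1024])
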
+ + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + hid_feats = hid_feats[self.feat_fusion_start :] + if self.feat_fusion_method == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats) + elif self.feat_fusion_method == "last": + feats = hid_feats[-1] + + return feats + + @property + def sample_frequency(self): + return self.hf_feats.sample_frequency + + def compute_prototype_affinity(self): + return self.languageid.compute_prototype_affinity() + + def update_loss_margin(self, epoch): + """Updates the value of the margin in AAM/AM-softmax losses + given the epoch number + + Args: + epoch: epoch which is about to start + """ + self.languageid.update_loss_margin(epoch) + + def rebuild_output_layer( + self, + num_classes=None, + loss_type="arc-softmax", + cos_scale=64, + margin=0.3, + margin_warmup_epochs=10, + intertop_k=5, + intertop_margin=0.0, + num_subcenters=2, + ): + self.languageid.rebuild_output_layer( + num_classes=num_classes, + loss_type=loss_type, + cos_scale=cos_scale, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, + ) + + def forward_feats( + self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False + ): + return_hid_states = ( + False + if return_feat_layers is None and self.feat_fusion_method == "last" + else True + ) + with self._hf_context: + hf_output = self.hf_feats( + x, + x_lengths, + return_hid_states=return_hid_states, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) + feat_lengths = hf_output["hidden_states_lengths"] + if return_hid_states: + hid_feats = hf_output["hidden_states"] + feats = self._fuse_hid_feats(hid_feats) + else: + hid_feats = None + feats = hf_output["last_hidden_state"] + + feats = feats.transpose(1, 2) + if return_feat_layers is not None: + # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) + # as the hidden features of the language identification encoder. + hid_feats = [ + f.transpose(1, 2) + for i, f in enumerate(hid_feats) + if i in return_feat_layers + ] + else: + hid_feats = None + + return feats, hid_feats, feat_lengths + + def forward( + self, + x, + x_lengths=None, + y=None, + return_feat_layers=None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, + ): + """Forward function. If returns the logits posteriors of the classes. + It can also returns the hidden representations in the wav2vec feature extractor, + the language identification encoder and the + classification head. In this case the ouput variable is a dictionary. + + Args: + x: input features tensor with shape=(batch, in_feats, time) + x_lengths: time lengths of the features with shape=(batch,) + y: target classes torch.long tensor with shape=(batch,) + return_feat_layers: list of integers indicating, which wav2vec layers + we should return. If None, no wav2vec layers are returned. + return_enc_layers: list of integers indicating, which encoder layers + we should return. 
If None, no encoder layers are returned. + return_enc_layers: list of integers indicating, which classification head layers + we should return. If None, no head layers are returned. + return_logits: if True, it adds the logits to the output dictionary. + Returns: + Tensor with class logits with shape=(batch, num_classes) or + Dictionary with "logits", "h_enc" (list of hidden encoder layers), + "h_classif" (list hidden classification head layers), "h_feats" (wav2vec features) + """ + feats, hid_feats, feat_lengths = self.forward_feats( + x, x_lengths, return_feat_layers + ) + output = self.languageid( + feats, + feat_lengths, + y, + return_enc_layers=return_enc_layers, + return_classif_layers=return_classif_layers, + return_logits=return_logits, + ) + + if not return_feat_layers: + return output + + if not isinstance(output, dict): + # if the languageid just returned the logits we put then into a dictionary + # to append the hid feats later. + output["logits"] = output + + output["h_feats"] = hid_feats + return output + + def extract_embed( + self, + x, + x_lengths=None, + vad_samples=None, + hf_chunk_length=0, + xvec_chunk_length=0, + embed_layer=None, + detach_chunks=False, + ): + + if vad_samples is not None: + x, x_lengths = remove_silence(x, x_lengths) + + feats, _, feat_lengths = self.forward_feats( + x, x_lengths, chunk_length=hf_chunk_length, detach_chunks=detach_chunks + ) + xvec_chunk_length = int( + xvec_chunk_length + * self.hf_feats.sample_frequency + * feats.size(-1) + // x.size(-1) + ) + return self.languageid.extract_embed( + feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks + ) + + def freeze_feat_fuser(self): + if self.feat_fuser is None: + return + + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser.requires_grad = False + return + + for param in self.feat_fuser.parameters(): + param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode == "ft-embed-affine": + self.unfreeze() + self.freeze_feat_fuser() + self.freeze_hf_feats() + self.languageid.freeze_preembed_layers() + elif mode in ["ft-languageid", "ft-languageid-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_feat_fuser() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + else: + raise ValueError(f"invalanguageid train_mode={mode}") + + logging.info("train mode set to %s", mode) + + if "nograd" in mode: + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext() + + self._train_mode = mode + + def _train(self, train_mode: str): + + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode == "ft-embed-affine": + self.hf_feats.train() + self.languageid._train("ft-embed_affine") + elif train_mode in [ + "ft-languageid", + "hf-feats-frozen", + "ft-languageid-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ]: + self.hf_feats.train() + self.languageid._train("full") + else: + raise ValueError(f"invalanguageid train_mode={train_mode}") + + @staticmethod + def valanguageid_train_modes(): + return [ + "full", + "frozen", + 
"ft-embed-affine", + "ft-languageid", + "hf-feats-frozen", + "ft-languageid-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + + @staticmethod + def filter_args(**kwargs): + valanguageid_args = ( + "hf_feats", + "languageid", + "feat_fusion_start", + "feat_fusion_method", + ) + args = dict((k, kwargs[k]) for k in valanguageid_args if k in kwargs) + return args + + def get_config(self): + + hf_cfg = self.hf_feats.get_config() + xvec_cfg = self.languageid.get_config() + del hf_cfg["class_name"] + del xvec_cfg["class_name"] + config = { + "hf_feats": hf_cfg, + "languageid": xvec_cfg, + "feat_fusion_start": self.feat_fusion_start, + "feat_fusion_method": self.feat_fusion_method, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def change_config(self, hf_feats, languageid): + logging.info("changing hf wav2xvector config") + self.hf_feats.change_config(**hf_feats) + self.languageid.change_config(**languageid) + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--feat-fusion-start", + default=0, + type=int, + help=( + "the input to language identification model will fuse the wav2vec layers from feat_fusion_start to" + "the wav2vec num_layers" + ), + ) + parser.add_argument( + "--feat-fusion-method", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=( + "method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, cat]" + ), + ) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + help="languageid options", + ) diff --git a/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py b/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py new file mode 100644 index 00000000..d357cd87 --- /dev/null +++ b/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py @@ -0,0 +1,99 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from jsonargparse import ArgumentParser, ActionParser +from typing import Union, Dict, Optional + +import torch +import torch.nn as nn + +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID +from ...tpm import HFWav2Vec2 +from .hf_wav2languageid import HFWav2LanguageID + + +class HFWav2Vec2ResNet1dLanguageID(HFWav2LanguageID): + """Class extracting Wav2Vec2 + ResNet1d language identifications from waveform. + + Attributes: + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + languageid: ResNet1dLanguageID configuration dictionary or object. + feat_fusion_start: the input to language identification model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. 
+ """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + languageid: Union[Dict, ResNet1dLanguageID], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + if isinstance(languageid, dict): + languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size + if "class_name" in languageid: + del languageid["class_name"] + languageid = ResNet1dLanguageID(**languageid) + else: + assert isinstance(languageid, ResNet1dLanguageID) + assert languageid.encoder_net.in_feats == hf_feats.hidden_size + + super().__init__(hf_feats, languageid, feat_fusion_start, feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + + base_args = HFWav2LanguageID.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ResNet1dLanguageID.filter_args(**kwargs["languageid"]) + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + ResNet1dLanguageID.add_class_args(parser, prefix="languageid") + HFWav2LanguageID.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ResNet1dLanguageID.filter_finetune_args(**kwargs["languageid"]) + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + ResNet1dLanguageID.add_finetune_args(parser, prefix="languageid") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/__init__.py b/hyperion/torch/trainers/__init__.py index 593cfa1f..5db38bf7 100644 --- a/hyperion/torch/trainers/__init__.py +++ b/hyperion/torch/trainers/__init__.py @@ -6,6 +6,8 @@ from .torch_trainer import TorchTrainer + +from .languageid_trainer import LanguageIDTrainer from .transducer_trainer import TransducerTrainer from .xvector_trainer import XVectorTrainer diff --git a/hyperion/torch/trainers/languageid_trainer.py b/hyperion/torch/trainers/languageid_trainer.py new file mode 100644 index 00000000..3a65bfde --- /dev/null +++ b/hyperion/torch/trainers/languageid_trainer.py @@ -0,0 +1,208 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import os +from collections import OrderedDict as ODict + +import logging + +import torch +import torchaudio +import torch.nn as nn + +from ..utils import MetricAcc +from .torch_trainer import TorchTrainer +from torch.distributed.elastic.multiprocessing.errors import record + + +class LanguageIDTrainer(TorchTrainer): + """Trainer to train Language identification style models. + + Attributes: + model: Language identification model object. + optim: pytorch optimizer object or options dict + epochs: max. 
number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object or options dict + loggers: LoggerList object, loggers write training progress to std. output and file. + If None, it uses default loggers. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + loss: if None, it uses cross-entropy + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp + """ + def __init__( + self, + model, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + eff_batch_size=None, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + loss=None, + train_mode="full", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + ): + + if loss is None: + loss = nn.CrossEntropyLoss() + super().__init__( + model, + loss, + optim, + epochs, + exp_path, + cur_epoch=cur_epoch, + grad_acc_steps=grad_acc_steps, + eff_batch_size=eff_batch_size, + device=device, + metrics=metrics, + lrsched=lrsched, + loggers=loggers, + ddp=ddp, + ddp_type=ddp_type, + train_mode=train_mode, + use_amp=use_amp, + log_interval=log_interval, + use_tensorboard=use_tensorboard, + use_wandb=use_wandb, + wandb=wandb, + grad_clip=grad_clip, + grad_clip_norm=grad_clip_norm, + swa_start=swa_start, + swa_lr=swa_lr, + swa_anneal_epochs=swa_anneal_epochs, + cpu_offload=cpu_offload, + ) + + @record + def train_epoch(self, data_loader): + """Training epoch loop + + Args: + data_loader: pytorch data loader returning features and class labels. 
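
[Editor's note] Before the epoch loop that follows, a quick illustration of the gradient-accumulation pattern it relies on: the loss is scaled by 1/grad_acc_steps and the optimizer steps only once every grad_acc_steps batches, emulating a larger effective batch size. This is a self-contained toy sketch with a made-up linear model, not the trainer itself.

import torch
import torch.nn as nn

model = nn.Linear(10, 3)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = nn.CrossEntropyLoss()
grad_acc_steps = 4

for batch in range(8):
    if batch % grad_acc_steps == 0:
        optimizer.zero_grad()
    x, y = torch.randn(2, 10), torch.randint(0, 3, (2,))
    loss = loss_fn(model(x), y) / grad_acc_steps  # scale so accumulated gradients average out
    loss.backward()
    if (batch + 1) % grad_acc_steps == 0:
        optimizer.step()  # one update per 4 batches -> effective batch size of 8
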
+ """ + + self.model.update_loss_margin(self.cur_epoch) + + metric_acc = MetricAcc(device=self.device) + batch_metrics = ODict() + self.model.train() + + for batch, (data, audio_length, target) in enumerate(data_loader): + self.loggers.on_batch_begin(batch) + + if batch % self.grad_acc_steps == 0: + self.optimizer.zero_grad() + data, audio_length, target = data.to(self.device), audio_length.to( + self.device), target.to(self.device) + batch_size = data.shape[0] + + with self.amp_autocast(): + # TODO: Check and Modify output, loss from the model + # output, loss = self.model(data, + # x_lengths=audio_length, + # y=target) + # loss = loss.mean() / self.grad_acc_steps + output = self.model(data, y=target) + loss = self.loss(output, target).mean() / self.grad_acc_steps + + if self.use_amp: + self.grad_scaler.scale(loss).backward() + else: + loss.backward() + + if (batch + 1) % self.grad_acc_steps == 0: + if self.lr_scheduler is not None and not self.in_swa: + self.lr_scheduler.on_opt_step() + self.update_model() + + batch_metrics["loss"] = loss.item() * self.grad_acc_steps + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + logs = metric_acc.metrics + logs["lr"] = self._get_lr() + self.loggers.on_batch_end(logs=logs, batch_size=batch_size) + + logs = metric_acc.metrics + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() + return logs + + # def validation_epoch(self, data_loader, swa_update_bn=False): + # """Validation epoch loop + + # Args: + # data_loader: PyTorch data loader return input/output pairs. + # sw_update_bn: wheter or not, update batch-norm layers in SWA. + # """ + + # metric_acc = MetricAcc(self.device) + # batch_metrics = ODict() + # with torch.no_grad(): + # if swa_update_bn: + # log_tag = "train_" + # self.train() + # else: + # log_tag = "val_" + # self.model.eval() + + # for batch, (data, audio_length, target) in enumerate(data_loader): + # data, audio_length, target = data.to( + # self.device), audio_length.to(self.device), target.to( + # self.device) + # batch_size = data.shape[0] + # # data, target = data.to(self.device), target.to(self.device) + # # batch_size = data.shape[0] + + # with self.amp_autocast(): + # output, loss = self.model(data, + # x_lengths=audio_length, + # y=target) + # # output = self.model(data) + # # loss = self.loss(output, target) + + # batch_metrics["loss"] = loss.mean().item() + # for k, metric in self.metrics.items(): + # batch_metrics[k] = metric(output, target) + + # metric_acc.update(batch_metrics, batch_size) + + # logs = metric_acc.metrics + # logs = ODict((log_tag + k, v) for k, v in logs.items()) + # return logs From b524b8491b7f4dac8c1b9a04a5db486d97c414d1 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Thu, 23 Mar 2023 20:36:56 -0400 Subject: [PATCH 08/89] Add Class Weighted Sampler for ASR and utterance-wise LID --- ...c2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml | 67 +++ .../v1/global_conf/config_lid_v2.0_13langs.sh | 44 ++ .../class_weighted_bucketing_seg_sampler.py | 251 +++++++++++ .../torch/data/class_weighted_seg_sampler.py | 392 ++++++++++++++++++ hyperion/torch/data/seg_sampler_factory.py | 13 +- hyperion/torch/trainers/languageid_trainer.py | 87 ++-- 6 files changed, 809 insertions(+), 45 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v2.0_13langs.sh create mode 100644 
hyperion/torch/data/class_weighted_bucketing_seg_sampler.py create mode 100644 hyperion/torch/data/class_weighted_seg_sampler.py diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..c06e46e8 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 2 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.5 + + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 2 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.5 + data_loader: + num_workers: 1 +model: wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/global_conf/config_lid_v2.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v2.0_13langs.sh new file mode 100644 index 00000000..851cbc18 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v2.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v2.0_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0022.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" 
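
[Editor's note] The weight_mode: "data-prior" and weight_exponent options in the sampler config above control how strongly the language distribution is flattened. The numbers below are invented and only illustrate how the exponent reshapes duration-proportional sampling probabilities.

import numpy as np

total_dur = np.array([1000.0, 100.0, 10.0])  # hypothetical hours of speech per language
for exponent in (1.0, 0.5, 0.0):
    w = total_dur ** exponent
    print(exponent, np.round(w / w.sum(), 3))
# 1.0 -> [0.901 0.09  0.009]   (pure data prior)
# 0.5 -> [0.706 0.223 0.071]   (partially flattened)
# 0.0 -> [0.333 0.333 0.333]   (uniform over languages)
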
+nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py b/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py new file mode 100644 index 00000000..94943ccc --- /dev/null +++ b/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py @@ -0,0 +1,251 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import math +from jsonargparse import ArgumentParser, ActionParser +import logging + +import numpy as np +import pandas as pd +import torch +import torch.distributed as dist + +from .hyp_sampler import HypSampler +from .class_weighted_seg_sampler import ClassWeightedRandomSegSampler + + +class ClassWeightedRandomBucketingSegSampler(HypSampler): + def __init__(self, + seg_set, + class_info, + base_sampler=ClassWeightedRandomSegSampler, + num_buckets=10, + length_column="duration", + weight_exponent=1.0, + weight_mode="custom", + seg_weight_mode="uniform", + class_name="language", + seed=1234, + **base_kwargs): + super().__init__(shuffle=False, seed=seed) + self.class_name = class_name + self.seg_set = seg_set + self.class_info = class_info + self.base_sampler = base_sampler + self.base_kwargs = base_kwargs + self.base_kwargs["seed"] = seed + self.num_buckets = num_buckets + self.length_column = length_column + self.weight_exponent = weight_exponent + self.weight_mode = weight_mode + self.seg_weight_mode = seg_weight_mode + self._gather_class_info() + self._set_class_weights() + self._create_bucket_samplers() + self._compute_len() + self.depleted_buckets = torch.zeros((num_buckets, ), dtype=torch.bool) + + def create_buckets(self): + # class_ids = self._sample_classes() + sort_idx = np.argsort(self.seg_set[self.length_column].values) + sorted_seg_set = self.seg_set.iloc[sort_idx] + cum_lengths = np.cumsum(sorted_seg_set[self.length_column].values, + axis=0) + bucket_length = cum_lengths[-1] / self.num_buckets + buckets = [] + for i in range(self.num_buckets): + # logging.info("self.seg_set", self.seg_set.get_col_idx(self.length_column)) + # logging.info("sorted_seg_set", sorted_seg_set.get_col_idx(self.length_column)) + bucket_idx = (cum_lengths <= bucket_length) & (cum_lengths > 0) + bucket_i = sorted_seg_set.loc[bucket_idx] + # logging.info("bucket_i", bucket_i.get_col_idx(self.length_column)) + buckets.append(bucket_i) + cum_lengths -= bucket_length + + return buckets + + def _create_bucket_samplers(self): + buckets = self.create_buckets() + bucket_samplers = [] + for i in range(self.num_buckets): + sampler_i = self.base_sampler(buckets[i], + self.class_info, + # weight_exponent=self.weight_exponent, + # weight_mode=self.weight_mode, + seg_weight_mode=self.seg_weight_mode, + class_name=self.class_name, + **self.base_kwargs) + bucket_samplers.append(sampler_i) + + self.bucket_samplers = bucket_samplers + + def __len__(self): + return self._len + + def _gather_class_info(self): + # we get some extra info that we need for the classes. + + # we need the maximum/minimum segment duration for each class. 
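
[Editor's note] The create_buckets method above groups segments so that each bucket holds roughly the same total amount of audio; batches are then drawn within a bucket, which keeps utterance lengths similar and reduces padding. A toy, self-contained sketch of that sort-and-split idea (illustrative data only, not the sampler class):

import numpy as np
import pandas as pd

seg_set = pd.DataFrame({"id": [f"u{i}" for i in range(8)],
                        "duration": [1., 9., 3., 7., 2., 8., 4., 6.]})
num_buckets = 2
sorted_segs = seg_set.iloc[np.argsort(seg_set["duration"].values)]
cum = np.cumsum(sorted_segs["duration"].values)
bucket_len = cum[-1] / num_buckets
for i in range(num_buckets):
    mask = (cum <= bucket_len) & (cum > 0)   # segments whose cumulative duration fits this bucket
    print(f"bucket {i}:", sorted_segs.loc[mask, "id"].tolist())
    cum -= bucket_len
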
+ total_dur = np.zeros(len(self.class_info)) + for i, c in enumerate(self.class_info["id"]): + seg_idx = self.seg_set[self.class_name] == c + if seg_idx.sum() > 0: + durs_i = self.seg_set.loc[seg_idx, self.length_column] + total_dur[i] = durs_i.sum() + else: + total_dur[i] = 0 + + self.class_info["total_duration"] = total_dur + # logging.info("total_duration", self.class_info["total_duration"]) + + # we need the mapping from class index to id + self.map_class_idx_to_ids = self.class_info[["class_idx", "id"]] + self.map_class_idx_to_ids.set_index("class_idx", inplace=True) + + def _set_class_weights(self): + # logging.info("setting class weights") + # logging.info(f'weight mode:{self.weight_mode}') + # logging.info(f'weight exponent:{self.weight_exponent}') + # import pdb; pdb.set_trace() + if self.weight_mode == "uniform": + self.class_info.set_uniform_weights() + elif self.weight_mode == "data-prior": + weights = self.class_info["total_duration"].values + self.class_info.set_weights(weights) + logging.info(f'data-prior weight:{self.class_info["weights"]}') + + if self.weight_exponent != 1.0: + self.class_info.exp_weights(self.weight_exponent) + logging.info(f'weight_exponent weight:{self.class_info["weights"]}') + + + def _compute_len(self): + self._len = 0 + for i in range(self.num_buckets): + self._len += len(self.bucket_samplers[i]) + + def set_epoch(self, epoch): + for i in range(self.num_buckets): + self.bucket_samplers[i].set_epoch(epoch) + + def __iter__(self): + super().__iter__() + self.depleted_buckets[:] = False + for i in range(self.num_buckets): + self.bucket_samplers[i].__iter__() + + return self + + def all_buckets_depleted(self): + return torch.all(self.depleted_buckets).item() + + def __next__(self): + + if self.batch == self._len or self.all_buckets_depleted(): + raise StopIteration + + while True: + bucket_idx = torch.randint(low=0, + high=self.num_buckets, + size=(1, ), + generator=self.rng).item() + if self.depleted_buckets[bucket_idx]: + continue + + bucket = self.bucket_samplers[bucket_idx] + try: + batch = next(bucket) + break + except StopIteration: + self.depleted_buckets[bucket_idx] = True + if self.all_buckets_depleted(): + raise StopIteration() + + if self.batch == 0: + logging.info("batch 0 chunks=%s", str(batch[:10])) + + self.batch += 1 + return batch + + @property + def avg_batch_size(self): + avg_batch_size = 0 + for sampler in self.bucket_samplers: + avg_batch_size += sampler.avg_batch_size + + avg_batch_size /= self.num_buckets + return avg_batch_size + + @staticmethod + def filter_args(**kwargs): + + valid_args = ( + "num_buckets", + "length_column", + "weight_exponent", + "weight_mode", + "seg_weight_mode", + "class_name", + "length_column", + "shuffle", + "seed", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + + parser.add_argument( + "--weight-exponent", + default=1.0, + type=float, + help=("exponent for class weights"), + ) + parser.add_argument( + "--weight-mode", + default="custom", + choices=["custom", "uniform", "data-prior"], + help=("method to get the class weights"), + ) + + parser.add_argument( + "--seg-weight-mode", + default="uniform", + choices=["uniform", "data-prior"], + help=("method to sample segments given a class"), + ) + + parser.add_argument( + "--shuffle", + action=ActionYesNo, + help="shuffles the segments or chunks at the beginning of the epoch", + ) + + 
parser.add_argument( + "--seed", + type=int, + default=1234, + help=("seed for sampler random number generator"), + ) + + parser.add_argument( + "--length-column", + default="duration", + help="which column in the segment table indicates the duration of the segment", + ) + parser.add_argument( + "--class-name", + default="class_id", + help="which column in the segment table indicates the class of the segment", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/data/class_weighted_seg_sampler.py b/hyperion/torch/data/class_weighted_seg_sampler.py new file mode 100644 index 00000000..09a34591 --- /dev/null +++ b/hyperion/torch/data/class_weighted_seg_sampler.py @@ -0,0 +1,392 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import math +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +import logging + +import numpy as np + +import torch +from .hyp_sampler import HypSampler + + +def get_loc(seg_set, keys): + if isinstance(keys, (list, np.ndarray)): + return seg_set.index.get_indexer(keys) + + loc = seg_set.index.get_loc(keys) + if isinstance(loc, int): + return loc + elif isinstance(loc, np.ndarray) and loc.dtype == np.bool: + return np.nonzero(loc)[0] + else: + return list(range(loc.start, loc.stop, loc.step)) + +class ClassWeightedRandomSegSampler(HypSampler): + def __init__( + self, + seg_set, + class_info, + min_batch_size=1, + max_batch_size=None, + max_batch_length=None, + length_name="duration", + shuffle=False, + drop_last=False, + # weight_exponent=1.0, + # weight_mode="custom", + seg_weight_mode="uniform", + num_segs_per_class=1, + class_name="class_id", + seed=1234, + ): + super().__init__(shuffle=shuffle, seed=seed) + self.class_info = class_info + # self.weight_exponent=weight_exponent + # self.weight_mode=weight_mode + self.seg_weight_mode = seg_weight_mode + self.num_segs_per_class = num_segs_per_class + self.class_name=class_name + self.seg_set = seg_set + self.min_batch_size = min_batch_size + self.max_batch_size = max_batch_size + self.max_batch_length = max_batch_length + self.var_batch_size = max_batch_length is not None + self.length_name = length_name + if self.var_batch_size: + avg_batch_size = max_batch_length / np.mean( + self.seg_set[self.length_name]) + else: + avg_batch_size = min_batch_size + + self.avg_batch_size = avg_batch_size + + if drop_last: + self._len = int( + len(self.seg_set) / (avg_batch_size * self.world_size)) + else: + self._len = int( + math.ceil( + (len(self.seg_set) // self.world_size) / avg_batch_size)) + + self._gather_class_info() + self._permutation = None + + + def _gather_class_info(self): + # we get some extra info that we need for the classes. + + # we need the maximum/minimum segment duration for each class. 
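
[Editor's note] For orientation, the class-weighted draw implemented in _sample_classes further below boils down to a torch.multinomial over the renormalized per-class weights, followed by picking segments from each chosen class. A standalone toy sketch (invented weights and labels):

import torch

rng = torch.Generator().manual_seed(1234)
class_ids = ["en", "fr", "it"]
weights = torch.tensor([0.6, 0.3, 0.1])          # already renormalized class weights
idx = torch.multinomial(weights, num_samples=8, replacement=True, generator=rng)
print([class_ids[i] for i in idx])               # languages drawn for one batch
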
+ max_dur = np.zeros(len(self.class_info)) + min_dur = np.zeros(len(self.class_info)) + total_dur = np.zeros(len(self.class_info)) + for i, c in enumerate(self.class_info["id"]): + seg_idx = self.seg_set[self.class_name] == c + if seg_idx.sum() > 0: + durs_i = self.seg_set.loc[seg_idx, self.length_name] + max_dur[i] = durs_i.max() + min_dur[i] = durs_i.min() + total_dur[i] = durs_i.sum() + else: + max_dur[i] = min_dur[i] = total_dur[i] = 0 + + self.class_info["max_seg_duration"] = max_dur + self.class_info["min_seg_duration"] = min_dur + self.class_info["total_duration"] = total_dur + # logging.info("total_duration", self.class_info["total_duration"]) + + # we need the mapping from class index to id + self.map_class_idx_to_ids = self.class_info[["class_idx", "id"]] + self.map_class_idx_to_ids.set_index("class_idx", inplace=True) + + # we need the list of segments from each class + # to speed up segment sampling + # searching then in each batch, it is too slow + map_class_to_segs = self.seg_set[["id", self.class_name]].set_index( + self.class_name + ) + self.map_class_to_segs_idx = {} + for class_id in self.class_info["id"].values: + if class_id in map_class_to_segs.index: + seg_ids = map_class_to_segs.loc[class_id, "id"] + if isinstance(seg_ids, str): + seg_ids = [seg_ids] + else: + seg_ids = seg_ids.values + + seg_idx = get_loc(self.seg_set,seg_ids) + else: + seg_idx = [] + self.class_info.loc[class_id, "weights"] = 0.0 + self.class_info.renorm_weights() + + self.map_class_to_segs_idx[class_id] = seg_idx + logging.info(f'weight_exponent weight:{self.class_info["weights"]}') + + + def _get_class_weights(self): + # if not self.var_weights: + # return torch.as_tensor(self.class_info["weights"].values) + + class_weights = self.class_info["weights"].values.copy() + # renormalize weights + class_weights /= class_weights.sum() + return torch.as_tensor(class_weights) + + def _sample_classes(self, num_classes): + weights = self._get_class_weights() + # logging.info("weights: %s", weights) + + row_idx = torch.multinomial( + weights, num_samples=num_classes, replacement=True, generator=self.rng, + ).numpy() + + class_ids = self.class_info.iloc[row_idx].id.values + + return class_ids + + + def _sample_segs(self, class_ids): + + dur_col_idx = self.seg_set.columns.get_loc(self.length_name) + id_col_idx = self.seg_set.columns.get_loc("id") + + seg_ids = [] + for c in class_ids: + # for each class we sample segments longer than chunk length + # get segments belonging to c + # t1 = time.time() + seg_idx_c = self.map_class_to_segs_idx[c] + # seg_idx_c = self.map_class_to_segs_idx[c] + # t2 = time.time() + durs = self.seg_set.iloc[seg_idx_c, dur_col_idx].values + # if self.class_info.loc[c, "min_seg_duration"] < chunk_length: + # mask = durs >= chunk_length + # seg_idx_c = seg_idx_c[mask] + # durs = durs[mask] + + # t3 = time.time() + # sample num_segs_per_class random segments + if len(seg_idx_c) == 0: + logging.error("no segments found with class=%s dur=%d", c, chunk_length) + if self.seg_weight_mode == "uniform": + sel_idx = torch.randint( + low=0, + high=len(seg_idx_c), + size=(self.num_segs_per_class,), + generator=self.rng, + ).numpy() + + elif self.seg_weight_mode == "data-prior": + weights = durs / durs.sum() + sel_idx = torch.multinomial( + torch.from_numpy(weights), + num_samples=self.num_segs_per_class, + replacement=True, + generator=self.rng, + ).numpy() + # t4 = time.time() + else: + raise ValueError("unknown seg-weight-mode=%s", self.seg_weight_mode) + + sel_seg_idx_c = seg_idx_c[sel_idx] + 
sel_seg_ids_c = list(self.seg_set.iloc[sel_seg_idx_c, id_col_idx]) + # t5 = time.time() + seg_ids.extend(sel_seg_ids_c) + # t6 = time.time() + # logging.info( + # "stime %f %f %f %f %f", t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5 + # ) + + return seg_ids + + def __len__(self): + return self._len + + def _shuffle_segs(self): + self._permutation = torch.randperm(len(self.seg_set), + generator=self.rng).numpy() + + def __iter__(self): + super().__iter__() + if self.shuffle: + self._shuffle_segs() + + self.start = self.rank + return self + + def __next__(self): + + if self.batch == self._len: + raise StopIteration + + + if self.var_batch_size: + column_idx = self.seg_set.columns.get_loc(self.length_name) + idxs = [] + max_length = 0 + batch_size = 0 + while True: + if self.shuffle: + idx = self._permutation[self.start] + else: + idx = self.start + + max_length = max(max_length, self.seg_set.iloc[idx, + column_idx]) + if max_length * (batch_size + 1) > self.max_batch_length: + break + + idxs.append(idx) + self.start = (self.start + self.world_size) % len(self.seg_set) + batch_size += 1 + if (self.max_batch_size is not None + and batch_size >= self.max_batch_size): + break + + assert len( + idxs + ) >= 1, f"increase max_batch_length {self.max_batch_length} >= {max_length}" + else: + stop = min(self.start + self.world_size * self.min_batch_size, + len(self.seg_set)) + if self.shuffle: + idxs = self._permutation[self.start:stop:self.world_size] + else: + idxs = slice(self.start, stop, self.world_size) + + self.start += self.world_size * self.min_batch_size + + + class_ids = self._sample_classes(batch_size) + seg_ids = self._sample_segs(class_ids) + + + # if "chunk_start" in self.seg_set: + # chunks = self.seg_set.iloc[idxs] + # seg_ids = [(id, s, d) for id, s, d in zip( + # chunks.seg_id, chunks.chunk_start, chunks[self.length_name])] + # else: + # seg_ids = self.seg_set.iloc[idxs].id.values + + if self.batch == 0: + logging.info("batch 0 seg_ids=%s", str(seg_ids[:10])) + + self.batch += 1 + return seg_ids + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "min_batch_size", + "max_batch_size", + "max_batch_length", + "length_name", + # "weight_exponent", + # "weight_mode", + "seg_weight_mode", + "num_segs_per_class", + "class_name", + "shuffle", + "drop_last", + "seed", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--min-batch-size", + type=int, + default=1, + help=("minimum batch size per gpu"), + ) + parser.add_argument( + "--max-batch-size", + type=int, + default=None, + help= + ("maximum batch size per gpu, if None, estimated from max_batch_length" + ), + ) + + parser.add_argument( + "--max-batch-duration", + type=float, + default=None, + help= + ("maximum accumlated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths" + ), + ) + + parser.add_argument( + "--drop-last", + action=ActionYesNo, + help="drops the last batch of the epoch", + ) + + parser.add_argument( + "--shuffle", + action=ActionYesNo, + help= + "shuffles the segments or chunks at the beginning of the epoch", + ) + + parser.add_argument( + "--seed", + type=int, + default=1234, + help=("seed for sampler random number generator"), + ) + + parser.add_argument( + "--length-name", + default="duration", + help= + "which column in the segment table indicates the duration of the file", 
+ ) + + + parser.add_argument( + "--weight-exponent", + default=1.0, + type=float, + help=("exponent for class weights"), + ) + parser.add_argument( + "--weight-mode", + default="custom", + choices=["custom", "uniform", "data-prior"], + help=("method to get the class weights"), + ) + + parser.add_argument( + "--num-segs-per-class", + type=int, + default=1, + help=("number of segments per class in batch"), + ) + parser.add_argument( + "--seg-weight-mode", + default="uniform", + choices=["uniform", "data-prior"], + help=("method to sample segments given a class"), + ) + parser.add_argument( + "--class-name", + default="class_id", + help="which column in the segment table indicates the class of the segment", + ) + + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/data/seg_sampler_factory.py b/hyperion/torch/data/seg_sampler_factory.py index 512f2f64..63b0cc86 100644 --- a/hyperion/torch/data/seg_sampler_factory.py +++ b/hyperion/torch/data/seg_sampler_factory.py @@ -13,13 +13,18 @@ from .class_weighted_seg_chunk_sampler import ClassWeightedRandomSegChunkSampler from .seg_chunk_sampler import SegChunkSampler from .bucketing_seg_sampler import BucketingSegSampler +from .class_weighted_bucketing_seg_sampler import ClassWeightedRandomBucketingSegSampler +from .class_weighted_seg_sampler import ClassWeightedRandomSegSampler + sampler_dict = { "class_weighted_random_seg_chunk_sampler": ClassWeightedRandomSegChunkSampler, "seg_sampler": SegSampler, + "class_weighted_seg_sampler": ClassWeightedRandomSegSampler, "seg_chunk_sampler": SegChunkSampler, "bucketing_seg_sampler": BucketingSegSampler, + "class_weighted_random_bucketing_seg_sampler": ClassWeightedRandomBucketingSegSampler, } @@ -45,7 +50,7 @@ def create( sampler_class = sampler_dict[sampler_type] sampler_kwargs = sampler_class.filter_args(**kwargs) - if sampler_type in ["bucketing_seg_sampler", "seg_chunk_sampler"]: + if sampler_type in ["bucketing_seg_sampler", "seg_chunk_sampler", "class_weighted_random_bucketing_seg_sampler"]: base_sampler_class = sampler_dict[base_sampler_type] base_sampler_kwargs = base_sampler_class.filter_args(**kwargs) sampler_kwargs.update(base_sampler_kwargs) @@ -55,7 +60,9 @@ def create( base_sampler_kwargs = base_sampler_class.filter_args(**kwargs) sampler_kwargs.update(base_sampler_kwargs) - if sampler_type in ["class_weighted_random_seg_chunk_sampler"]: + if sampler_type in ["class_weighted_random_seg_chunk_sampler", "class_weighted_random_bucketing_seg_sampler"]: + # import pdb; pdb.set_trace() + logging.info(f"sampler-args={sampler_kwargs}") try: class_name = sampler_kwargs["class_name"] except: @@ -110,7 +117,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--base-sampler-type", - choices=["seg_sampler", "bucketing_seg_sampler"], + choices=["seg_sampler", "bucketing_seg_sampler", "bucketing_seg_sampler","class_weighted_seg_sampler"], default="seg_sampler", help= "base sampler used for seg_chunk_sampler or bucketing_seg_sampler", diff --git a/hyperion/torch/trainers/languageid_trainer.py b/hyperion/torch/trainers/languageid_trainer.py index 3a65bfde..773402d5 100644 --- a/hyperion/torch/trainers/languageid_trainer.py +++ b/hyperion/torch/trainers/languageid_trainer.py @@ -164,45 +164,48 @@ def train_epoch(self, data_loader): logs["lr"] = self._get_lr() return logs - # def validation_epoch(self, data_loader, swa_update_bn=False): - # """Validation epoch loop - - # Args: - # data_loader: PyTorch data loader 
return input/output pairs. - # sw_update_bn: wheter or not, update batch-norm layers in SWA. - # """ - - # metric_acc = MetricAcc(self.device) - # batch_metrics = ODict() - # with torch.no_grad(): - # if swa_update_bn: - # log_tag = "train_" - # self.train() - # else: - # log_tag = "val_" - # self.model.eval() - - # for batch, (data, audio_length, target) in enumerate(data_loader): - # data, audio_length, target = data.to( - # self.device), audio_length.to(self.device), target.to( - # self.device) - # batch_size = data.shape[0] - # # data, target = data.to(self.device), target.to(self.device) - # # batch_size = data.shape[0] - - # with self.amp_autocast(): - # output, loss = self.model(data, - # x_lengths=audio_length, - # y=target) - # # output = self.model(data) - # # loss = self.loss(output, target) - - # batch_metrics["loss"] = loss.mean().item() - # for k, metric in self.metrics.items(): - # batch_metrics[k] = metric(output, target) - - # metric_acc.update(batch_metrics, batch_size) - - # logs = metric_acc.metrics - # logs = ODict((log_tag + k, v) for k, v in logs.items()) - # return logs + def validation_epoch(self, data_loader, swa_update_bn=False): + """Validation epoch loop + + Args: + data_loader: PyTorch data loader return input/output pairs. + sw_update_bn: wheter or not, update batch-norm layers in SWA. + """ + + metric_acc = MetricAcc(self.device) + batch_metrics = ODict() + with torch.no_grad(): + if swa_update_bn: + log_tag = "train_" + self.train() + else: + log_tag = "val_" + self.model.eval() + + for batch, (data, audio_length, target) in enumerate(data_loader): + data, audio_length, target = data.to( + self.device), audio_length.to(self.device), target.to( + self.device) + batch_size = data.shape[0] + # data, target = data.to(self.device), target.to(self.device) + # batch_size = data.shape[0] + + with self.amp_autocast(): + output = self.model(data, y=target) + loss = self.loss(output, target).mean() / self.grad_acc_steps + + # output, loss = self.model(data, + # x_lengths=audio_length, + # y=target) + # output = self.model(data) + # loss = self.loss(output, target) + + batch_metrics["loss"] = loss.mean().item() + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + + logs = metric_acc.metrics + logs = ODict((log_tag + k, v) for k, v in logs.items()) + return logs From 07ddda643d93699054961b3d9f351ca745f9757e Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sat, 25 Mar 2023 19:15:51 -0400 Subject: [PATCH 09/89] Remove the seg_weighted_mode for sequence-level task --- ...c2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml | 67 +++++++++++++++++++ .../conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml | 4 +- .../v1/global_conf/config_lid_v2.1_13langs.sh | 44 ++++++++++++ egs/commonvoice/v1/run_001_prepare_data.sh | 10 +-- egs/commonvoice/v1/run_012_train_lid.sh | 22 ++---- hyperion/bin/train_wav2vec2languageid.py | 15 +---- hyperion/torch/data/audio_dataset.py | 15 ----- .../class_weighted_bucketing_seg_sampler.py | 13 ---- .../torch/data/class_weighted_seg_sampler.py | 60 +++-------------- 9 files changed, 139 insertions(+), 111 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml new file mode 100644 index 
00000000..06d5697d --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 2 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 2 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + data_loader: + num_workers: 1 +model: wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml index 2e7574c2..5ca98bd9 100644 --- a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml @@ -32,9 +32,9 @@ languageid: inner_feats: 128 embed_dim: 192 cos_scale: 32.0 - margin: 0.2 + margin: 0.0 margin_warmup_epochs: 5 - intertop_margin: 0.1 + intertop_margin: 0.0 dropout_rate: 0.0 feat_fusion_method: weighted-avg feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh new file mode 100644 index 00000000..c5febd98 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v2.1_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0022.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.1.yaml +nnet_s2_args="" 
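+# Illustrative note on how this config is consumed (assuming the recipe's usual
+# flow): run_012_train_lid.sh sources it, trains stage 1 into $nnet_s1_dir, and
+# at stage 2 initializes finetune_wav2vec2languageid.py from the stage-1
+# checkpoint via --in-model-file $nnet_s1, writing stage-2 models to $nnet_s2_dir.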
+nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/run_001_prepare_data.sh b/egs/commonvoice/v1/run_001_prepare_data.sh index 7d05ba2c..d4873f0f 100755 --- a/egs/commonvoice/v1/run_001_prepare_data.sh +++ b/egs/commonvoice/v1/run_001_prepare_data.sh @@ -37,7 +37,7 @@ if [ ${stage} -le 2 ]; then # for part in $test_data $dev_data $nnet_data for lan in $lans do - for part in ${lan}_test ${lan}_dev ${lan}_train + for part in ${lan}_train # ${lan}_test ${lan}_dev do echo ${part} steps_transducer/preprocess_audios_for_nnet_train.sh --nj 16 --cmd "$train_cmd" \ @@ -59,8 +59,10 @@ if [ ${stage} -le 3 ]; then train_folders+="data/${lan}_train_proc_audio " done - combine_data.sh data/dev_data/ $dev_folders - combine_data.sh data/nnet_data/ $train_folders - + combine_data.sh data/${dev_data}/ $dev_folders + combine_data.sh data/${nnet_data}/ $train_folders + awk 'BEGIN {FS = " "} NR == FNR {a[$1]=$2; next} {print $1 "," a[$1] "," $2}' data/13_langs_dev_proc_audio/utt2lang data/13_langs_dev_proc_audio/utt2spk > data/13_langs_dev_proc_audio/data/13_langs_train_proc_audio/utt2seg.csv + awk 'BEGIN {FS = " "} NR == FNR {a[$1]=$2; next} {print $1 "," a[$1] "," $2}' data/13_langs_train_proc_audio/utt2lang data/13_langs_train_proc_audio/utt2spk > data/13_langs_train_proc_audio/data/13_langs_train_proc_audio/utt2seg.csv + # cut -d' ' -f1 --complement data/${nnet_data}/text > data/lm/${lan}_transcript_words.txt fi \ No newline at end of file diff --git a/egs/commonvoice/v1/run_012_train_lid.sh b/egs/commonvoice/v1/run_012_train_lid.sh index 80948243..3b250e16 100755 --- a/egs/commonvoice/v1/run_012_train_lid.sh +++ b/egs/commonvoice/v1/run_012_train_lid.sh @@ -8,7 +8,7 @@ set -e stage=1 -ngpu=1 +ngpu=2 config_file=default_config.sh interactive=false num_workers="" @@ -49,17 +49,14 @@ if [ $stage -le 1 ]; then train_wav2vec2languageid.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ - --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ - --data.train.dataset.language-id-file $train_dir/utt2lang \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ - --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ - --data.val.dataset.class-files $val_dir/langs \ - --data.val.dataset.language-id-file $val_dir/utt2lang \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s1_dir $args \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ @@ -81,17 +78,14 @@ if [ $stage -le 2 ]; then finetune_wav2vec2languageid.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ - --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ 
- --data.train.dataset.language-id-file $train_dir/utt2lang \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ - --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ - --data.val.dataset.class-files $val_dir/langs \ - --data.val.dataset.language-id-file $val_dir/utt2lang \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ --in-model-file $nnet_s1 \ @@ -115,17 +109,15 @@ if [ $stage -le 3 ]; then finetune_wav2vec2languageid.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ - --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ - --data.train.dataset.language-id-file $train_dir/utt2lang \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ - --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $val_dir/langs \ - --data.val.dataset.language-id-file $val_dir/utt2lang \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ --in-model-file $nnet_s2 \ diff --git a/hyperion/bin/train_wav2vec2languageid.py b/hyperion/bin/train_wav2vec2languageid.py index 093042f6..de5b2f2d 100755 --- a/hyperion/bin/train_wav2vec2languageid.py +++ b/hyperion/bin/train_wav2vec2languageid.py @@ -175,21 +175,10 @@ def make_parser(model_class): ) parser.add_argument("--data.val.dataset.text_file", type=str) - parser.add_argument("--data.train.dataset.language_id_file", type=str) - parser.add_argument("--data.val.dataset.language_id_file", type=str) - - parser.add_argument( - "--data.train.dataset.class_files", - type=str, + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" ) - - - parser.add_argument( - "--data.dev.dataset.class_files", - type=str, - ) - parser.add_argument( "--data.train.dataset.class_names", type=str, diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 3bfa328b..230b7220 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -467,7 +467,6 @@ def __init__( bpe_model=None, text_file=None, time_durs_file=None, - language_id_file=None, aug_cfgs=None, num_augs=1, return_segment_info=None, @@ -513,12 +512,6 @@ def __init__( else: assert "duration" in self.seg_set - if language_id_file is not None: - if rank == 0: - logging.info("loading language id file %s" % language_id_file) - - language_ids = SegmentSet.load(language_id_file) - self.seg_set["language"] = language_ids.loc[self.seg_set["id"]].class_id logging.info("loading class-info files") @@ -775,7 +768,6 @@ def filter_args(**kwargs): "return_segment_info", "return_orig", "time_durs_file", - "language_id_file", "target_sample_freq", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) @@ -819,13 +811,6 @@ def add_class_args(parser, prefix=None, skip={}): ), ) - parser.add_argument( - "--language-id-file", - default=None, - help=( - "file with language ids for each utterance" - ), - ) parser.add_argument( 
"--bpe-model", diff --git a/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py b/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py index 94943ccc..749d0558 100644 --- a/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py +++ b/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py @@ -25,7 +25,6 @@ def __init__(self, length_column="duration", weight_exponent=1.0, weight_mode="custom", - seg_weight_mode="uniform", class_name="language", seed=1234, **base_kwargs): @@ -40,7 +39,6 @@ def __init__(self, self.length_column = length_column self.weight_exponent = weight_exponent self.weight_mode = weight_mode - self.seg_weight_mode = seg_weight_mode self._gather_class_info() self._set_class_weights() self._create_bucket_samplers() @@ -72,9 +70,6 @@ def _create_bucket_samplers(self): for i in range(self.num_buckets): sampler_i = self.base_sampler(buckets[i], self.class_info, - # weight_exponent=self.weight_exponent, - # weight_mode=self.weight_mode, - seg_weight_mode=self.seg_weight_mode, class_name=self.class_name, **self.base_kwargs) bucket_samplers.append(sampler_i) @@ -186,7 +181,6 @@ def filter_args(**kwargs): "length_column", "weight_exponent", "weight_mode", - "seg_weight_mode", "class_name", "length_column", "shuffle", @@ -216,13 +210,6 @@ def add_class_args(parser, prefix=None): help=("method to get the class weights"), ) - parser.add_argument( - "--seg-weight-mode", - default="uniform", - choices=["uniform", "data-prior"], - help=("method to sample segments given a class"), - ) - parser.add_argument( "--shuffle", action=ActionYesNo, diff --git a/hyperion/torch/data/class_weighted_seg_sampler.py b/hyperion/torch/data/class_weighted_seg_sampler.py index 09a34591..c56a96a7 100644 --- a/hyperion/torch/data/class_weighted_seg_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_sampler.py @@ -6,6 +6,8 @@ import math from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import logging +import copy + import numpy as np @@ -36,18 +38,12 @@ def __init__( length_name="duration", shuffle=False, drop_last=False, - # weight_exponent=1.0, - # weight_mode="custom", - seg_weight_mode="uniform", num_segs_per_class=1, class_name="class_id", seed=1234, ): super().__init__(shuffle=shuffle, seed=seed) - self.class_info = class_info - # self.weight_exponent=weight_exponent - # self.weight_mode=weight_mode - self.seg_weight_mode = seg_weight_mode + self.class_info = copy.deepcopy(class_info) self.num_segs_per_class = num_segs_per_class self.class_name=class_name self.seg_set = seg_set @@ -120,6 +116,7 @@ def _gather_class_info(self): seg_idx = get_loc(self.seg_set,seg_ids) else: seg_idx = [] + logging.warning("no segments found with class=%s", class_id) self.class_info.loc[class_id, "weights"] = 0.0 self.class_info.renorm_weights() @@ -172,25 +169,13 @@ def _sample_segs(self, class_ids): # sample num_segs_per_class random segments if len(seg_idx_c) == 0: logging.error("no segments found with class=%s dur=%d", c, chunk_length) - if self.seg_weight_mode == "uniform": - sel_idx = torch.randint( - low=0, - high=len(seg_idx_c), - size=(self.num_segs_per_class,), - generator=self.rng, - ).numpy() - - elif self.seg_weight_mode == "data-prior": - weights = durs / durs.sum() - sel_idx = torch.multinomial( - torch.from_numpy(weights), - num_samples=self.num_segs_per_class, - replacement=True, - generator=self.rng, - ).numpy() - # t4 = time.time() - else: - raise ValueError("unknown seg-weight-mode=%s", self.seg_weight_mode) + + sel_idx = torch.randint( + low=0, + 
high=len(seg_idx_c), + size=(self.num_segs_per_class,), + generator=self.rng, + ).numpy() sel_seg_idx_c = seg_idx_c[sel_idx] sel_seg_ids_c = list(self.seg_set.iloc[sel_seg_idx_c, id_col_idx]) @@ -285,9 +270,6 @@ def filter_args(**kwargs): "max_batch_size", "max_batch_length", "length_name", - # "weight_exponent", - # "weight_mode", - "seg_weight_mode", "num_segs_per_class", "class_name", "shuffle", @@ -354,32 +336,12 @@ def add_class_args(parser, prefix=None): "which column in the segment table indicates the duration of the file", ) - - parser.add_argument( - "--weight-exponent", - default=1.0, - type=float, - help=("exponent for class weights"), - ) - parser.add_argument( - "--weight-mode", - default="custom", - choices=["custom", "uniform", "data-prior"], - help=("method to get the class weights"), - ) - parser.add_argument( "--num-segs-per-class", type=int, default=1, help=("number of segments per class in batch"), ) - parser.add_argument( - "--seg-weight-mode", - default="uniform", - choices=["uniform", "data-prior"], - help=("method to sample segments given a class"), - ) parser.add_argument( "--class-name", default="class_id", From 396e020276cb55c864b2845836e5713df6daf84b Mon Sep 17 00:00:00 2001 From: ylu125 Date: Mon, 27 Mar 2023 00:00:03 -0400 Subject: [PATCH 10/89] Update the LID trainer for merging the new dataloader --- egs/commonvoice/v1/run_011_train_asr.sh | 32 +++++-- hyperion/bin/train_wav2vec2languageid.py | 21 ++++- hyperion/torch/trainers/__init__.py | 3 - hyperion/torch/trainers/languageid_trainer.py | 91 ++++++++++--------- 4 files changed, 86 insertions(+), 61 deletions(-) diff --git a/egs/commonvoice/v1/run_011_train_asr.sh b/egs/commonvoice/v1/run_011_train_asr.sh index 1b402133..e79de7af 100755 --- a/egs/commonvoice/v1/run_011_train_asr.sh +++ b/egs/commonvoice/v1/run_011_train_asr.sh @@ -7,8 +7,20 @@ . 
./path.sh set -e +# export CUDA_VISIBLE_DEVICES=0 + +#ml purge +#module load namd/2.14-cuda-smp +#module load cuda/11.6.0 +#ml +#nvidia-smi +#export CUDA_VISIBLE_DEVICES=0,1,2,3 +#export CONV_RSH=ssh +#export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH + + stage=1 -ngpu=1 +ngpu=2 config_file=default_config.sh interactive=false num_workers="" @@ -47,14 +59,18 @@ if [ $stage -le 1 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ - train_wav2vec2transducer.py $nnet_type \ + train_wav2vec2rnn_transducer.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ - --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ - --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s1_dir $args \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ @@ -76,11 +92,11 @@ if [ $stage -le 2 ]; then finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ - --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ - --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ --in-model-file $nnet_s1 \ @@ -104,11 +120,11 @@ if [ $stage -le 3 ]; then finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ - --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ - --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ --in-model-file $nnet_s2 \ diff --git a/hyperion/bin/train_wav2vec2languageid.py b/hyperion/bin/train_wav2vec2languageid.py index de5b2f2d..7af47d03 100755 --- a/hyperion/bin/train_wav2vec2languageid.py +++ b/hyperion/bin/train_wav2vec2languageid.py @@ -42,16 +42,27 @@ def Language_collate(batch): audio_length = [] language = [] for record in batch: - wav = torch.as_tensor(record[0]) + wav = torch.as_tensor(record["x"]) audio.append(wav) audio_length.append(wav.shape[0]) - language.append(record[1]) - audio = pad_sequence(audio) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, 
descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + + language = [language[k] for k in sort_idx] language = torch.as_tensor(language) - - return torch.transpose(audio, 0, 1), audio_length, language + batch = { + "x": audio, + "x_lengths": audio_length, + "language": language, + } + return batch def init_data(partition, rank, num_gpus, **kwargs): data_kwargs = kwargs["data"][partition] diff --git a/hyperion/torch/trainers/__init__.py b/hyperion/torch/trainers/__init__.py index e1f6824f..212f0e92 100644 --- a/hyperion/torch/trainers/__init__.py +++ b/hyperion/torch/trainers/__init__.py @@ -5,13 +5,10 @@ from .dvae_trainer import DVAETrainer from .torch_trainer import TorchTrainer -<<<<<<< HEAD from .languageid_trainer import LanguageIDTrainer -======= ->>>>>>> hyp/persephone-asr from .transducer_trainer import TransducerTrainer from .vae_trainer import VAETrainer from .vq_dvae_trainer import VQDVAETrainer diff --git a/hyperion/torch/trainers/languageid_trainer.py b/hyperion/torch/trainers/languageid_trainer.py index 773402d5..0770cb8f 100644 --- a/hyperion/torch/trainers/languageid_trainer.py +++ b/hyperion/torch/trainers/languageid_trainer.py @@ -2,18 +2,19 @@ Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os from collections import OrderedDict as ODict -import logging - import torch -import torchaudio import torch.nn as nn +import torchaudio +from jsonargparse import ActionParser, ArgumentParser +from torch.distributed.elastic.multiprocessing.errors import record -from ..utils import MetricAcc +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset from .torch_trainer import TorchTrainer -from torch.distributed.elastic.multiprocessing.errors import record class LanguageIDTrainer(TorchTrainer): @@ -75,38 +76,14 @@ def __init__( swa_lr=1e-3, swa_anneal_epochs=10, cpu_offload=False, + input_key="x", + target_key="language", ): if loss is None: loss = nn.CrossEntropyLoss() - super().__init__( - model, - loss, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - eff_batch_size=eff_batch_size, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) @record def train_epoch(self, data_loader): @@ -115,6 +92,9 @@ def train_epoch(self, data_loader): Args: data_loader: pytorch data loader returning features and class labels. 
""" + batch_keys = [ + self.input_key, f"{self.input_key}_lengths", self.target_key + ] self.model.update_loss_margin(self.cur_epoch) @@ -122,14 +102,14 @@ def train_epoch(self, data_loader): batch_metrics = ODict() self.model.train() - for batch, (data, audio_length, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data, audio_length, target = data.to(self.device), audio_length.to( - self.device), target.to(self.device) - batch_size = data.shape[0] + input_data, input_lengths, target = tensors_subset( + data, batch_keys, self.device) + batch_size = input_data.shape[0] with self.amp_autocast(): # TODO: Check and Modify output, loss from the model @@ -137,7 +117,7 @@ def train_epoch(self, data_loader): # x_lengths=audio_length, # y=target) # loss = loss.mean() / self.grad_acc_steps - output = self.model(data, y=target) + output = self.model(input_data, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: @@ -171,7 +151,9 @@ def validation_epoch(self, data_loader, swa_update_bn=False): data_loader: PyTorch data loader return input/output pairs. sw_update_bn: wheter or not, update batch-norm layers in SWA. """ - + batch_keys = [ + self.input_key, f"{self.input_key}_lengths", self.target_key + ] metric_acc = MetricAcc(self.device) batch_metrics = ODict() with torch.no_grad(): @@ -182,16 +164,15 @@ def validation_epoch(self, data_loader, swa_update_bn=False): log_tag = "val_" self.model.eval() - for batch, (data, audio_length, target) in enumerate(data_loader): - data, audio_length, target = data.to( - self.device), audio_length.to(self.device), target.to( - self.device) - batch_size = data.shape[0] + for batch, data in enumerate(data_loader): + input_data, input_lengths, target = tensors_subset( + data, batch_keys, self.device) + batch_size = input_data.shape[0] # data, target = data.to(self.device), target.to(self.device) # batch_size = data.shape[0] with self.amp_autocast(): - output = self.model(data, y=target) + output = self.model(input_data, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps # output, loss = self.model(data, @@ -209,3 +190,23 @@ def validation_epoch(self, data_loader, swa_update_bn=False): logs = metric_acc.metrics logs = ODict((log_tag + k, v) for k, v in logs.items()) return logs + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + super_skip = skip.copy() + super_skip.add("target_key") + TorchTrainer.add_class_args(parser, + train_modes=train_modes, + skip=super_skip) + if "target_key" not in skip: + parser.add_argument("--target-key", + default="language", + help="dict. 
key for nnet targets") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) From 2ecdebfedacf1d55750c2d4c99c178b5ecdfe727 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Mon, 27 Mar 2023 02:28:50 -0400 Subject: [PATCH 11/89] add commonvoice config for rnnt transducer --- ...v2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml | 83 +++++++++++++++++++ .../config_pruned_transducer_v1.3_13langs.sh | 44 ++++++++++ 2 files changed, 127 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml new file mode 100644 index 00000000..3712babc --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml @@ -0,0 +1,83 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 70. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 70. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh new file mode 100644 index 00000000..575a8436 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio 
kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +# bpe_model=data/13_langs_lang_bpe_8000/bpe.model +bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v1.3_13_langs_16000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0019.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From d33abe910668f6b710eeab55233c1acadd182ae4 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Fri, 21 Apr 2023 23:34:57 -0400 Subject: [PATCH 12/89] Add fine-tuning code for pruned RNN-T, LID, and Both --- ...v2vec2base_rnnt_k2_pruned_stage1_v3.0.yaml | 86 ++++ ...v2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml | 86 ++++ ...v2vec2base_rnnt_k2_pruned_stage1_v3.2.yaml | 86 ++++ ...v2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml | 86 ++++ ...v2vec2base_rnnt_k2_pruned_stage2_v3.1.yaml | 73 +++ ...v2vec2base_rnnt_k2_pruned_stage2_v3.2.yaml | 71 +++ ...v2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml | 73 +++ ...c2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml | 6 +- ...c2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml | 69 +++ ...c2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml | 71 +++ ...wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml | 41 ++ .../v1/global_conf/config_lid_v2.2_13langs.sh | 44 ++ .../config_pruned_transducer_v3.0_13langs.sh | 44 ++ .../config_pruned_transducer_v3.1_13langs.sh | 44 ++ .../config_pruned_transducer_v3.2_13langs.sh | 44 ++ .../config_pruned_transducer_v4.0_13langs.sh | 44 ++ hyperion/bin/finetune_wav2vec2languageid.py | 267 +++++++++++ .../bin/finetune_wav2vec2rnn_transducer.py | 248 ++++++++++ .../finetune_wav2vec2transducer_languageid.py | 255 +++++++++++ hyperion/np/augment/noise_augment.py | 2 +- .../class_weighted_bucketing_seg_sampler.py | 27 ++ .../torch/data/class_weighted_seg_sampler.py | 14 +- hyperion/torch/data/seg_sampler_factory.py | 10 + hyperion/torch/models/__init__.py | 1 + .../torch/models/transducer/rnn_transducer.py | 4 +- .../wav2transducer_languageid/__init__.py | 7 + .../hf_wav2rnn_transducer_languageid.py | 428 ++++++++++++++++++ .../hf_wav2vec2rnn_transducer_languageid.py | 119 +++++ hyperion/torch/trainers/__init__.py | 1 + .../trainers/transducer_languageid_trainer.py | 222 +++++++++ 30 files changed, 2566 insertions(+), 7 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.2.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.1.yaml create mode 100644 
egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.2.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.0_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.1_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.2_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh create mode 100755 hyperion/bin/finetune_wav2vec2languageid.py create mode 100755 hyperion/bin/finetune_wav2vec2rnn_transducer.py create mode 100755 hyperion/bin/finetune_wav2vec2transducer_languageid.py create mode 100644 hyperion/torch/models/wav2transducer_languageid/__init__.py create mode 100644 hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py create mode 100644 hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py create mode 100644 hyperion/torch/trainers/transducer_languageid_trainer.py diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.0.yaml new file mode 100644 index 00000000..4718389d --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.0.yaml @@ -0,0 +1,86 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 80 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 1.0 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 80 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 1.0 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.2 + rnn_dropout_rate: 0.2 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 
1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml new file mode 100644 index 00000000..f41f8dad --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml @@ -0,0 +1,86 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.2.yaml new file mode 100644 index 00000000..fbadc196 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.2.yaml @@ -0,0 +1,86 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.2 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + 
class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 10 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml new file mode 100644 index 00000000..f41f8dad --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml @@ -0,0 +1,86 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.1.yaml new file mode 100644 index 00000000..9db63d77 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.1.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + 
sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40. + max_audio_length: 20. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40. + max_audio_length: 20. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + transducer: + decoder: + prune_range: 15 + override_dropouts: false +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.2.yaml new file mode 100644 index 00000000..85970fa6 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.2.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.2 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + transducer: + decoder: + override_dropouts: false +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml new file mode 100644 index 00000000..9db63d77 --- /dev/null +++ 
b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40. + max_audio_length: 20. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40. + max_audio_length: 20. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + transducer: + decoder: + prune_range: 15 + override_dropouts: false +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml index 06d5697d..0bb34b23 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.1.yaml @@ -17,10 +17,11 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 data_loader: - num_workers: 1 + num_workers: 4 val: dataset: aug_cfgs: @@ -39,8 +40,9 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 data_loader: - num_workers: 1 + num_workers: 4 model: wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml trainer: optim: diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml new file mode 100644 index 00000000..77cd2d26 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 85. 
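+      # Illustrative reading of the class-weighting options below (an assumption
+      # based on the class_weighted_seg_sampler convention, not spelled out here):
+      # with weight_mode "data-prior" and weight_exponent 0.3, each language is
+      # drawn with probability roughly proportional to prior**0.3 after
+      # renormalization, so low-resource languages are over-sampled relative to
+      # their raw share of the training data.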
+ min_batch_size: 2 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 2 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 420000 + hold_steps: 300000 + min_lr: 4e-5 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml new file mode 100644 index 00000000..c73c7130 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50. + max_audio_length: 20. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50. + max_audio_length: 20. 
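+      # Note (assumption about the max_audio_length option this stage-2 config
+      # introduces): it appears to cap individual segments at ~20 s so that full
+      # fine-tuning of the wav2vec2 encoder (train_mode: full) fits in GPU memory;
+      # the frozen-encoder stage-1 configs do not set it.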
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + languageid: + cos_scale: 32.0 +trainer: + optim: + opt_type: sgd + lr: 0.0005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 420000 + hold_steps: 300000 + min_lr: 4e-5 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml new file mode 100644 index 00000000..7d6d9473 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml @@ -0,0 +1,41 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.0 + margin_warmup_epochs: 5 + intertop_margin: 0.0 + dropout_rate: 0.2 + +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh new file mode 100644 index 00000000..debd9377 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v2.2_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0014.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v2.2_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git 
a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.0_13langs.sh new file mode 100644 index 00000000..0f66c12a --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + +bpe_model=data/13_langs_lang_bpe_4000/bpe.model +# bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v3.0_13_langs_4000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0019.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.1_13langs.sh new file mode 100644 index 00000000..3fb2f93a --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.1_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v3.1_13_langs_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0010.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v3.1.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + 
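+# stage-3 (s3) settings: nnet_s3 is assigned twice below, so the second assignment (model_ep0005.pth) is the checkpoint actually used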
+nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.2_13langs.sh new file mode 100644 index 00000000..4a990e2c --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v3.2_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +# bpe_model=data/13_langs_lang_bpe_8000/bpe.model +bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v3.2_13_langs_16000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0001.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh new file mode 100644 index 00000000..29a762fa --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name 
+nnet_s1=$nnet_s1_dir/model_ep0007.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/bin/finetune_wav2vec2languageid.py b/hyperion/bin/finetune_wav2vec2languageid.py new file mode 100755 index 00000000..4ac24e98 --- /dev/null +++ b/hyperion/bin/finetune_wav2vec2languageid.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import sys +import os +from pathlib import Path +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging +import multiprocessing + +import numpy as np + +import torch +import torch.nn as nn + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.utils import ddp +from hyperion.torch.trainers import LanguageIDTrainer as Trainer +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import HFWav2Vec2ResNet1dLanguageID +from torch.nn.utils.rnn import pad_sequence + +model_dict = { + "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dLanguageID, + # "hf_hubert2resnet1d": HFHubert2ResNet1LanguageID, + # "hf_wavlm2resnet1d": HFWavLM2ResNet1dLanguageID, +} + + +def Language_collate(batch): + audio = [] + audio_length = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + batch = { + "x": audio, + "x_lengths": audio_length, + "language": language, + } + return batch + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=sampler, + **largs, + 
collate_fn=Language_collate) + return data_loader + + +def init_model(num_classes, in_model_file, rank, model_class, **kwargs): + model_args = model_class.filter_finetune_args(**kwargs["model"]) + if rank == 0: + logging.info("model network ft args={}".format(model_args)) + + model_args["languageid"]["num_classes"] = num_classes + model = TML.load(in_model_file) + logging.info(model_args) + model.change_config(**model_args) + + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + + model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + parser.add_argument("--data.val.dataset.text_file", type=str) + + + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.add_argument( + "--data.train.dataset.class_names", + type=str, + ) + + parser.add_argument( + "--data.dev.dataset.class_names", + type=str, + ) + + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + parser.add_argument("--in-model-file", required=True) + model_class.add_finetune_args(parser, prefix="model") + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = 
ArgumentParser( + description="Train Wav2Vec2Language model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/finetune_wav2vec2rnn_transducer.py b/hyperion/bin/finetune_wav2vec2rnn_transducer.py new file mode 100755 index 00000000..4092ecd7 --- /dev/null +++ b/hyperion/bin/finetune_wav2vec2rnn_transducer.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer) +from hyperion.torch.trainers import TransducerTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + +model_dict = { + "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, + "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, + # "hf_hubert2rnn_transducer": HFWav2Vec2RNNTransducer, + # "hf_hubert2rnn_rnn_transducer": Hubert2RNNRNNTransducer, + # "hf_wavlm2rnn_transducer": HFHubert2RNNTransducer, + # "hf_wavlm2rnn_rnn_transducer": HFWavLM2RNNRNNTransducer, +} + + +def transducer_collate(batch): + audio = [] + audio_length = [] + target = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + } + return batch + + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + 
logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate) + return data_loader + + +def init_model(in_model_file, rank, model_class, **kwargs): + model_args = model_class.filter_finetune_args(**kwargs["model"]) + # model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network ft args={}".format(model_args)) + model = TML.load(in_model_file) + model.change_config(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + + + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model(**kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + parser.link_arguments( + "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" + ) + + + parser.add_argument("--in-model-file", required=True) + model_class.add_finetune_args(parser, prefix="model") + # model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + 
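+    # distributed data parallel (DDP) options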
ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser(description="Fine-tune Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/finetune_wav2vec2transducer_languageid.py b/hyperion/bin/finetune_wav2vec2transducer_languageid.py new file mode 100755 index 00000000..0628f3da --- /dev/null +++ b/hyperion/bin/finetune_wav2vec2transducer_languageid.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2RNNTransducerResnet1D) +from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + +model_dict = { + "hf_wav2vec2rnn_transducer_resnet1d": HFWav2Vec2RNNTransducerResnet1D, + +} + + +def transducer_language_collate(batch): + audio = [] + audio_length = [] + text = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + text.append(record["text"]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + text = [text[k] for k in sort_idx] + text = k2.RaggedTensor(text) + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + batch = { + "x": audio, + "x_lengths": audio_length, + "text": text, + "languageid": language, + } + return batch + + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler 
args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_language_collate) + return data_loader + + +def init_model(num_classes, in_model_transducer, in_model_lid, rank, model_class, **kwargs): + model_args = model_class.filter_finetune_args(**kwargs["model"]) + # model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network ft args={}".format(model_args)) + model_wav2transducer = TML.load(in_model_transducer) + model_wav2lid = TML.load(in_model_lid) + model_args["languageid"]["num_classes"] = num_classes + logging.info(model_args) + model = model_class(model_wav2transducer.hf_feats, model_wav2transducer.transducer, model_wav2lid.languageid) + model.change_config(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + + + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", 
type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + parser.link_arguments( + "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" + ) + + + parser.add_argument("--in-model-transducer", required=True) + parser.add_argument("--in-model-lid", required=True) + model_class.add_finetune_args(parser, prefix="model") + # model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser(description="Fine-tune Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/np/augment/noise_augment.py b/hyperion/np/augment/noise_augment.py index 799db930..fe54f385 100644 --- a/hyperion/np/augment/noise_augment.py +++ b/hyperion/np/augment/noise_augment.py @@ -55,7 +55,7 @@ def __init__( @staticmethod def _power(x): """Computes power of x in dB.""" - return 10 * np.log10((x ** 2).sum()) + return 10 * np.log10(((x+1e-5) ** 2).sum()) @staticmethod def snr(x, n): diff --git a/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py b/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py index 749d0558..1509d446 100644 --- a/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py +++ b/hyperion/torch/data/class_weighted_bucketing_seg_sampler.py @@ -23,9 +23,11 @@ def __init__(self, base_sampler=ClassWeightedRandomSegSampler, num_buckets=10, length_column="duration", + num_chunks_per_seg_epoch=1.0, weight_exponent=1.0, weight_mode="custom", class_name="language", + max_audio_length=None, seed=1234, **base_kwargs): super().__init__(shuffle=False, seed=seed) @@ -37,7 +39,9 @@ def __init__(self, self.base_kwargs["seed"] = seed self.num_buckets = num_buckets self.length_column = length_column + self.num_chunks_per_seg_epoch = num_chunks_per_seg_epoch self.weight_exponent = weight_exponent + self.max_audio_length = max_audio_length self.weight_mode = weight_mode self._gather_class_info() self._set_class_weights() @@ -49,6 +53,10 @@ def create_buckets(self): # class_ids = self._sample_classes() sort_idx = np.argsort(self.seg_set[self.length_column].values) sorted_seg_set = self.seg_set.iloc[sort_idx] + # import pdb; pdb.set_trace() + # remove audio length larger than max_audio_length + if self.max_audio_length is not None: + sorted_seg_set = sorted_seg_set.loc[sorted_seg_set[self.length_column] <= self.max_audio_length] cum_lengths = 
np.cumsum(sorted_seg_set[self.length_column].values, axis=0) bucket_length = cum_lengths[-1] / self.num_buckets @@ -71,6 +79,7 @@ def _create_bucket_samplers(self): sampler_i = self.base_sampler(buckets[i], self.class_info, class_name=self.class_name, + num_chunks_per_seg_epoch=self.num_chunks_per_seg_epoch, **self.base_kwargs) bucket_samplers.append(sampler_i) @@ -179,8 +188,10 @@ def filter_args(**kwargs): valid_args = ( "num_buckets", "length_column", + "num_chunks_per_seg_epoch", "weight_exponent", "weight_mode", + "max_audio_length", "class_name", "length_column", "shuffle", @@ -197,12 +208,28 @@ def add_class_args(parser, prefix=None): parser = ArgumentParser(prog="") + parser.add_argument( + "--num-chunks-per-seg-epoch", + default=1, + type=lambda x: x if x == "auto" else float(x), + help=("number of times we sample a segment in each epoch"), + ) + parser.add_argument( "--weight-exponent", default=1.0, type=float, help=("exponent for class weights"), ) + + + parser.add_argument( + "--max-audio-length", + default=None, + type=float, + help=("the maximum length of an audio segment in seconds"), + ) + parser.add_argument( "--weight-mode", default="custom", diff --git a/hyperion/torch/data/class_weighted_seg_sampler.py b/hyperion/torch/data/class_weighted_seg_sampler.py index c56a96a7..5af8cdcc 100644 --- a/hyperion/torch/data/class_weighted_seg_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_sampler.py @@ -35,6 +35,7 @@ def __init__( min_batch_size=1, max_batch_size=None, max_batch_length=None, + num_chunks_per_seg_epoch=1, length_name="duration", shuffle=False, drop_last=False, @@ -46,6 +47,7 @@ def __init__( self.class_info = copy.deepcopy(class_info) self.num_segs_per_class = num_segs_per_class self.class_name=class_name + self.num_chunks_per_seg_epoch = num_chunks_per_seg_epoch self.seg_set = seg_set self.min_batch_size = min_batch_size self.max_batch_size = max_batch_size @@ -62,11 +64,11 @@ def __init__( if drop_last: self._len = int( - len(self.seg_set) / (avg_batch_size * self.world_size)) + self.num_chunks_per_seg_epoch * len(self.seg_set) / (avg_batch_size * self.world_size)) else: self._len = int( math.ceil( - (len(self.seg_set) // self.world_size) / avg_batch_size)) + (self.num_chunks_per_seg_epoch * len(self.seg_set) // self.world_size) / avg_batch_size)) self._gather_class_info() self._permutation = None @@ -271,6 +273,7 @@ def filter_args(**kwargs): "max_batch_length", "length_name", "num_segs_per_class", + "num_chunks_per_seg_epoch", "class_name", "shuffle", "drop_last", @@ -336,6 +339,13 @@ def add_class_args(parser, prefix=None): "which column in the segment table indicates the duration of the file", ) + parser.add_argument( + "--num-chunks-per-seg-epoch", + default=1, + type=lambda x: x if x == "auto" else float(x), + help=("number of times we sample a segment in each epoch"), + ) + parser.add_argument( "--num-segs-per-class", type=int, diff --git a/hyperion/torch/data/seg_sampler_factory.py b/hyperion/torch/data/seg_sampler_factory.py index f2fb1914..0a9a8a69 100644 --- a/hyperion/torch/data/seg_sampler_factory.py +++ b/hyperion/torch/data/seg_sampler_factory.py @@ -86,6 +86,7 @@ def filter_args(**kwargs): "min_batch_size", "max_batch_size", "max_batch_length", + "max_audio_length", "num_chunks_per_seg_epoch", "num_segs_per_class", "num_chunks_per_seg", @@ -154,6 +155,15 @@ def add_class_args(parser, prefix=None): ), ) + + parser.add_argument( + "--max-audio-length", + default=None, + type=float, + help=("the maximum length of an audio segment in seconds"), 
+ ) + + parser.add_argument( "--batch-size", default=None, diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index 591bbb97..a8bb24d5 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -12,6 +12,7 @@ HFWav2Vec2RNNTransducer, HFWav2Vec2Transducer) from .wav2xvectors import (HFHubert2ResNet1dXVector, HFWav2Vec2ResNet1dXVector, HFWavLM2ResNet1dXVector) +from .wav2transducer_languageid import HFWav2Vec2RNNTransducerResnet1D from .xvectors.efficient_net_xvector import EfficientNetXVector from .xvectors.resnet1d_xvector import ResNet1dXVector from .xvectors.resnet_xvector import ResNetXVector diff --git a/hyperion/torch/models/transducer/rnn_transducer.py b/hyperion/torch/models/transducer/rnn_transducer.py index 0b886fdf..5b8bc3ec 100644 --- a/hyperion/torch/models/transducer/rnn_transducer.py +++ b/hyperion/torch/models/transducer/rnn_transducer.py @@ -61,7 +61,7 @@ def forward( self, x: torch.Tensor, x_lengths: torch.Tensor, - y: k2.RaggedTensor, + y: Union[Dict, k2.RaggedTensor], ) -> RNNTransducerOutput: """ Args: @@ -199,7 +199,7 @@ def change_config( @staticmethod def filter_finetune_args(**kwargs): args = {} - decoder_args = Decoder.filter_finetune_args(**kwargs["decoder"]) + decoder_args = RNNTransducerDecoder.filter_finetune_args(**kwargs["decoder"]) args["decoder"] = decoder_args return args diff --git a/hyperion/torch/models/wav2transducer_languageid/__init__.py b/hyperion/torch/models/wav2transducer_languageid/__init__.py new file mode 100644 index 00000000..98ebfdc7 --- /dev/null +++ b/hyperion/torch/models/wav2transducer_languageid/__init__.py @@ -0,0 +1,7 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" + +from .hf_wav2vec2rnn_transducer_languageid import HFWav2Vec2RNNTransducerResnet1D \ No newline at end of file diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py new file mode 100644 index 00000000..b710655e --- /dev/null +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -0,0 +1,428 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import contextlib +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ....utils import HypDataClass +from ...torch_model import TorchModel +from ...utils import remove_silence +from ..transducer import RNNTransducer, RNNTransducerOutput + +@dataclass +class RNNTransducerLanguageIDOutput(HypDataClass): + loss: torch.Tensor + loss_transducer: torch.Tensor + loss_lid: torch.Tensor + loss_transducer_simple: Optional[torch.Tensor] = None + loss_transducer_pruned: Optional[torch.Tensor] = None + h_feats: Optional[List[torch.Tensor]] = None + +class HFWav2RNNTransducerLanguageID(TorchModel): + """Abstract Base class for combined transducer language identification models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + transducer: transducer model object. + languageid: language identification model object. 
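+        loss_weight_transducer: weight of the transducer loss in the total loss.
+        loss_weight_lid: weight of the language identification loss in the total loss.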
+ feat_fusion_start: the input to the combined model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__(self, + hf_feats: TorchModel, + transducer: Union[Dict, TorchModel], + languageid: Union[Dict, TorchModel], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + loss_weight_transducer: float = 0.005, + loss_weight_lid: float = 1.0,): + + super().__init__() + self.hf_feats = hf_feats + if isinstance(transducer, dict): + transducer["decoder"]["in_feats"] = hf_feats.hidden_size + #transducer["joiner"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + + transducer["encoder"] = None + transducer = RNNTransducer(**transducer) + else: + assert isinstance(transducer, RNNTransducer) + if transducer.encoder is None: + assert transducer.decoder.in_feats == hf_feats.hidden_size + #assert transducer.joiner.in_feats == hf_feats.hidden_size + + self.transducer = transducer + self.languageid = languageid + self.feat_fusion_start = feat_fusion_start + self.feat_fusion_method = feat_fusion_method + self.loss_weight_transducer = loss_weight_transducer + self.loss_weight_lid = loss_weight_lid + self._hf_context = contextlib.nullcontext() + self._make_fuser() + + def _make_fuser(self): + if self.feat_fusion_method == "last": + self.feat_fuser = None + return + + num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + layer_dim = self.hf_feats.hidden_size + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif self.feat_fusion_method == "linear": + self.feat_fuser = nn.Linear(num_layers, 1, bias=False) + self.feat_fuser.weight.data = torch.ones(1, + num_layers) / num_layers + elif self.feat_fusion_method == "cat": + self.feat_fuser = nn.Linear(num_layers * layer_dim, + layer_dim, + bias=False) + + def _fuse_hid_feats(self, hid_feats): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. 
+ + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + hid_feats = hid_feats[self.feat_fusion_start:] + if self.feat_fusion_method == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats) + elif self.feat_fusion_method == "last": + feats = hid_feats[-1] + + return feats + + def forward_feats(self, + x, + x_lengths, + return_feat_layers=None, + chunk_length=0, + detach_chunks=False): + return_hid_states = (False if return_feat_layers is None + and self.feat_fusion_method == "last" else True) + with self._hf_context: + hf_output = self.hf_feats( + x, + x_lengths, + return_hid_states=return_hid_states, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) + feat_lengths = hf_output["hidden_states_lengths"] + if return_hid_states: + hid_feats = hf_output["hidden_states"] + feats = self._fuse_hid_feats(hid_feats) + else: + hid_feats = None + feats = hf_output["last_hidden_state"] + + feats = feats.transpose(1, 2) + if return_feat_layers is not None: + # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) + # as the hidden features of the x-vector encoder. + hid_feats = [ + f.transpose(1, 2) for i, f in enumerate(hid_feats) + if i in return_feat_layers + ] + else: + hid_feats = None + + return feats, hid_feats, feat_lengths + + def forward( + self, + x, + x_lengths=None, + text=None, + languageid=None, + return_feat_layers=None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, + ): + """Forward function. If returns the logits posteriors of the classes. + It can also returns the hidden representations in the wav2vec feature extractor, + the x-vector encoder and the + classification head. In this case the ouput variable is a dictionary. + + Args: + x: input features tensor with shape=(batch, in_feats, time) + x_lengths: time lengths of the features with shape=(batch,) + y: target classes torch.long tensor with shape=(batch,) + return_feat_layers: list of integers indicating, which wav2vec layers + we should return. If None, no wav2vec layers are returned. + return_enc_layers: list of integers indicating, which encoder layers + we should return. If None, no encoder layers are returned. + return_logits: if True, it adds the logits to the output dictionary. 
+ Returns: + Dataclass with losses, "h_enc" (list of hidden encoder layers), + "h_feats" (wav2vec features) + """ + feats, hid_feats, feat_lengths = self.forward_feats( + x, x_lengths, return_feat_layers) + + + + logits = self.languageid( + feats, + None, + languageid, + return_enc_layers=return_enc_layers, + return_classif_layers=return_classif_layers, + return_logits=return_logits, + ) + + loss_lid = nn.CrossEntropyLoss()(logits, languageid) + + + + feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + trans_output = self.transducer( + feats, + feat_lengths, + text, + ) + + + + if return_feat_layers: + trans_output.h_feats = hid_feats + output = RNNTransducerLanguageIDOutput( self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid, trans_output.loss, loss_lid,trans_output.loss_simple, trans_output.loss_pruned,trans_output.h_feats) + return output + + def infer(self, + x: torch.Tensor, + x_lengths: torch.Tensor, + decoding_method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000): + """ + ASR tokens inference + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search + max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. + max_sym_per_utt: maximimum number of symbols in a single utterance. + Returns: + List of list of integer indexes of the recognizer's symbols. + """ + + feats, _, feat_lengths = self.forward_feats(x, x_lengths) + + feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + y = self.transducer.infer(feats, + feat_lengths, + decoding_method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + return y + + def freeze_feat_fuser(self): + if self.feat_fuser is None: + return + + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser.requires_grad = False + return + + for param in self.feat_fuser.parameters(): + param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode in ["ft-transducer", "ft-transducer-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_feat_fuser() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + else: + raise ValueError(f"invalid train_mode={mode}") + + logging.info("train mode set to %s", mode) + + if "nograd" in mode: + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext() + + self._train_mode = mode + + def _train(self, train_mode: str): + + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode in [ + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ]: + self.hf_feats.train() + self.transducer._train("full") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "ft-transducer", + "hf-feats-frozen", + 
"ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "hf_feats", + "transducer", + "feat_fusion_start", + "feat_fusion_method", + "loss_weight_transducer", + "loss_weight_lid", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + def get_config(self): + hf_cfg = self.hf_feats.get_config() + tran_cfg = self.transducer.get_config() + del hf_cfg["class_name"] + del tran_cfg["class_name"] + config = { + "hf_feats": hf_cfg, + "transducer": tran_cfg, + "feat_fusion_start": self.feat_fusion_start, + "feat_fusion_method": self.feat_fusion_method, + "loss_weight_transducer": self.loss_weight_transducer, + "loss_weight_lid": self.loss_weight_lid, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def change_config(self, hf_feats, transducer, languageid): + logging.info("changing hf wav2transducer config") + self.hf_feats.change_config(**hf_feats) + self.transducer.change_config(**transducer) + self.languageid.change_config(**languageid) + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--feat-fusion-start", + default=0, + type=int, + help=""" + the input to x-vector model will fuse the wav2vec + layers from feat_fusion_start to + the wav2vec num_layers""", + ) + parser.add_argument( + "--feat-fusion-method", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]"), + ) + + parser.add_argument( + "--loss-weight-transducer", + default=0.005, + type=float, + help=""" + The weight of the transducer loss + """, + ) + + parser.add_argument( + "--loss-weight-lid", + default=1.0, + type=float, + help=""" + The weight of the lid loss + """, + ) + + + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) + + @staticmethod + def add_infer_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNTransducer.add_infer_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_infer_args(**kwargs): + return RNNTransducer.filter_infer_args(**kwargs) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py new file mode 100644 index 00000000..4fa19144 --- /dev/null +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py @@ -0,0 +1,119 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFWav2Vec2 +from ..transducer import RNNTransducer +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID +from ..wav2languageid import HFWav2Vec2ResNet1dLanguageID +from ..wav2transducer import HFWav2Vec2RNNTransducer + + +from .hf_wav2rnn_transducer_languageid import HFWav2RNNTransducerLanguageID + + +class 
HFWav2Vec2RNNTransducerResnet1D(HFWav2RNNTransducerLanguageID): + """Class for RNN-T with Wav2Vec2 features + + Attributes: + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + transducer: Transducer configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, RNNTransducer], + languageid: Union[Dict, ResNet1dLanguageID], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + + # if isinstance(hf_feats, dict): + # if "class_name" in hf_feats: + # del hf_feats["class_name"] + # hf_feats = HFWav2Vec2(**hf_feats) + # else: + # assert isinstance(hf_feats, HFWav2Vec2) + + # if isinstance(languageid, dict): + # languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size + # if "class_name" in languageid: + # del languageid["class_name"] + # languageid = ResNet1dLanguageID(**languageid) + # else: + # assert isinstance(languageid, ResNet1dLanguageID) + # assert languageid.encoder_net.in_feats == hf_feats.hidden_size + + # hf_feats = wav2transducer.hf_feats + # transducer = wav2transducer.transducer + # languageid = wav2languageid.languageid + + + super().__init__(hf_feats, transducer, languageid, feat_fusion_start, + feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2RNNTransducerLanguageID.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNTransducer.filter_args(**kwargs["transducer"]) + child_args = ResNet1dLanguageID.filter_args(**kwargs["languageid"]) + base_args["transducer"] = child_args + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + RNNTransducer.add_class_args(parser, prefix="transducer") + # HFWav2RNNTransducer.add_class_args(parser) + ResNet1dLanguageID.add_class_args(parser, prefix="languageid") + # HFWav2LanguageID.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNTransducer.filter_finetune_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + child_args = ResNet1dLanguageID.filter_finetune_args(**kwargs["languageid"]) + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + RNNTransducer.add_finetune_args(parser, prefix="transducer") + ResNet1dLanguageID.add_finetune_args(parser, prefix="languageid") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/__init__.py b/hyperion/torch/trainers/__init__.py index 212f0e92..3c96c84f 100644 --- 
a/hyperion/torch/trainers/__init__.py +++ b/hyperion/torch/trainers/__init__.py @@ -10,6 +10,7 @@ from .languageid_trainer import LanguageIDTrainer from .transducer_trainer import TransducerTrainer +from .transducer_languageid_trainer import TransducerLanguageIDTrainer from .vae_trainer import VAETrainer from .vq_dvae_trainer import VQDVAETrainer from .vq_vae_trainer import VQVAETrainer diff --git a/hyperion/torch/trainers/transducer_languageid_trainer.py b/hyperion/torch/trainers/transducer_languageid_trainer.py new file mode 100644 index 00000000..238e8022 --- /dev/null +++ b/hyperion/torch/trainers/transducer_languageid_trainer.py @@ -0,0 +1,222 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +from collections import OrderedDict as ODict + +import torch +import torch.nn as nn +import torchaudio +from jsonargparse import ActionParser, ArgumentParser +from torch.distributed.elastic.multiprocessing.errors import record + +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset +from .torch_trainer import TorchTrainer + + +class TransducerLanguageIDTrainer(TorchTrainer): + """Trainer to train ASR style models. + + Attributes: + model: ASR model object. + optim: pytorch optimizer object or options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object or options dict + loggers: LoggerList object, loggers write training progress to std. output and file. + If None, it uses default loggers. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + loss: if None, it uses cross-entropy + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp + """ + + def __init__( + self, + model, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + eff_batch_size=None, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + loss=None, + train_mode="full", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + input_key="x", + target_key=["text", "languageid"], + ): + + loss = None + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + @record + def train_epoch(self, data_loader): + """Training epoch loop + + Args: + data_loader: pytorch data loader returning features and class labels. 
+ """ + batch_keys = [ + self.input_key, f"{self.input_key}_lengths", self.target_key[0], self.target_key[1] + ] + metric_acc = MetricAcc(device=self.device) + batch_metrics = ODict() + self.model.train() + self.sp = data_loader.dataset.sp + + for batch, data in enumerate(data_loader): + self.loggers.on_batch_begin(batch) + + if batch % self.grad_acc_steps == 0: + self.optimizer.zero_grad() + + # # TODO: Check and Modify data, target + # data, audio_length, target = data.to(self.device), audio_length.to( + # self.device), target.to(self.device) + #print(data.keys(), batch_keys, flush=True) + input_data, input_lengths, text, languageid = tensors_subset( + data, batch_keys, self.device) + batch_size = input_data.shape[0] + + with self.amp_autocast(): + output = self.model(input_data, + x_lengths=input_lengths, + text=text, + languageid=languageid) + loss = output.loss + loss = loss.mean() / self.grad_acc_steps + + if self.use_amp: + self.grad_scaler.scale(loss).backward() + else: + loss.backward() + + if (batch + 1) % self.grad_acc_steps == 0: + if self.lr_scheduler is not None and not self.in_swa: + self.lr_scheduler.on_opt_step() + self.update_model() + + for k, v in output.items(): + if "loss" in k and v is not None: + batch_metrics[k] = output[k].item() + + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + logs = metric_acc.metrics + logs["lr"] = self._get_lr() + self.loggers.on_batch_end(logs=logs, batch_size=batch_size) + + logs = metric_acc.metrics + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() + return logs + + def validation_epoch(self, data_loader, swa_update_bn=False): + """Validation epoch loop + + Args: + data_loader: PyTorch data loader return input/output pairs. + sw_update_bn: wheter or not, update batch-norm layers in SWA. + """ + batch_keys = [ + self.input_key, f"{self.input_key}_lengths", self.target_key[0], self.target_key[1] + ] + metric_acc = MetricAcc(self.device) + batch_metrics = ODict() + with torch.no_grad(): + if swa_update_bn: + log_tag = "train_" + self.train() + else: + log_tag = "val_" + self.model.eval() + + for batch, data in enumerate(data_loader): + + input_data, input_lengths, text, languageid = tensors_subset( + data, batch_keys, self.device) + batch_size = input_data.shape[0] + + # data, audio_length, target = data.to( + # self.device), audio_length.to(self.device), target.to( + # self.device) + # batch_size = data.shape[0] + # data, target = data.to(self.device), target.to(self.device) + # batch_size = data.shape[0] + + with self.amp_autocast(): + output = self.model(input_data, + x_lengths=input_lengths, + text=text, + languageid=languageid) + + for k, v in output.items(): + if "loss" in k and v is not None: + batch_metrics[k] = output[k].item() + + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + + logs = metric_acc.metrics + logs = ODict((log_tag + k, v) for k, v in logs.items()) + return logs + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + super_skip = skip.copy() + super_skip.add("target_key") + TorchTrainer.add_class_args(parser, + train_modes=train_modes, + skip=super_skip) + if "target_key" not in skip: + parser.add_argument("--target-keys", + default=["text", "languageid"], + help="list of dict. 
key for nnet targets") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) From 3b7e8aca6e3c9c7eb22224bfa3374957ebac52ec Mon Sep 17 00:00:00 2001 From: neillu23 Date: Sat, 22 Apr 2023 00:44:20 -0400 Subject: [PATCH 13/89] Add LID decode scripts --- egs/commonvoice/v1/run_030_inference.sh | 16 +- egs/commonvoice/v1/run_032_identificate.sh | 47 ++++ .../identificate_wav2vec2resnet1d.sh | 87 +++++++ hyperion/bin/identificate_wav2languageid.py | 238 ++++++++++++++++++ .../torch/narchs/rnn_transducer_decoder.py | 7 +- 5 files changed, 385 insertions(+), 10 deletions(-) create mode 100755 egs/commonvoice/v1/run_032_identificate.sh create mode 100755 egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh create mode 100755 hyperion/bin/identificate_wav2languageid.py diff --git a/egs/commonvoice/v1/run_030_inference.sh b/egs/commonvoice/v1/run_030_inference.sh index cf2c8fb2..ec5b140b 100755 --- a/egs/commonvoice/v1/run_030_inference.sh +++ b/egs/commonvoice/v1/run_030_inference.sh @@ -7,8 +7,6 @@ . ./path.sh set -e -stage=0 - config_file=default_config.sh use_gpu=false nnet_stage=1 @@ -36,14 +34,16 @@ fi transducer_dir=exp/transducer/$nnet_name +# test_data=test_clean # Extracts x-vectors for evaluation -for name in $test_data # $dev_data $test_data - do - nj=16 - steps_transducer/decode_wav2vec2transducer.sh --cmd "$transducer_cmd --mem 12G" --nj $nj --stage $stage ${transducer_args} \ +for name in $test_data +do + nj=40 + steps_transducer/decode_wav2vec2rnn_transducer.sh \ + --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ $nnet data/$name \ $transducer_dir/$name $bpe_model - done -exit +done + diff --git a/egs/commonvoice/v1/run_032_identificate.sh b/egs/commonvoice/v1/run_032_identificate.sh new file mode 100755 index 00000000..a9a8cee5 --- /dev/null +++ b/egs/commonvoice/v1/run_032_identificate.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=0 +config_file=default_config.sh +use_gpu=false +nnet_stage=1 +. parse_options.sh || exit 1; +. $config_file + +if [ "$use_gpu" == "true" ];then + lid_args="--use-gpu true" + lid_cmd="$cuda_eval_cmd --mem 6G" +else + lid_cmd="$train_cmd --mem 12G" +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +fi + +lid_dir=exp/resnet1d/$nnet_name + +# Extracts x-vectors for evaluation +for name in $test_data # $dev_data $test_data + do + nj=40 + steps_lid/identificate_wav2vec2resnet1d.sh \ + --cmd "$lid_cmd" --nj $nj ${lid_args} \ + $nnet data/$name \ + $lid_dir/$name data/$nnet_data/langs + done + +exit diff --git a/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh b/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh new file mode 100755 index 00000000..8b31ac2f --- /dev/null +++ b/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +nj=30 +cmd="run.pl" + +use_gpu=false +write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +num_augs=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; + +if [ $# != 3 ] && [ $# != 4 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --use-gpu # If true, use GPU." + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + echo " --use-bin-vad # If true, uses binary VAD from vad.scp" + echo " --write-utt2num-frames # If true, write utt2num_frames file." + echo " --chunk-length # If provided, applies encoder with specified chunk-length and " + echo " # concatenates the chunks outputs before pooling" + echo " --feat-config # feature/mvn config file" + echo " --aug-config # augmentation config file" + echo " --random-utt-length # If true, extracts a random chunk from the utterance between " + echo " # min_utt_length and max_utt_length" + echo " --min-utt-length # " + echo " --max-utt-length # " + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +lang_file=$4 + +for f in $data_dir/wav.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +args="" +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + args="--use-gpu" +fi + +if [ "$write_utt2num_frames" == "true" ];then + write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +fi + +if [ $stage -le 0 ];then + set +e + $cmd JOB=1:$nj $output_dir/log/identificate_wav2languageid.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + identificate_wav2languageid.py \ + --part-idx JOB --num-parts $nj ${args} \ + --input $data_dir/wav.scp \ + --model-path $nnet_file \ + --lang-file $lang_file \ + --output $output_dir/languageid.JOB + set -e +fi + +if [ $stage -le 1 ];then + echo "compute error rate" + + cat $output_dir/languageid.* > $output_dir/langs + + # python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text + # python steps_transducer/word2char.py $data_dir/text $data_dir/text_char + + # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text + # compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text + +fi diff --git a/hyperion/bin/identificate_wav2languageid.py b/hyperion/bin/identificate_wav2languageid.py new file mode 100755 index 00000000..8b01ac25 --- /dev/null +++ b/hyperion/bin/identificate_wav2languageid.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from typing import Dict, List, Tuple + +import sentencepiece as spm +import torch.nn as nn + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np +import pandas as pd + +import torch + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.utils import Utt2Info +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.np.augment import SpeechAugment + +from hyperion.torch.utils import open_device +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch import TorchModelLoader as TML + +from 
hyperion.torch.models.wav2transducer.beam_search import greedy_search, beam_search + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("lid-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def decode_one_batch( + model: nn.Module, + lang_dict: Dict[int, str], + x: torch.Tensor) -> Dict[str, List[List[str]]]: + """Decode one batch and return the result in a dict. The dict has the + following format: + - key: It indicates the setting used for decoding. For example, + if greedy_search is used, it would be "greedy_search" + If beam search with a beam size of 7 is used, it would be + "beam_7" + - value: It contains the decoding result. `len(value)` equals to + batch size. `value[i]` is the decoding result for the i-th + utterance in the given batch. + Args: + params: + It's the return value of :func:`get_params`. + model: + The neural model. + batch: + It is the return value from iterating + `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation + for the format of the `batch`. + Returns: + Return the decoding result. See above description for the format of + the returned dict. + """ + device = model.device + feature = x #batch["inputs"] + assert x.shape[0] == 1 + assert feature.ndim == 2 + + feature = feature.to(device) + # at entry, feature is (N, T, C) + + # feature_lens = torch.Tensor([x.shape[1]]).int() + + # encoder_out, hid_feats, encoder_out_lens = model.forward_feats( + # x=feature, x_lengths=feature_lens) + + predictions = [] + batch_size = feature.size(0) + + # encoder_out = encoder_out.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + for i in range(batch_size): + # # fmt: off + # encoder_out_i = encoder_out[i:i + 1, :encoder_out_lens[i]] + # fmt: on + output = model(feature) + _, pred = torch.max(output, dim=-1) + # to integer + pred = pred.cpu().numpy().tolist()[0] + predictions.append(lang_dict[pred]) + + logging.info("hyps:{}".format(" ".join(predictions))) + + return predictions + + +def decode_languageid(input_spec, output_spec, scp_sep, model_path, lang_file, + use_gpu, **kwargs): + + device = init_device(use_gpu) + model = load_model(model_path, device) + + # load language dict form langfile by row number + lang_dict = {} + with open(lang_file, "r") as f: + for i, line in enumerate(f): + lang_dict[i] = line.strip() + + augmenter = None + aug_df = None + num_augs = 1 + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output: %s" % (output_spec)) + # with DWF.create(output_spec, scp_sep=scp_sep) as writer: + with open(output_spec, "w") as writer: + logging.info("opening input stream: {} with args={}".format( + input_spec, ar_args)) + with AR(input_spec, **ar_args) as reader: + while not reader.eof(): + t1 = time.time() + key, x0, fs = reader.read(1) + if len(key) == 0: + break + + x0 = x0[0] + key0 = key[0] + t2 = time.time() + + logging.info("processing utt %s" % (key0)) + for aug_id in range(num_augs): + t3 = time.time() + key, x = key0, x0 #augment(key0, x0, augmenter, aug_df, aug_id) + t4 = time.time() + with torch.no_grad(): + x = torch.tensor( + x[None, :], + dtype=torch.get_default_dtype()).to(device) + + t5 = time.time() + tot_frames = x.shape[1] + + # logging.info( + # "utt %s detected %d/%d (%.2f %%) speech frames" % ( 
+ # key, + # x.shape[1], + # tot_frames, + # x.shape[1] / tot_frames * 100, + # )) + + t6 = time.time() + if x.shape[1] == 0: + y = np.zeros((model.embed_dim, ), + dtype=float_cpu()) + else: + y = decode_one_batch(model=model, lang_dict=lang_dict, x=x) + + t7 = time.time() + + # writer.write([key], [y]) + writer.write(key + ' ' + ' '.join(y)+ "\n") + t8 = time.time() + read_time = t2 - t1 + tot_time = read_time + t8 - t3 + logging.info( + ("utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f") % ( + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, + )) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description=("Extracts x-vectors from waveform computing " + "acoustic features on the fly")) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--scp-sep", + default=" ", + help=("scp file field separator")) + + AR.add_class_args(parser) + + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--model-path", required=True) + + parser.add_argument("--lang-file", required=True) + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--use-gpu", + default=False, + action="store_true", + help="extract xvectors in gpu") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + decode_languageid(**namespace_to_dict(args)) diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py index 265f2c9b..bf9189ee 100644 --- a/hyperion/torch/narchs/rnn_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_transducer_decoder.py @@ -480,8 +480,11 @@ def decode_time_sync_beam_search(self, break t += 1 - best_hyp = max(B, - key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + try: + best_hyp = max(B, + key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + except: + return "" ys = best_hyp.ys[1:] # [1:] to remove the blank return ys From 35391de52990806d4802a7e034abe0dc84d675ff Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 4 May 2023 09:55:06 -0400 Subject: [PATCH 14/89] new vox2 dataprep --- hyperion/data_prep/data_prep.py | 8 +- hyperion/data_prep/voxceleb2.py | 16 +- .../torch/narchs/rnn_transducer_decoder.py | 407 +++++++++--------- 3 files changed, 224 insertions(+), 207 deletions(-) diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py index d9f6b238..fb6fc6c5 100644 --- a/hyperion/data_prep/data_prep.py +++ b/hyperion/data_prep/data_prep.py @@ -2,6 +2,7 @@ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging from concurrent.futures import ThreadPoolExecutor from pathlib import Path @@ -66,21 +67,22 @@ def _get_recording_duration(scp, i, n): def get_recording_duration(self, recording_set): import itertools - from ..utils import SCPList scp = SCPList(recording_set["id"].values, recording_set["storage_path"].values) futures = [] + logging.info("submitting threats...") with ThreadPoolExecutor(max_workers=self.num_threads) as pool: - for i in range(self.num_threads): + for i in tqdm(range(self.num_threads)): future = pool.submit( 
DataPrep._get_recording_duration, scp, i, self.num_threads ) futures.append(future) + logging.info("waiting threats...") res = [f.result() for f in tqdm(futures)] fss = list(itertools.chain(*[r[0] for r in res])) - durations = list(itertools.chain(*[r[0] for r in res])) + durations = list(itertools.chain(*[r[1] for r in res])) recording_set["duration"] = durations recording_set["sample_freq"] = fss diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py index d8b9dd99..a1a9f0c3 100644 --- a/hyperion/data_prep/voxceleb2.py +++ b/hyperion/data_prep/voxceleb2.py @@ -158,8 +158,9 @@ def prepare(self): file_paths = [] futures = [] logging.info("making video cat lists") + logging.info("submitting threats...") with ThreadPoolExecutor(max_workers=self.num_threads) as pool: - for i, rec_id in enumerate(rec_ids): + for i, rec_id in tqdm(enumerate(rec_ids)): future = pool.submit( VoxCeleb2DataPrep.make_cat_list, lists_cat_dir, @@ -170,6 +171,7 @@ def prepare(self): ) futures.append(future) + logging.info("waiting threats...") file_paths = [f.result() for f in tqdm(futures)] video_ids = uniq_video_ids @@ -213,14 +215,14 @@ def prepare(self): df_lang.loc[r, "confidence"] if r in df_lang.index else "N/A" for r in rec_ids ], - # "duration": recs.loc[rec_ids, "duration"], + "duration": recs.loc[rec_ids, "duration"].values, } ) - print( - recs.loc[rec_ids, "duration"], - len(segments), - len(recs.loc[rec_ids, "duration"]), - ) + # print( + # recs.loc[rec_ids, "duration"], + # len(segments), + # len(recs.loc[rec_ids, "duration"]), + # ) segments = SegmentSet(segments) segments.sort() diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py index 8db6c23a..763ec67c 100644 --- a/hyperion/torch/narchs/rnn_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_transducer_decoder.py @@ -16,7 +16,7 @@ try: import k2 except ModuleNotFoundError: - from ...utils import dummy_k2 as k2 + from ..utils import dummy_k2 as k2 from ...utils.misc import filter_func_args from ...utils.text import add_sos @@ -99,10 +99,8 @@ def __init__( if self.rnnt_loss == "k2_pruned": self.simple_am_proj = nn.Linear(in_feats, vocab_size) - self.simple_lm_proj = nn.Linear(self.predictor.out_feats, - vocab_size) - self.register_buffer("cur_step", torch.as_tensor(0, - dtype=torch.int)) + self.simple_lm_proj = nn.Linear(self.predictor.out_feats, vocab_size) + self.register_buffer("cur_step", torch.as_tensor(0, dtype=torch.int)) def _make_predictor(self): pred_type = self.predictor_args["pred_type"] @@ -110,12 +108,10 @@ def _make_predictor(self): self.predictor_args["vocab_size"] = self.vocab_size self.predictor_args["blank_id"] = self.blank_id if pred_type == "rnn": - pred_args = filter_func_args(RNNPredictor.__init__, - self.predictor_args) + pred_args = filter_func_args(RNNPredictor.__init__, self.predictor_args) self.predictor = RNNPredictor(**pred_args) elif pred_type == "conv": - pred_args = filter_func_args(ConvPredictor.__init__, - self.predictor_args) + pred_args = filter_func_args(ConvPredictor.__init__, self.predictor_args) self.predictor = ConvPredictor(**pred_args) self.predictor_args["out_feats"] = self.predictor.embed_dim else: @@ -127,8 +123,7 @@ def _make_joiner(self): if joiner_type == "basic": pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] - self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, - self.vocab_size) + self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, self.vocab_size) else: raise 
ValueError(f"Unknown joiner type {joiner_type}") @@ -152,9 +147,14 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - def _rnnt_loss_torchaudio(self, x: torch.Tensor, x_lengths: torch.Tensor, - y: torch.Tensor, y_lengths: torch.Tensor, - pred_out: torch.Tensor): + def _rnnt_loss_torchaudio( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + pred_out: torch.Tensor, + ): logits = self.joiner(x, pred_out) # rnnt_loss requires 0 padded targets # Note: y does not start with SOS @@ -170,14 +170,17 @@ def _rnnt_loss_torchaudio(self, x: torch.Tensor, x_lengths: torch.Tensor, ) return loss - def _rnnt_loss_k2(self, x: torch.Tensor, x_lengths: torch.Tensor, - y: torch.Tensor, y_lengths: torch.Tensor, - pred_out: torch.Tensor): + def _rnnt_loss_k2( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + pred_out: torch.Tensor, + ): y_padded = y.pad(mode="constant", padding_value=0) y_padded = y_padded.to(torch.int64) - boundary = torch.zeros((x.size(0), 4), - dtype=torch.int64, - device=x.device) + boundary = torch.zeros((x.size(0), 4), dtype=torch.int64, device=x.device) boundary[:, 2] = y_lengths boundary[:, 3] = x_lengths @@ -195,15 +198,18 @@ def _rnnt_loss_k2(self, x: torch.Tensor, x_lengths: torch.Tensor, ) return loss - def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, - y: torch.Tensor, y_lengths: torch.Tensor, - pred_out: torch.Tensor): + def _rnnt_loss_k2_pruned( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + pred_out: torch.Tensor, + ): y_padded = y.pad(mode="constant", padding_value=0) y_padded = y_padded.to(torch.int64) - boundary = torch.zeros((x.size(0), 4), - dtype=torch.int64, - device=x.device) + boundary = torch.zeros((x.size(0), 4), dtype=torch.int64, device=x.device) boundary[:, 2] = y_lengths boundary[:, 3] = x_lengths @@ -266,7 +272,7 @@ def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, simple_loss_scale = 1.0 - r * (1.0 - self.simple_loss_scale) pruned_loss_scale = 0.1 + 0.9 * r self.cur_step += 1 - #print(simple_loss_scale, pruned_loss_scale) + # print(simple_loss_scale, pruned_loss_scale) loss = simple_loss_scale * loss_simple + pruned_loss_scale * loss_pruned @@ -288,44 +294,48 @@ def forward( loss_simple = loss_pruned = None if self.rnnt_loss == "k2_pruned": loss, loss_simple, loss_pruned = self._rnnt_loss_k2_pruned( - x, x_lengths, y, y_lengths, pred_out) + x, x_lengths, y, y_lengths, pred_out + ) elif self.rnnt_loss == "k2": loss = self._rnnt_loss_k2(x, x_lengths, y, y_lengths, pred_out) elif self.rnnt_loss == "torchaudio": loss_simple = loss_pruned = None - loss = self._rnnt_loss_torchaudio(x, x_lengths, y, y_lengths, - pred_out) + loss = self._rnnt_loss_torchaudio(x, x_lengths, y, y_lengths, pred_out) return loss, loss_simple, loss_pruned - def decode(self, - x: torch.Tensor, - x_lengths: torch.Tensor = None, - method="time_sync_beam_search", - beam_width: int = 5, - max_sym_per_frame: int = 3, - max_sym_per_utt: int = 1000) -> List[int]: + def decode( + self, + x: torch.Tensor, + x_lengths: torch.Tensor = None, + method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000, + ) -> List[int]: if method == "time_sync_beam_search": - return self.decode_time_sync_beam_search(x, - x_lengths, - beam_width=beam_width) + return 
self.decode_time_sync_beam_search( + x, x_lengths, beam_width=beam_width + ) elif method == "align_length_sync_beam_search": return self.decode_align_length_sync_beam_search( + x, x_lengths, beam_width=beam_width, max_sym_per_utt=max_sym_per_utt + ) + elif method == "greedy": + return self.decode_greedy( x, x_lengths, - beam_width=beam_width, - max_sym_per_utt=max_sym_per_utt) - elif method == "greedy": - return self.decode_greedy(x, - x_lengths, - max_sym_per_frame=max_sym_per_frame, - max_sym_per_utt=max_sym_per_utt) - - def decode_greedy(self, - x: torch.Tensor, - x_lengths: torch.Tensor = None, - max_sym_per_frame: int = 3, - max_sym_per_utt: int = 1000) -> List[int]: + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt, + ) + + def decode_greedy( + self, + x: torch.Tensor, + x_lengths: torch.Tensor = None, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000, + ) -> List[int]: """ Args: x: encoder embeddings with shape = (N, T, C) @@ -339,8 +349,7 @@ def decode_greedy(self, blank_id = self.blank_id device = x.device - sos = torch.tensor([blank_id], device=device, - dtype=torch.int64).reshape(1, 1) + sos = torch.tensor([blank_id], device=device, dtype=torch.int64).reshape(1, 1) pred_out, state = self.predictor(sos) T = x.size(1) t = 0 @@ -350,7 +359,7 @@ def decode_greedy(self, sym_per_utt = 0 while t < T and sym_per_utt < max_sym_per_utt: - x_t = x[:, t:t + 1, :] + x_t = x[:, t : t + 1, :] logits = self.joiner(x_t, pred_out) # (1, 1, 1, vocab_size) # logits is @@ -371,10 +380,9 @@ def decode_greedy(self, return hyp - def decode_time_sync_beam_search(self, - x: torch.Tensor, - x_lengths: torch.Tensor = None, - beam_width: int = 5) -> List[int]: + def decode_time_sync_beam_search( + self, x: torch.Tensor, x_lengths: torch.Tensor = None, beam_width: int = 5 + ) -> List[int]: assert x.ndim == 3 assert x.size(0) == 1, x.size(0) @@ -389,11 +397,10 @@ def decode_time_sync_beam_search(self, max_u = 20000 # terminate after this number of steps u = 0 - cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, - torch.Tensor]]] = {} + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = {} while t < T and u < max_u: - x_t = x[:, t:t + 1, :] + x_t = x[:, t : t + 1, :] A = B B = [] @@ -406,13 +413,9 @@ def decode_time_sync_beam_search(self, cached_key = "_".join(map(str, y_star.ys)) if cached_key not in cache: - pred_in = torch.tensor([y_star.ys[-1]], - device=device).reshape(1, 1) + pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) - pred_out, pred_state = self.predictor( - pred_in, - y_star.pred_state, - ) + pred_out, pred_state = self.predictor(pred_in, y_star.pred_state,) cache[cached_key] = (pred_out, pred_state) else: pred_out, pred_state = cache[cached_key] @@ -443,7 +446,7 @@ def decode_time_sync_beam_search(self, topk_log_prob = log_prob.topk(beam_width, dim=-1) # Second, choose other labels - #for i, v in enumerate(log_prob.tolist()): + # for i, v in enumerate(log_prob.tolist()): for v, i in zip(*topk_log_prob): v = v.item() i = i.item() @@ -452,9 +455,7 @@ def decode_time_sync_beam_search(self, new_ys = y_star.ys + [i] new_log_prob = y_star.log_prob + v new_hyp = Hypothesis( - ys=new_ys, - log_prob=new_log_prob, - pred_state=pred_state, + ys=new_ys, log_prob=new_log_prob, pred_state=pred_state, ) A.append(new_hyp) @@ -462,12 +463,9 @@ def decode_time_sync_beam_search(self, # check whether B contains more than "beam" elements more probable # than the most probable in A A_most_probable = max(A, key=lambda hyp: 
hyp.log_prob) - #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + # print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) B = sorted( - [ - hyp - for hyp in B if hyp.log_prob > A_most_probable.log_prob - ], + [hyp for hyp in B if hyp.log_prob > A_most_probable.log_prob], key=lambda hyp: hyp.log_prob, reverse=True, ) @@ -483,17 +481,17 @@ def decode_time_sync_beam_search(self, break t += 1 - best_hyp = max(B, - key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + best_hyp = max(B, key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) ys = best_hyp.ys[1:] # [1:] to remove the blank return ys def decode_align_length_sync_beam_search( - self, - x: torch.Tensor, - x_lengths: torch.Tensor, - beam_width: int = 5, - max_sym_per_utt: int = 1000) -> List[int]: + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + beam_width: int = 5, + max_sym_per_utt: int = 1000, + ) -> List[int]: assert x.ndim == 3 assert x.size(0) == 1, x.size(0) @@ -503,39 +501,34 @@ def decode_align_length_sync_beam_search( sos = torch.tensor([blank_id], device=device).reshape(1, 1) pred_out, state = self.predictor(sos) T = x.size(1) - #t = 0 + # t = 0 B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] - #max_u = 20000 # terminate after this number of steps - #u = 0 + # max_u = 20000 # terminate after this number of steps + # u = 0 - cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, - torch.Tensor]]] = {} + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = {} F = [] - #for t < T and u < max_u: + # for t < T and u < max_u: for i in range(T + max_sym_per_utt): A = [] for y_star in B: - #while u < max_u: + # while u < max_u: u = len(y_star.ys) - 1 t = i - u if t >= T: continue - #y_star = max(A, key=lambda hyp: hyp.log_prob) - #A.remove(y_star) - x_t = x[:, t:t + 1, :] + # y_star = max(A, key=lambda hyp: hyp.log_prob) + # A.remove(y_star) + x_t = x[:, t : t + 1, :] # Note: y_star.ys is unhashable, i.e., cannot be used # as a key into a dict cached_key = "_".join(map(str, y_star.ys)) if cached_key not in cache: - pred_in = torch.tensor([y_star.ys[-1]], - device=device).reshape(1, 1) + pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) - pred_out, pred_state = self.predictor( - pred_in, - y_star.pred_state, - ) + pred_out, pred_state = self.predictor(pred_in, y_star.pred_state,) cache[cached_key] = (pred_out, pred_state) else: pred_out, pred_state = cache[cached_key] @@ -563,7 +556,7 @@ def decode_align_length_sync_beam_search( topk_log_prob = log_prob.topk(beam_width, dim=-1) # Second, choose other labels - #for i, v in enumerate(log_prob.tolist()): + # for i, v in enumerate(log_prob.tolist()): for v, i in zip(*topk_log_prob): v = v.item() i = i.item() @@ -572,20 +565,16 @@ def decode_align_length_sync_beam_search( new_ys = y_star.ys + [i] new_log_prob = y_star.log_prob + v new_hyp = Hypothesis( - ys=new_ys, - log_prob=new_log_prob, - pred_state=pred_state, + ys=new_ys, log_prob=new_log_prob, pred_state=pred_state, ) A.append(new_hyp) # check whether B contains more than "beam_width" elements more probable # than the most probable in A - #A_most_probable = max(A, key=lambda hyp: hyp.log_prob) - #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + # A_most_probable = max(A, key=lambda hyp: hyp.log_prob) + # print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) B0 = sorted( - [hyp for hyp in A], - key=lambda hyp: hyp.log_prob, - reverse=True, + [hyp for hyp in A], key=lambda hyp: hyp.log_prob, reverse=True, ) B = [] 
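            # B holds the hypotheses kept for the next step; candidates come from B0,
            # which ranks all expansions in A by log-probability before pruning to the beam width.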
B_ys = set() @@ -605,8 +594,7 @@ def decode_align_length_sync_beam_search( B = B[:beam_width] break - best_hyp = max(F, - key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + best_hyp = max(F, key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) ys = best_hyp.ys[1:] # [1:] to remove the blank return ys @@ -617,8 +605,9 @@ def change_config( rnn_dropout_rate: float = 0.0, ): logging.info("changing decoder config") - self.predictor.change_config(override_dropouts, embed_dropout_rate, - rnn_dropout_rate) + self.predictor.change_config( + override_dropouts, embed_dropout_rate, rnn_dropout_rate + ) @staticmethod def filter_args(**kwargs): @@ -638,49 +627,58 @@ def add_pred_args(parser): "--pred-type", default="rnn", choices=["rnn", "conv"], - help= - """type of predictor between RNN and Convolutional [rnn, conv]""") - pred_parser.add_argument("--embed-dim", - default=1024, - type=int, - help=("token embedding dimension")) + help="""type of predictor between RNN and Convolutional [rnn, conv]""", + ) + pred_parser.add_argument( + "--embed-dim", default=1024, type=int, help=("token embedding dimension") + ) pred_parser.add_argument( "--embed-dropout-rate", default=0.0, type=float, - help=("dropout prob for predictor input embeddings")) - pred_parser.add_argument("--rnn-dropout-rate", - default=0.0, - type=float, - help="""dropout prob for decoder RNN """) + help=("dropout prob for predictor input embeddings"), + ) + pred_parser.add_argument( + "--rnn-dropout-rate", + default=0.0, + type=float, + help="""dropout prob for decoder RNN """, + ) pred_parser.add_argument( "--rnn-type", default="lstm", choices=["lstm", "gru"], - help= - """type of recurrent network for thep predictor in [lstm, gru]""") - - pred_parser.add_argument("--num-layers", - default=2, - type=int, - help="""number of layers of the predictor """) - - pred_parser.add_argument("--hid-feats", - default=512, - type=int, - help="""hidden features of the predictor""") - pred_parser.add_argument("--out-feats", - default=512, - type=int, - help="""output features of the predictor""") - pred_parser.add_argument("--context-size", - default=2, - type=int, - help="""context length of the convolutional - predictor, 1->bigram, 2-> trigram,...""") - - parser.add_argument("--predictor", - action=ActionParser(parser=pred_parser)) + help="""type of recurrent network for thep predictor in [lstm, gru]""", + ) + + pred_parser.add_argument( + "--num-layers", + default=2, + type=int, + help="""number of layers of the predictor """, + ) + + pred_parser.add_argument( + "--hid-feats", + default=512, + type=int, + help="""hidden features of the predictor""", + ) + pred_parser.add_argument( + "--out-feats", + default=512, + type=int, + help="""output features of the predictor""", + ) + pred_parser.add_argument( + "--context-size", + default=2, + type=int, + help="""context length of the convolutional + predictor, 1->bigram, 2-> trigram,...""", + ) + + parser.add_argument("--predictor", action=ActionParser(parser=pred_parser)) @staticmethod def add_joiner_args(parser): @@ -690,39 +688,43 @@ def add_joiner_args(parser): "--joiner-type", default="basic", choices=["basic"], - help= - """type of joiner network, there is only basic joiner for now""") - pred_parser.add_argument("--hid-feats", - default=512, - type=int, - help="""hidden features of the joiner""") - parser.add_argument("--joiner", - action=ActionParser(parser=pred_parser)) + help="""type of joiner network, there is only basic joiner for now""", + ) + pred_parser.add_argument( + "--hid-feats", 
+ default=512, + type=int, + help="""hidden features of the joiner""", + ) + parser.add_argument("--joiner", action=ActionParser(parser=pred_parser)) @staticmethod - def add_class_args(parser, - prefix=None, - skip=set(["in_feats", "blank_id", "vocab_size"])): + def add_class_args( + parser, prefix=None, skip=set(["in_feats", "blank_id", "vocab_size"]) + ): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") if "in_feats" not in skip: - parser.add_argument("--in-feats", - type=int, - required=True, - help=("input feature dimension")) + parser.add_argument( + "--in-feats", type=int, required=True, help=("input feature dimension") + ) if "blank_id" not in skip: - parser.add_argument("--blank-id", - type=int, - default=0, - help=("blank id from tokenizer model")) + parser.add_argument( + "--blank-id", + type=int, + default=0, + help=("blank id from tokenizer model"), + ) if "vocab_size" not in skip: - parser.add_argument("--vocab-size", - type=int, - required=True, - help=("output prediction dimension")) + parser.add_argument( + "--vocab-size", + type=int, + required=True, + help=("output prediction dimension"), + ) RNNTransducerDecoder.add_pred_args(parser) RNNTransducerDecoder.add_joiner_args(parser) @@ -730,56 +732,62 @@ def add_class_args(parser, "--rnnt-loss", default="k2_pruned", choices=["torchaudio", "k2", "k2_pruned"], - help="""type of rnn-t loss between torchaudio, k2 or k2_pruned.""") + help="""type of rnn-t loss between torchaudio, k2 or k2_pruned.""", + ) parser.add_argument( "--rnnt-type", default="regular", choices=["regular", "modified", "constrained"], - help= - """type of rnn-t loss between regular, modified or constrained.""") + help="""type of rnn-t loss between regular, modified or constrained.""", + ) parser.add_argument( "--delay-penalty", default=0.0, type=float, - help= - """penalize symbol delay, which is used to make symbol emit earlier - for streaming models.""") + help="""penalize symbol delay, which is used to make symbol emit earlier + for streaming models.""", + ) parser.add_argument( "--reduction", default="sum", choices=["sum", "mean"], - help="""type of reduction for rnn-t loss between sum or mean""") + help="""type of reduction for rnn-t loss between sum or mean""", + ) parser.add_argument( "--prune-range", default=5, type=int, help="""how many symbols to keep for each frame in k2 rnn-t - pruned loss.""") + pruned loss.""", + ) parser.add_argument( "--lm-scale", default=0.25, type=float, - help="""language model scale in rnn-t smoothed loss""") + help="""language model scale in rnn-t smoothed loss""", + ) parser.add_argument( "--am-scale", default=0.0, type=float, - help="""acoustic model scale in rnn-t smoothed loss""") + help="""acoustic model scale in rnn-t smoothed loss""", + ) parser.add_argument( "--simple-loss-scale", default=0.5, type=float, - help="""weight of rnn-t simple loss when using k2 pruned loss""") + help="""weight of rnn-t simple loss when using k2 pruned loss""", + ) parser.add_argument( "--pruned-warmup-steps", default=2000, type=int, help="""number of steps to warm up the k2 rnn-t pruned loss - from 0.1 to 1""") + from 0.1 to 1""", + ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @staticmethod def add_finetune_args(parser, prefix=None, skip=set()): @@ -794,16 +802,21 @@ def add_finetune_args(parser, prefix=None, skip=set()): action=ActionYesNo, help=( "whether to use the 
dropout probabilities passed in the " - "arguments instead of the defaults in the pretrained model.")) - parser.add_argument("--embed-dropout-rate", - default=0.0, - type=float, - help=("dropout prob for decoder input embeddings")) - parser.add_argument("--rnn-dropout-rate", - default=0.0, - type=float, - help=("dropout prob for decoder RNN ")) + "arguments instead of the defaults in the pretrained model." + ), + ) + parser.add_argument( + "--embed-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder input embeddings"), + ) + parser.add_argument( + "--rnn-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder RNN "), + ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) From ebef85146384fa08995a816e3843d33a1e2e8673 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Thu, 4 May 2023 17:55:23 -0400 Subject: [PATCH 15/89] update the np.str to np.str_ --- hyperion/utils/utt2info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hyperion/utils/utt2info.py b/hyperion/utils/utt2info.py index 9785d021..edf2c23a 100644 --- a/hyperion/utils/utt2info.py +++ b/hyperion/utils/utt2info.py @@ -142,7 +142,7 @@ def save(self, file_path, sep=" "): self.utt_info.to_csv(file_path, sep=sep, header=False, index=False) @classmethod - def load(cls, file_path, sep=" ", dtype={0: np.str, 1: np.str}): + def load(cls, file_path, sep=" ", dtype={0: np.str_, 1: np.str_}): """Loads utt2info list from text file. Args: From 720bd6eefd4fabda168fc1903876d615f4668be3 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Thu, 4 May 2023 17:58:48 -0400 Subject: [PATCH 16/89] update np.str to np.str_ --- hyperion/utils/utt2info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hyperion/utils/utt2info.py b/hyperion/utils/utt2info.py index 9785d021..edf2c23a 100644 --- a/hyperion/utils/utt2info.py +++ b/hyperion/utils/utt2info.py @@ -142,7 +142,7 @@ def save(self, file_path, sep=" "): self.utt_info.to_csv(file_path, sep=sep, header=False, index=False) @classmethod - def load(cls, file_path, sep=" ", dtype={0: np.str, 1: np.str}): + def load(cls, file_path, sep=" ", dtype={0: np.str_, 1: np.str_}): """Loads utt2info list from text file. 
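        Note: the dtype defaults use np.str_ because the np.str alias was deprecated
        in NumPy 1.20 and removed in 1.24; np.str_ is the equivalent string scalar type.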
Args: From b112ebde8adc127156c2b111c21e1afe3042754d Mon Sep 17 00:00:00 2001 From: ylu125 Date: Thu, 4 May 2023 21:42:51 -0400 Subject: [PATCH 17/89] Add empty __init__.py --- hyperion/torch/models/vae/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 hyperion/torch/models/vae/__init__.py diff --git a/hyperion/torch/models/vae/__init__.py b/hyperion/torch/models/vae/__init__.py new file mode 100644 index 00000000..f4883a15 --- /dev/null +++ b/hyperion/torch/models/vae/__init__.py @@ -0,0 +1,5 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" From cf861bc7b30f9c318ed20308588c71856a545933 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 8 May 2023 14:49:09 -0400 Subject: [PATCH 18/89] fix new vox2 dataprep durations, scp -> RecordingSet --- egs/librispeech/v1/run_011_train_asr.sh | 12 +- egs/librispeech/v1/run_011_train_asr_old.sh | 12 +- .../adv.v1.1/run_005_train_victim_xvector.sh | 4 +- .../run_007_train_transfer_xvector.sh | 4 +- .../run_008_adv_finetune_victim_xvector.sh | 4 +- .../adv.v2/run_011_train_victim_xvector.sh | 4 +- .../run_022_attack_type_classif_allknown.sh | 4 +- .../adv.v2/run_023_snr_classif_allknown.sh | 4 +- .../run_024_threat_model_classif_allknown.sh | 4 +- ...un_031_attack_type_verif_and_noveltydet.sh | 4 +- egs/voxceleb/adv.v2/run_032_snr_verif.sh | 4 +- .../adv.v2/run_033_threat_model_verif.sh | 4 +- .../config_fbank80_stmn_cfwseresnet34.v3.0.sh | 4 +- .../config_fbank80_stmn_fwseresnet34.v3.0.sh | 4 +- .../config_fbank80_stmn_resnet34.v3.0.sh | 2 +- egs/voxceleb/v1.1/run_011_train_xvector.sh | 8 +- egs/voxceleb/v2/run_011_train_xvector.sh | 12 +- hyperion/bin/apply_mvn_select_frames.py | 36 +- hyperion/bin/compute_energy_vad.py | 21 +- hyperion/bin/compute_mfcc_feats.py | 20 +- hyperion/bin/decode_wav2transducer.py | 110 ++-- ...l_xvec_cosine_scoring_from_adv_test_wav.py | 21 +- ...osine_scoring_from_adv_test_wav_wavegan.py | 22 +- ...l_xvec_cosine_scoring_from_art_test_wav.py | 26 +- .../eval_xvec_cosine_scoring_from_test_wav.py | 15 +- ...sine_scoring_from_transfer_adv_test_wav.py | 3 +- ...sine_scoring_from_transfer_art_test_wav.py | 20 +- hyperion/bin/eval_xvec_logits_from_wav.py | 19 +- hyperion/bin/extract_wav2vec2xvectors.py | 17 +- hyperion/bin/extract_xvectors_from_wav.py | 16 +- .../extract_xvectors_slidwin_from_feats.py | 15 +- .../bin/extract_xvectors_slidwin_from_wav.py | 18 +- .../generate_adv_attacks_xvector_classif.py | 15 +- .../bin/generate_adv_attacks_xvector_verif.py | 10 +- hyperion/bin/pack_wav_rirs.py | 10 +- hyperion/data_prep/data_prep.py | 8 +- hyperion/io/ark_data_reader.py | 179 ++++--- hyperion/io/ark_data_writer.py | 42 +- hyperion/io/audio_reader.py | 409 ++++++++------- hyperion/io/audio_writer.py | 84 +-- hyperion/io/bin_vad_reader.py | 3 +- hyperion/io/data_reader.py | 62 ++- hyperion/io/data_rw_factory.py | 51 +- hyperion/io/data_writer.py | 51 +- hyperion/io/h5_data_reader.py | 204 +++++--- hyperion/io/h5_data_writer.py | 32 +- hyperion/io/old_audio_reader.py | 477 ++++++++++++++++++ hyperion/io/vad_rw_factory.py | 10 +- hyperion/torch/data/audio_dataset.py | 160 +++--- hyperion/utils/feature_set.py | 16 +- hyperion/utils/info_table.py | 27 +- hyperion/utils/segment_set.py | 27 + hyperion/utils/utt2info.py | 2 +- 53 files changed, 1525 insertions(+), 827 deletions(-) create mode 100644 hyperion/io/old_audio_reader.py diff --git a/egs/librispeech/v1/run_011_train_asr.sh 
b/egs/librispeech/v1/run_011_train_asr.sh index 99b0065e..81ebbeae 100755 --- a/egs/librispeech/v1/run_011_train_asr.sh +++ b/egs/librispeech/v1/run_011_train_asr.sh @@ -49,11 +49,11 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2rnn_transducer.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s1_dir $args \ @@ -75,11 +75,11 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ @@ -103,11 +103,11 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ diff --git a/egs/librispeech/v1/run_011_train_asr_old.sh b/egs/librispeech/v1/run_011_train_asr_old.sh index 3d0e6eb1..3c9f4f5b 100755 --- a/egs/librispeech/v1/run_011_train_asr_old.sh +++ b/egs/librispeech/v1/run_011_train_asr_old.sh @@ -49,11 +49,11 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s1_dir $args \ @@ -75,11 +75,11 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file 
$train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ @@ -103,11 +103,11 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ diff --git a/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh b/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh index 37a91211..aa779902 100755 --- a/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh +++ b/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh @@ -44,11 +44,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir \ diff --git a/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh b/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh index 70bab280..420ac59d 100755 --- a/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh +++ b/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh @@ -54,11 +54,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir \ diff --git a/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh b/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh index 12f1e5fd..4f2c137b 100755 --- a/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh 
+++ b/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh @@ -53,11 +53,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ adv_finetune_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir \ diff --git a/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh b/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh index 971b88a3..a1acb1f6 100755 --- a/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh +++ b/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh @@ -40,11 +40,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir \ diff --git a/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh b/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh index 71c0c89f..b453260f 100755 --- a/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh @@ -46,11 +46,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_dir/train_utt2attack \ --data.train.dataset.class-file $list_dir/class_file \ - --data.val.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh b/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh index a928ae29..de811505 100755 --- a/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh @@ -46,11 +46,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - 
--data.train.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_dir/train_utt2attack \ --data.train.dataset.class-file $list_dir/class_file \ - --data.val.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh b/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh index bed225a3..aa17a1ae 100755 --- a/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh @@ -48,11 +48,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_dir/train_utt2attack \ --data.train.dataset.class-file $list_dir/class_file \ - --data.val.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh index 55cb8459..4ce703ba 100755 --- a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh +++ b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh @@ -49,11 +49,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ --data.train.dataset.class-file $list_someknown_dir/class_file \ - --data.val.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_032_snr_verif.sh b/egs/voxceleb/adv.v2/run_032_snr_verif.sh index 3886c339..12d42c99 100755 --- a/egs/voxceleb/adv.v2/run_032_snr_verif.sh +++ b/egs/voxceleb/adv.v2/run_032_snr_verif.sh @@ -52,11 +52,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + 
--data.train.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ --data.train.dataset.class-file $list_someknown_dir/class_file \ - --data.val.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh b/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh index 392bffb5..cbfaaa81 100755 --- a/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh +++ b/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh @@ -53,11 +53,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ --data.train.dataset.class-file $list_someknown_dir/class_file \ - --data.val.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh index 32c91da2..fdb3147f 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh @@ -26,8 +26,8 @@ nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth # back-end do_plda=false -do_snorm=false #true -do_qmf=false #true +do_snorm=true +do_qmf=true do_voxsrc22=true plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh index 62b02c28..7aa61f00 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh @@ -26,8 +26,8 @@ nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth # back-end do_plda=false -do_snorm=true -do_qmf=true +do_snorm=false #true +do_qmf=false #true do_voxsrc22=true plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh index c49936e0..b194d1bd 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh @@ -16,7 +16,7 @@ nnet_name=${feat_type}_resnet34.v3.0 nnet_s1_base_cfg=conf/train_resnet34_xvec_stage1_v3.0.yaml nnet_s1_name=$nnet_name.s1 -nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name.kk2 nnet_s1=$nnet_s1_dir/model_ep0035.pth 
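Editorial note on the recurring rename in the recipe hunks above and below: the x-vector training scripts now pass --data.<split>.dataset.recordings-file where they previously passed --data.<split>.dataset.audio-file; the files they point to are unchanged. As a rough, hypothetical illustration of the Kaldi-style lists these options reference (utterance ids and paths below are made up, not taken from any real dataset):

    import pathlib

    # Toy versions of the lists passed via --data.*.dataset.recordings-file
    # and --data.*.dataset.time-durs-file. Each line is "<utt-id> <value>".
    pathlib.Path("wav.scp").write_text(
        "utt-0001 /data/audio/utt-0001.flac\n"
        "utt-0002 /data/audio/utt-0002.flac\n"
    )
    # Durations in seconds, keyed by the same utterance ids.
    pathlib.Path("utt2dur").write_text(
        "utt-0001 3.52\n"
        "utt-0002 7.10\n"
    )

The segments and class2int lists used by the same scripts are analogous id-keyed text tables.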
nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.0.yaml diff --git a/egs/voxceleb/v1.1/run_011_train_xvector.sh b/egs/voxceleb/v1.1/run_011_train_xvector.sh index a051c136..c8ab552e 100755 --- a/egs/voxceleb/v1.1/run_011_train_xvector.sh +++ b/egs/voxceleb/v1.1/run_011_train_xvector.sh @@ -44,11 +44,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_s1_dir \ @@ -67,11 +67,11 @@ if [ $stage -le 2 ]; then --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --in-model-file $nnet_s1 \ diff --git a/egs/voxceleb/v2/run_011_train_xvector.sh b/egs/voxceleb/v2/run_011_train_xvector.sh index 0eddb1a6..bc3b5420 100755 --- a/egs/voxceleb/v2/run_011_train_xvector.sh +++ b/egs/voxceleb/v2/run_011_train_xvector.sh @@ -47,11 +47,11 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_wav2vec2xvector.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_s1_dir $args \ @@ -71,11 +71,11 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2xvector.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ 
--data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --in-model-file $nnet_s1 \ @@ -96,11 +96,11 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2xvector.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --in-model-file $nnet_s2 \ diff --git a/hyperion/bin/apply_mvn_select_frames.py b/hyperion/bin/apply_mvn_select_frames.py index a2456dc9..f5a3ce15 100755 --- a/hyperion/bin/apply_mvn_select_frames.py +++ b/hyperion/bin/apply_mvn_select_frames.py @@ -10,8 +10,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF @@ -28,7 +32,6 @@ def process_feats( output_spec, vad_spec, write_num_frames_spec, - scp_sep, path_prefix, vad_path_prefix, part_idx, @@ -51,25 +54,16 @@ def process_feats( logging.info("opening output stream: %s" % (output_spec)) with DWF.create( - output_spec, - compress=compress, - compression_method=compression_method, - scp_sep=scp_sep, + output_spec, compress=compress, compression_method=compression_method, ) as writer: logging.info("opening input stream: %s" % (output_spec)) with DRF.create( - input_spec, - path_prefix=path_prefix, - scp_sep=scp_sep, - part_idx=part_idx, - num_parts=num_parts, + input_spec, path_prefix=path_prefix, part_idx=part_idx, num_parts=num_parts, ) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = RDRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) + v_reader = RDRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): key, data = reader.read(1) @@ -112,28 +106,20 @@ def process_feats( parser.add_argument( "--write-num-frames", dest="write_num_frames_spec", default=None ) - parser.add_argument( - "--scp-sep", dest="scp_sep", default=" ", help=("scp file field separator") - ) parser.add_argument( "--path-prefix", dest="path_prefix", default=None, help=("scp file_path prefix") ) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument( "--part-idx", - dest="part_idx", type=int, default=1, help=("splits the list of files in num-parts and process part_idx"), ) parser.add_argument( "--num-parts", - dest="num_parts", type=int, default=1, help=("splits the list of files in num-parts and process part_idx"), @@ -141,14 +127,12 @@ def process_feats( parser.add_argument( "--compress", - dest="compress", default=False, action="store_true", help="Lossy compress the features", ) parser.add_argument( "--compression-method", - dest="compression_method", default="auto", choices=compression_methods, help=( diff --git 
a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py index 15d74f3a..058f982a 100755 --- a/hyperion/bin/compute_energy_vad.py +++ b/hyperion/bin/compute_energy_vad.py @@ -9,8 +9,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF @@ -26,14 +30,14 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): input_args = AR.filter_args(**kwargs) reader = AR(input_path, **input_args) - writer = DWF.create(output_path, scp_sep=" ") + writer = DWF.create(output_path) if write_num_frames is not None: f_num_frames = open(write_num_frames, "w") for data in reader: key, x, fs = data - logging.info("Extracting VAD for %s" % (key)) + logging.info("Extracting VAD for %s", key) t1 = time.time() y = vad.compute(x) dt = (time.time() - t1) * 1000 @@ -41,8 +45,13 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): num_speech_frames = np.sum(y) prob_speech = num_speech_frames / y.shape[0] * 100 logging.info( - "Extracted VAD for %s detected %d/%d (%f %%) speech frames, elapsed-time=%.2f ms. real-time-factor=%.2f" - % (key, num_speech_frames, y.shape[0], prob_speech, dt, rtf) + "Extracted VAD for %s detected %d/%d (%f %%) speech frames, elapsed-time=%.2f ms. real-time-factor=%.2f", + key, + num_speech_frames, + y.shape[0], + prob_speech, + dt, + rtf, ) writer.write([key], [y]) if write_num_frames is not None: diff --git a/hyperion/bin/compute_mfcc_feats.py b/hyperion/bin/compute_mfcc_feats.py index a83f95d1..ca6e26f7 100755 --- a/hyperion/bin/compute_mfcc_feats.py +++ b/hyperion/bin/compute_mfcc_feats.py @@ -9,8 +9,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF @@ -35,10 +39,7 @@ def compute_mfcc_feats( reader = DRF.create(input_path, **input_args) writer = DWF.create( - output_path, - scp_sep=" ", - compress=compress, - compression_method=compression_method, + output_path, compress=compress, compression_method=compression_method, ) if write_num_frames is not None: @@ -55,8 +56,11 @@ def compute_mfcc_feats( dt = (time.time() - t1) * 1000 rtf = dt / (mfcc.frame_shift * y.shape[0]) logging.info( - "Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f" - % (key, y.shape[0], dt, rtf) + "Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. 
real-time-factor=%.2f", + key, + y.shape[0], + dt, + rtf, ) writer.write([key], [y]) diff --git a/hyperion/bin/decode_wav2transducer.py b/hyperion/bin/decode_wav2transducer.py index 81fa8803..c7de38f1 100755 --- a/hyperion/bin/decode_wav2transducer.py +++ b/hyperion/bin/decode_wav2transducer.py @@ -13,8 +13,12 @@ import numpy as np import pandas as pd import sentencepiece as spm -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -23,8 +27,7 @@ from hyperion.io import SequentialAudioReader as AR from hyperion.np.augment import SpeechAugment from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.models.wav2transducer.beam_search import (beam_search, - greedy_search) +from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info @@ -48,10 +51,11 @@ def load_model(model_path, device): def decode_one_batch( - model: nn.Module, - sp: spm.SentencePieceProcessor, - x: torch.Tensor, - decoding_method="beam_search") -> Dict[str, List[List[str]]]: + model: nn.Module, + sp: spm.SentencePieceProcessor, + x: torch.Tensor, + decoding_method="beam_search", +) -> Dict[str, List[List[str]]]: """Decode one batch and return the result in a dict. The dict has the following format: - key: It indicates the setting used for decoding. For example, @@ -77,7 +81,7 @@ def decode_one_batch( the returned dict. """ device = model.device - feature = x #batch["inputs"] + feature = x # batch["inputs"] assert x.shape[0] == 1 assert feature.ndim == 2 @@ -87,7 +91,8 @@ def decode_one_batch( feature_lens = torch.Tensor([x.shape[1]]).int() encoder_out, hid_feats, encoder_out_lens = model.forward_feats( - x=feature, x_lengths=feature_lens) + x=feature, x_lengths=feature_lens + ) hyps = [] batch_size = encoder_out.size(0) @@ -114,8 +119,9 @@ def decode_one_batch( return hyps[0] -def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, - use_gpu, **kwargs): +def decode_transducer( + input_spec, output_spec, model_path, bpe_model, use_gpu, **kwargs +): device = init_device(use_gpu) model = load_model(model_path, device) @@ -129,10 +135,10 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, ar_args = AR.filter_args(**kwargs) logging.info("opening output: %s" % (output_spec)) - # with DWF.create(output_spec, scp_sep=scp_sep) as writer: with open(output_spec, "w") as writer: - logging.info("opening input stream: {} with args={}".format( - input_spec, ar_args)) + logging.info( + "opening input stream: {} with args={}".format(input_spec, ar_args) + ) with AR(input_spec, **ar_args) as reader: while not reader.eof(): t1 = time.time() @@ -147,65 +153,69 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, logging.info("processing utt %s" % (key0)) for aug_id in range(num_augs): t3 = time.time() - key, x = key0, x0 #augment(key0, x0, augmenter, aug_df, aug_id) + key, x = key0, x0 # augment(key0, x0, augmenter, aug_df, aug_id) t4 = time.time() with torch.no_grad(): x = torch.tensor( - x[None, :], - dtype=torch.get_default_dtype()).to(device) + x[None, :], dtype=torch.get_default_dtype() + ).to(device) t5 = time.time() tot_frames = x.shape[1] logging.info( - "utt %s detected %d/%d (%.2f %%) speech 
frames" % ( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( key, x.shape[1], tot_frames, x.shape[1] / tot_frames * 100, - )) + ) + ) t6 = time.time() if x.shape[1] == 0: - y = np.zeros((model.embed_dim, ), - dtype=float_cpu()) + y = np.zeros((model.embed_dim,), dtype=float_cpu()) else: y = decode_one_batch(model=model, sp=sp, x=x) t7 = time.time() - writer.write(key + ' ' + ' '.join(y) + "\n") + writer.write(key + " " + " ".join(y) + "\n") t8 = time.time() read_time = t2 - t1 tot_time = read_time + t8 - t3 logging.info( - ("utt %s total-time=%.3f read-time=%.3f " - "aug-time=%.3f feat-time=%.3f " - "vad-time=%.3f embed-time=%.3f write-time=%.3f " - "rt-factor=%.2f") % ( - key, - tot_time, - read_time, - t4 - t3, - t5 - t4, - t6 - t5, - t7 - t6, - t8 - t7, - x0.shape[0] / fs[0] / tot_time, - )) + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ) + % ( + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, + ) + ) if __name__ == "__main__": parser = ArgumentParser( - description=("Extracts x-vectors from waveform computing " - "acoustic features on the fly")) + description=( + "Extracts x-vectors from waveform computing " "acoustic features on the fly" + ) + ) parser.add_argument("--cfg", action=ActionConfigFile) parser.add_argument("--input", dest="input_spec", required=True) - parser.add_argument("--scp-sep", - default=" ", - help=("scp file field separator")) AR.add_class_args(parser) @@ -216,16 +226,12 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, parser.add_argument("--bpe-model", required=True) parser.add_argument("--output", dest="output_spec", required=True) - parser.add_argument("--use-gpu", - default=False, - action="store_true", - help="extract xvectors in gpu") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py index bb01162f..10ea491c 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py @@ -10,8 +10,12 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -188,7 +192,7 @@ def eval_cosine_scoring( attack = AttackFactory.create(model, **attack_args) if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -327,9 +331,9 @@ def eval_cosine_scoring( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--v-file", dest="v_file", required=True) - parser.add_argument("--key-file", dest="key_file", default=None) - 
parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) @@ -337,10 +341,7 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py index c483ce39..a6f535b3 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py @@ -7,13 +7,18 @@ import os import sys import time + # [Added Sonal May21] from pathlib import Path import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -243,7 +248,7 @@ def eval_cosine_scoring_wavegan( attack = AttackFactory.create(model, **attack_args) if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -384,9 +389,9 @@ def eval_cosine_scoring_wavegan( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--v-file", dest="v_file", required=True) - parser.add_argument("--key-file", dest="key_file", default=None) - parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) @@ -394,10 +399,7 @@ def eval_cosine_scoring_wavegan( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py index fba182c4..5ba42477 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py @@ -13,8 +13,12 @@ import pandas as pd from art.classifiers import PyTorchClassifier from art.estimators.classification import PyTorchClassifier -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -25,8 +29,9 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import 
TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import \ - ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device @@ -195,7 +200,7 @@ def eval_cosine_scoring( if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -354,9 +359,9 @@ def eval_cosine_scoring( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--v-file", dest="v_file", required=True) - parser.add_argument("--key-file", dest="key_file", default=None) - parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) @@ -364,10 +369,7 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py index 3cfde93e..c3732bd3 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py @@ -10,8 +10,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -122,7 +126,7 @@ def eval_cosine_scoring( if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((ndx.num_models, ndx.num_tests), dtype="float32") with torch.no_grad(): @@ -217,10 +221,7 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py index 44bdf59d..c00cf286 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py @@ -205,7 +205,7 @@ def eval_cosine_scoring( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models,
key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -361,7 +361,6 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( "--vad-path-prefix", - dest="vad_path_prefix", default=None, help=("scp file_path prefix for vad"), ) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py index 676575fd..4f2b82ab 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py @@ -13,8 +13,12 @@ import pandas as pd from art.classifiers import PyTorchClassifier from art.estimators.classification import PyTorchClassifier -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -25,8 +29,9 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import \ - ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device @@ -213,7 +218,7 @@ def eval_cosine_scoring( if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -386,10 +391,7 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py index da6389fb..2f5cf3da 100755 --- a/hyperion/bin/eval_xvec_logits_from_wav.py +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -11,8 +11,12 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -93,7 +97,6 @@ def eval_xvec( output_spec, vad_spec, write_num_frames_spec, - scp_sep, vad_path_prefix, model_path, chunk_length, @@ -125,8 +128,8 @@ def eval_xvec( num_augs = 1 ar_args = AR.filter_args(**kwargs) - logging.info("opening output stream: %s" % (output_spec)) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: + logging.info("opening output stream: %s", output_spec) + with DWF.create(output_spec) as writer: logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) @@ -135,9 +138,7 @@ def eval_xvec( if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) + 
v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): t1 = time.time() @@ -243,7 +244,7 @@ def eval_xvec( parser.add_argument( "--write-num-frames", dest="write_num_frames_spec", default=None ) - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) + parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index 37d6a2a6..c4c4676f 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -12,8 +12,12 @@ import numpy as np import pandas as pd import torchaudio.transforms as tat -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -44,6 +48,7 @@ def get_resampler(source_fs, target_fs): resamplers[source_fs] = resampler_f return resampler_f + resamplers = {} @@ -122,7 +127,6 @@ def extract_xvectors( output_spec, vad_spec, write_speech_dur, - scp_sep, vad_path_prefix, model_path, hf_chunk_length, @@ -157,16 +161,14 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) ar_args["wav_scale"] = 1.0 logging.info("opening output stream: %s", output_spec) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: + with DWF.create(output_spec) as writer: logging.info(f"opening input stream: {input_spec} with args={ar_args}") with AR(input_spec, **ar_args) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): t1 = time.time() @@ -283,7 +285,6 @@ def extract_xvectors( parser.add_argument("--input", dest="input_spec", required=True) parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument("--write-speech-dur", default=None) - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index addabbcf..1da1ac05 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -11,8 +11,12 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -95,7 +99,6 @@ def extract_xvectors( output_spec, vad_spec, write_num_frames_spec, - scp_sep, vad_path_prefix, model_path, chunk_length, @@ -129,7 +132,7 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: + with DWF.create(output_spec) as writer: logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) @@ -138,9 +141,7 @@ def extract_xvectors( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) + 
v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) while not reader.eof(): t1 = time.time() @@ -249,7 +250,6 @@ def extract_xvectors( parser.add_argument( "--write-num-frames", dest="write_num_frames_spec", default=None ) - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py index e3d2fcbb..eaf0a5cc 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_feats.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py @@ -11,8 +11,12 @@ import numpy as np import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -78,7 +82,7 @@ def extract_xvectors( model = load_model(model_path, device) if write_timestamps_spec is not None: - time_writer = DWF.create(write_timestamps_spec, scp_sep=" ") + time_writer = DWF.create(write_timestamps_spec) dr_args = DRF.filter_args(**kwargs) logging.info("opening output stream: %s" % (output_spec)) @@ -205,10 +209,7 @@ def extract_xvectors( ) parser.add_argument("--slidwin-params-path", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) MVN.add_class_args(parser, prefix="mvn") diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index 2b1bba3b..a31bd614 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -12,8 +12,12 @@ import numpy as np import pandas as pd import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -83,7 +87,6 @@ def extract_xvectors( vad_spec, write_timestamps_spec, slidwin_params_path, - scp_sep, vad_path_prefix, model_path, chunk_length, @@ -109,7 +112,7 @@ def extract_xvectors( feat_snip_edges = feat_args["snip_edges"] if write_timestamps_spec is not None: - time_writer = DWF.create(write_timestamps_spec, scp_sep=scp_sep) + time_writer = DWF.create(write_timestamps_spec) if aug_cfg is not None: augmenter = SpeechAugment.create(aug_cfg, rng=rng) @@ -121,7 +124,7 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: + with DWF.create(output_spec) as writer: logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) @@ -130,9 +133,7 @@ def extract_xvectors( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): t1 = time.time() @@ -275,7 +276,6 @@ def extract_xvectors( ) parser.add_argument("--slidwin-params-path", default=None) - parser.add_argument("--scp-sep", default=" ", help=("scp file 
field separator")) parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index a058893d..8c6f38a6 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -12,8 +12,12 @@ import numpy as np import pandas as pd import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -168,7 +172,7 @@ def generate_attacks( if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) keys, class_names, class_ids = read_utt_list( list_file, class2int_file, part_idx, num_parts @@ -329,10 +333,7 @@ def generate_attacks( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/generate_adv_attacks_xvector_verif.py b/hyperion/bin/generate_adv_attacks_xvector_verif.py index 83375cb6..fbd3a5fb 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_verif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_verif.py @@ -12,8 +12,12 @@ import numpy as np import pandas as pd import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -197,7 +201,7 @@ def generate_attacks( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) attack_factory = init_attack_factory(**kwargs) attacks_info = {} diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py index dccf58da..4aafa075 100755 --- a/hyperion/bin/pack_wav_rirs.py +++ b/hyperion/bin/pack_wav_rirs.py @@ -10,8 +10,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF @@ -20,7 +24,7 @@ def pack_wav_rirs(input_path, output_spec, **kwargs): - writer = DWF.create(output_spec, scp_sep=" ", compress=False) + writer = DWF.create(output_spec, compress=False) t1 = time.time() with AR(input_path, wav_scale=1) as reader: for data in reader: diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py index fb6fc6c5..19420761 100644 --- a/hyperion/data_prep/data_prep.py +++ b/hyperion/data_prep/data_prep.py @@ -50,12 +50,12 @@ def dataset_name(): raise NotImplementedError() @staticmethod - def _get_recording_duration(scp, i, n): + def _get_recording_duration(recordings, i, n): from ..io import SequentialAudioReader as AR durations = [] fss = [] - with AR(scp, part_idx=i, num_parts=n) 
as reader: + with AR(recordings, part_idx=i + 1, num_parts=n) as reader: for data in reader: key, x, fs = data duration = x.shape[0] / fs @@ -69,13 +69,13 @@ def get_recording_duration(self, recording_set): import itertools from ..utils import SCPList - scp = SCPList(recording_set["id"].values, recording_set["storage_path"].values) + # scp = SCPList(recording_set["id"].values, recording_set["storage_path"].values) futures = [] logging.info("submitting threats...") with ThreadPoolExecutor(max_workers=self.num_threads) as pool: for i in tqdm(range(self.num_threads)): future = pool.submit( - DataPrep._get_recording_duration, scp, i, self.num_threads + DataPrep._get_recording_duration, recording_set, i, self.num_threads ) futures.append(future) diff --git a/hyperion/io/ark_data_reader.py b/hyperion/io/ark_data_reader.py index 3919ddfa..6cf22d5f 100644 --- a/hyperion/io/ark_data_reader.py +++ b/hyperion/io/ark_data_reader.py @@ -4,15 +4,15 @@ """ import multiprocessing as threading -import sys +from typing import Union, Optional, List, Callable, Tuple import numpy as np from ..hyp_defs import float_cpu -from ..utils.kaldi_io_funcs import (init_kaldi_input_stream, is_token, peek, - read_token) +from ..utils.kaldi_io_funcs import init_kaldi_input_stream, is_token, peek, read_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix -from ..utils.scp_list import SCPList + +from ..utils import FeatureSet, PathLike from .data_reader import RandomAccessDataReader, SequentialDataReader @@ -27,10 +27,9 @@ class SequentialArkDataReader(SequentialDataReader): part_idx: It splits the input into num_parts and writes only part part_idx, where part_idx=1,...,num_parts. num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, **kwargs) self.f = None self.lock = threading.Lock() @@ -42,7 +41,7 @@ def close(self): self.f.close() self.f = None - def _seek(self, offset): + def _seek(self, offset: int): """Moves the pointer of the input file. Args: @@ -52,7 +51,7 @@ def _seek(self, offset): delta = offset - cur_pos self.f.seek(delta, 1) - def _open_archive(self, file_path, offset=0): + def _open_archive(self, file_path: PathLike, offset: int = 0): """Opens the current file if it is not open and moves the file pointer to a given position. Closes previous open Ark files. @@ -69,7 +68,7 @@ def _open_archive(self, file_path, offset=0): if offset > 0: self._seek(offset) - def read_num_rows(self, num_records=0, assert_same_dim=True): + def read_num_rows(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -86,7 +85,7 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return keys, num_rows - def read_dims(self, num_records=0, assert_same_dim=True): + def read_dims(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -120,10 +119,8 @@ class SequentialArkFileDataReader(SequentialArkDataReader): split_by_key: If True, all the elements with the same key go to the same part. 
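Editorial note on the ark_data_reader.py changes that follow: the old SCPList is replaced by the newer FeatureSet abstraction, where each feature matrix is described by an id, a storage_path and a storage_byte offset (plus optional start/num_frames columns). A minimal sketch of that access pattern, using a plain pandas DataFrame as a stand-in (the real hyperion.utils.FeatureSet API is assumed; only the column names and the .iloc/.loc/.index usage come from the hunks below, and all values are made up):

    import pandas as pd

    # Stand-in table with the columns used by the readers below.
    feature_set = pd.DataFrame(
        {
            "id": ["utt-0001", "utt-0002"],
            "storage_path": ["feats/feats.1.ark", "feats/feats.1.ark"],
            "storage_byte": [17, 40213],
        }
    ).set_index("id", drop=False)

    # Sequential access by position, as in SequentialArkScriptDataReader.read():
    feature_spec = feature_set.iloc[0]
    print(feature_spec["storage_path"], feature_spec["storage_byte"])

    # Random access by key, as in RandomAccessArkDataReader.read():
    if "utt-0002" in feature_set.index:
        print(feature_set.loc["utt-0002", "storage_byte"])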
""" - def __init__(self, file_path, **kwargs): - super(SequentialArkFileDataReader, self).__init__( - file_path, permissive=False, **kwargs - ) + def __init__(self, file_path: PathLike, **kwargs): + super().__init__(file_path, permissive=False, **kwargs) self._open_archive(self.file_path) self._eof = False self._keys = None @@ -151,7 +148,7 @@ def keys(self): return self._keys - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -188,7 +185,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -206,12 +209,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): key: List of recording names. data: List of feature matrices/vectors or 3D/2D numpy array. """ - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] count = 0 @@ -264,28 +263,25 @@ class SequentialArkScriptDataReader(SequentialArkDataReader): part_idx: It splits the input into num_parts and writes only part part_idx, where part_idx=1,...,num_parts. num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): - super(SequentialArkScriptDataReader, self).__init__( - file_path, permissive=False, **kwargs - ) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + def __init__( + self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs + ): + super().__init__(file_path, permissive=False, **kwargs) + self.feature_set = FeatureSet.load(self.file_path, sep=scp_sep) if self.num_parts > 1: - self.scp = self.scp.split( - self.part_idx, self.num_parts, group_by_key=self.split_by_key - ) + self.feature_set = self.feature_set.split(self.part_idx, self.num_parts) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) self.cur_item = 0 @property def keys(self): - return self.scp.key + return self.feature_set["id"] def reset(self): """Closes all the open Ark files and puts the read pointer pointing @@ -295,9 +291,9 @@ def reset(self): def eof(self): """Returns True when all the elements in the scp have been read.""" - return self.cur_item == len(self.scp) + return self.cur_item == len(self.feature_set) - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. 
Args: @@ -318,15 +314,18 @@ def read_shapes(self, num_records=0, assert_same_dim=True): for i in range(num_records): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + offset = feature_spec["storage_byte"] + file_path = feature_spec["storage_path"] self._open_archive(file_path, offset) binary = init_kaldi_input_stream(self.f) shape_i = KaldiMatrix.read_shape(self.f, binary, sequential_mode=True) - - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) keys.append(key) shapes.append(shape_i) @@ -338,7 +337,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -359,12 +364,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if num_records == 0: num_records = len(self.scp) - self.cur_item - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] @@ -373,7 +374,14 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + offset = feature_spec["storage_byte"] + file_path = feature_spec["storage_path"] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows @@ -417,21 +425,24 @@ class RandomAccessArkDataReader(RandomAccessDataReader): features after reading them from disk. permissive: If True, if the data that we want to read is not in the file it returns an empty matrix, if False it raises an exception. - scp_sep: Separator for scp files (default ' '). 
""" def __init__( - self, file_path, path_prefix=None, transform=None, permissive=False, scp_sep=" " + self, + file_path: PathLike, + path_prefix: Optional[PathLike] = None, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, ): - super(RandomAccessArkDataReader, self).__init__( - file_path, transform, permissive - ) + super().__init__(file_path, transform, permissive) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) - archives, archive_idx = np.unique(self.scp.file_path, return_inverse=True) + archives, archive_idx = np.unique( + self.feature_set["storage_path"], return_inverse=True + ) self.archives = archives self.archive_idx = archive_idx self.f = [None] * len(self.archives) @@ -448,7 +459,7 @@ def close(self): f.close() self.f = [None] * len(self.f) - def _open_archive(self, key_idx, offset=0): + def _open_archive(self, key_idx: int, offset: int = 0): """Opens the Ark file correspoding to a given feature/matrix if it is not already open and moves the file pointer to the point where we can read that feature matrix. @@ -473,7 +484,9 @@ def _open_archive(self, key_idx, offset=0): return f, self.locks[archive_idx] - def read_num_rows(self, keys, assert_same_dim=True): + def read_num_rows( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -489,7 +502,9 @@ def read_num_rows(self, keys, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=np.int) return num_rows - def read_dims(self, keys, assert_same_dim=True): + def read_dims( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -507,7 +522,9 @@ def read_dims(self, keys, assert_same_dim=True): assert np.all(dims == dims[0]) return dims - def read_shapes(self, keys, assert_same_dim=True): + def read_shapes( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the shapes in the feature matrices of the dataset. 
Args: @@ -525,25 +542,26 @@ def read_shapes(self, keys, assert_same_dim=True): shapes = [] for key in keys: - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: shapes.append((0,)) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) - + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] + offset = feature_spec["storage_byte"] f, lock = self._open_archive(index) with lock: f.seek(offset, 0) binary = init_kaldi_input_stream(f) shape_i = KaldiMatrix.read_shape(f, binary, sequential_mode=False) - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) shapes.append(shape_i) @@ -553,7 +571,13 @@ def read_shapes(self, keys, assert_same_dim=True): return shapes - def read(self, keys, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str], np.array], + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: @@ -574,12 +598,8 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: @@ -588,15 +608,20 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): data = [] for i, key in enumerate(keys): - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: data.append(np.array([], dtype=float_cpu())) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] + offset = feature_spec["storage_byte"] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows diff --git a/hyperion/io/ark_data_writer.py b/hyperion/io/ark_data_writer.py index 58f5c0a1..6adf78b2 100644 --- a/hyperion/io/ark_data_writer.py +++ b/hyperion/io/ark_data_writer.py @@ -3,15 +3,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +from typing import Union, Optional, List import numpy as np from ..hyp_defs import float_save -from ..utils.kaldi_io_funcs import (init_kaldi_output_stream, is_token, - write_token) +from ..utils.kaldi_io_funcs import init_kaldi_output_stream, is_token, write_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix -from ..utils.scp_list import SCPList +from ..utils import PathLike from .data_writer import DataWriter @@ -28,11 +27,17 @@ class ArkDataWriter(DataWriter): {auto 
(default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 1byte-unsigned-integer, 1byte-0-1}. - scp_sep: Separator for scp files (default ' '). + """ - def __init__(self, archive_path, script_path=None, binary=True, **kwargs): - super(ArkDataWriter, self).__init__(archive_path, script_path, **kwargs) + def __init__( + self, + archive_path: PathLike, + script_path: Optional[PathLike] = None, + binary: bool = True, + **kwargs, + ): + super().__init__(archive_path, script_path, **kwargs) self.binary = binary if binary: @@ -40,10 +45,9 @@ def __init__(self, archive_path, script_path=None, binary=True, **kwargs): else: self.f = open(archive_path, "w") - if script_path is not None: - self.f_script = open(script_path, "w") - else: - self.f_script = None + if script_path is not None and not self.script_is_scp: + row = self.script_sep.join(["id", "storage_path", "storage_byte"]) + self.f_script.write(f"{row}\n") def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type @@ -67,7 +71,7 @@ def flush(self): if self.f_script is not None: self.f_script.flush() - def _convert_data(self, data): + def _convert_data(self, data: np.array): """Converts the feature matrix from numpy array to KaldiMatrix or KaldiCompressedMatrix. """ @@ -89,7 +93,11 @@ def _convert_data(self, data): raise ValueError("Data is not ndarray or KaldiMatrix") - def write(self, keys, data): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + ): """Writes data to file. Args: @@ -114,9 +122,11 @@ def write(self, keys, data): data_i.write(self.f, self.binary) if self.f_script is not None: - self.f_script.write( - "%s%s%s:%d\n" % (key_i, self.scp_sep, self.archive_path, pos) - ) + if self.script_is_scp: + self.f_script.write(f"{key_i} {self.archive_path}:{pos}\n") + else: + row = self.script_sep.join([key_i, self.archive_path, str(pos)]) + self.f_script.write(f"{row}\n") if self._flush: self.flush() diff --git a/hyperion/io/audio_reader.py b/hyperion/io/audio_reader.py index 69cfa65b..1052ce8c 100644 --- a/hyperion/io/audio_reader.py +++ b/hyperion/io/audio_reader.py @@ -10,11 +10,13 @@ import subprocess import numpy as np +import pandas as pd import soundfile as sf from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from typing import Union, Optional, List from ..hyp_defs import float_cpu -from ..utils import SCPList, SegmentList +from ..utils import RecordingSet, SegmentSet, PathLike valid_ext = [ ".wav", @@ -34,7 +36,7 @@ ".sds", ".sf", ".voc", - "w64", + ".w64", ".wve", ".xi", ] @@ -44,38 +46,36 @@ class AudioReader(object): """Class to read audio files from wav, flac or pipe Attributes: - file_path: scp file with formant file_key wavspecifier (audio_file/pipe) or SCPList object. 
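For context, a minimal sketch of the updated writer API, assuming the module path hyperion.io.ark_data_writer from this diff; with a .csv script path the writer emits the "id,storage_path,storage_byte" header shown above instead of Kaldi scp lines:

import numpy as np
from hyperion.io.ark_data_writer import ArkDataWriter

feats = [np.random.randn(100, 40), np.random.randn(80, 40)]
with ArkDataWriter("exp/feats.ark", script_path="exp/feats.csv") as writer:
    # matrices go to the ark archive; the csv records where each one was stored
    writer.write(["utt1", "utt2"], feats)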
- segments_path: segments file with format: segment_id file_id tbeg tend + recordings: RecordingSet or file path to RecordingSet + segments: SegmentSet or file path to SegmentSet wav_scale: multiplies signal by scale factor """ - def __init__(self, file_path, segments_path=None, wav_scale=2**15 - 1): - self.file_path = file_path - if isinstance(file_path, SCPList): - self.scp = file_path - else: - self.scp = SCPList.load(file_path, sep=" ", is_wav=True) - - self.segments_path = segments_path - if segments_path is None: - self.segments = None - self.with_segments = False - else: + def __init__( + self, + recordings: Union[RecordingSet, PathLike], + segments: Union[SegmentSet, PathLike, None] = None, + wav_scale: float = 2 ** 15 - 1, + ): + if not isinstance(recordings, RecordingSet): + recordings = RecordingSet.load(recordings) + + self.recordings = recordings + + self.with_segments = False + if segments is not None: self.with_segments = True - if isinstance(file_path, SegmentList): - self.segments = segments_path - else: - self.segments = SegmentList.load(segments_path, - sep=" ", - index_by_file=False) + if not isinstance(segments, SegmentSet): + segments = SegmentSet.load(segments) + self.segments = segments self.wav_scale = wav_scale @property def keys(self): if self.with_segments: - return np.asarray(self.segments["segment_id"]) - return self.scp.key + return self.segments["id"].values + return self.recordings["id"].values def __enter__(self): """Function required when entering contructions of type @@ -94,10 +94,12 @@ def __exit__(self, exc_type, exc_value, traceback): pass @staticmethod - def read_wavspecifier(wavspecifier, - scale=2**15, - time_offset=0, - time_dur=0): + def read_wavspecifier( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0.0, + time_dur: float = 0.0, + ): """Reads an audiospecifier (audio_file/pipe) It reads from pipe or from all the files that can be read by `libsndfile ` @@ -113,59 +115,123 @@ def read_wavspecifier(wavspecifier, wavspecifier = wavspecifier.strip() if wavspecifier[-1] == "|": wavspecifier = wavspecifier[:-1] - x, fs = AudioReader.read_pipe(wavspecifier, scale) - if time_offset == 0 and time_dur == 0: - return x, fs - - start_sample = int(math.floor(time_offset * fs)) - num_samples = int(math.floor(time_dur * fs)) - if num_samples == 0: - return x[start_sample:], fs - - end_sample = start_sample + num_samples - assert end_sample <= len(x) - return x[start_sample:end_sample], fs + return AudioReader.read_pipe(wavspecifier, scale, time_offset, time_dur) ext = os.path.splitext(wavspecifier)[1] if ext in valid_ext: - if time_offset == 0 and time_dur == 0: - x, fs = sf.read(wavspecifier, dtype=float_cpu()) - x *= scale - return x, fs - - with sf.SoundFile(wavspecifier, "r") as f: - fs = f.samplerate - start_sample = int(math.floor(time_offset * fs)) - num_samples = int(math.floor(time_dur * fs)) - f.seek(start_sample) - if num_samples > 0: - x = scale * f.read(num_samples, dtype=float_cpu()) - else: - x = scale * f.read(dtype=float_cpu()) - return x, fs + return AudioReader.read_file(wavspecifier, scale, time_offset, time_dur) raise Exception("Unknown format for %s" % (wavspecifier)) @staticmethod - def read_pipe(wavspecifier, scale=2**15): + def read_pipe( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0, + time_dur: float = 0, + ): """Reads wave file from a pipe Args: wavspecifier: Shell command with pipe output scale: Multiplies signal by scale factor """ - # proc = 
subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - proc = subprocess.Popen(wavspecifier, - shell=True, - stdout=subprocess.PIPE) + if wavspecifier[-1] == "|": + wavspecifier = wavspecifier[:-1] + + proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE) pipe = proc.communicate()[0] if proc.returncode != 0: - raise Exception("Wave read pipe command %s returned code %d" % - (wavspecifier, proc.returncode)) + raise Exception( + "Wave read pipe command %s returned code %d" + % (wavspecifier, proc.returncode) + ) x, fs = sf.read(io.BytesIO(pipe), dtype=float_cpu()) x *= scale - return x, fs + if time_offset == 0 and time_dur == 0: + return x, fs + + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + if num_samples == 0: + return x[start_sample:], fs + + end_sample = start_sample + num_samples + assert end_sample <= len(x) + return x[start_sample:end_sample], fs + + @staticmethod + def read_file_sf( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0, + time_dur: float = 0, + ): + if time_offset == 0 and time_dur == 0: + x, fs = sf.read(wavspecifier, dtype=float_cpu()) + x *= scale + return x, fs + + with sf.SoundFile(wavspecifier, "r") as f: + fs = f.samplerate + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + f.seek(start_sample) + if num_samples > 0: + x = scale * f.read(num_samples, dtype=float_cpu()) + else: + x = scale * f.read(dtype=float_cpu()) + + return x, fs + + @staticmethod + def read_file( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0, + time_dur: float = 0, + ): + try: + return AudioReader.read_file_sf(wavspecifier, scale, time_offset, time_dur) + except: + # some files produce error in the fseek after reading the data, + # this seems an issue from pysoundfile or soundfile lib itself + # we try to read from + # time-offset to the end of the file, and remove the extra frames later, + # this solves the problem in most cases + logging.info( + ( + "error-1 reading keys=%s offset=%f duration=%f" + "retrying reading until end-of-file ..." + ), + wavspecifier, + time_offset, + time_dur, + ) + try: + x, fs = AudioReader.read_file_sf(wavspecifier, scale, time_offset) + num_samples = int(math.floor(time_dur * fs)) + x = x[:num_samples] + return x, fs + except: + logging.info( + ( + "error-2 reading keys=%s offset=%f duration=%f" + "retrying reading full file ..." 
+ ), + wavspecifier, + time_offset, + time_dur, + ) + + x, fs = AudioReader.read_file_sf(wavspecifier, scale) + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + x = x[start_sample : start_sample + num_samples] + return x, fs - def _read_segment(self, segment, time_offset=0, time_dur=0): + def _read_segment( + self, segment: pd.Series, time_offset: float = 0, time_dur: float = 0 + ): """Reads a wave segment Args: @@ -173,28 +239,11 @@ def _read_segment(self, segment, time_offset=0, time_dur=0): Returns: Wave, sampling frequency """ - file_id = segment["file_id"] - t_beg = segment["tbeg"] + time_offset - t_end = segment["tend"] - if time_dur > 0: - t_end_new = t_beg + time_dur - assert t_end_new <= t_end - t_end = t_end_new - - file_path, _, _ = self.scp[file_id] - x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale) - num_samples_i = len(x_i) - s_beg = int(t_beg * fs_i) - if s_beg >= num_samples_i: - raise Exception( - "segment %s tbeg=%.2f (num_sample=%d) longer that wav file %s (num_samples=%d)" - % (file_id, t_beg, s_beg, file_id, num_samples_i)) - - s_end = int(t_end * fs_i) - if s_end > num_samples_i or t_end < 0: - s_end = num_samples_i - - x_i = x_i[s_beg:s_end] + recording_id = segment["recording_id"] + t_start = segment["start"] + time_offset + t_dur = segment["duration"] + storage_path = self.recordings.loc[recording_id, "storage_path"] + x_i, fs_i = self.read_wavspecifier(storage_path, self.wav_scale, t_start, t_dur) return x_i, fs_i def read(self): @@ -202,27 +251,23 @@ def read(self): class SequentialAudioReader(AudioReader): - def __init__( self, - file_path, - segments_path=None, - wav_scale=2**15 - 1, - part_idx=1, - num_parts=1, + recordings: Union[RecordingSet, PathLike], + segments: Union[SegmentSet, PathLike, None] = None, + wav_scale: float = 2 ** 15 - 1, + part_idx: int = 1, + num_parts: int = 1, ): - super().__init__(file_path, segments_path, wav_scale=wav_scale) + super().__init__(recordings, segments, wav_scale=wav_scale) self.cur_item = 0 self.part_idx = part_idx self.num_parts = num_parts if self.num_parts > 1: if self.with_segments: - self.segments = self.segments.split(self.part_idx, - self.num_parts) + self.segments = self.segments.split(self.part_idx, self.num_parts) else: - self.scp = self.scp.split(self.part_idx, - self.num_parts, - group_by_key=False) + self.recordings = self.recordings.split(self.part_idx, self.num_parts) def __iter__(self): """Needed to build an iterator, e.g.: @@ -262,9 +307,9 @@ def eof(self): """ if self.with_segments: return self.cur_item == len(self.segments) - return self.cur_item == len(self.scp) + return self.cur_item == len(self.recordings) - def read(self, num_records=0, time_offset=0, time_durs=0): + def read(self, num_records: int = 0, time_offset: float = 0, time_durs: float = 0): """Reads next num_records audio files Args: @@ -281,7 +326,7 @@ def read(self, num_records=0, time_offset=0, time_durs=0): if self.with_segments: num_records = len(self.segments) - self.cur_item else: - num_records = len(self.scp) - self.cur_item + num_records = len(self.recordings) - self.cur_item offset_is_list = isinstance(time_offset, (list, np.ndarray)) dur_is_list = isinstance(time_durs, (list, np.ndarray)) @@ -297,13 +342,14 @@ def read(self, num_records=0, time_offset=0, time_durs=0): dur_i = time_durs[i] if dur_is_list else time_durs if self.with_segments: - segment = self.segments[self.cur_item] - key = segment["segment_id"] + segment = self.segments.iloc[self.cur_item] + key = 
segment["id"] x_i, fs_i = self._read_segment(segment, offset_i, dur_i) else: - key, file_path, _, _ = self.scp[self.cur_item] - x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale, - offset_i, dur_i) + key, file_path = self.recordings.iloc[self.cur_item] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) keys.append(key) data.append(x_i) @@ -318,14 +364,14 @@ def filter_args(**kwargs): return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[str] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") parser.add_argument( "--wav-scale", - default=2**15 - 1, + default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) @@ -334,38 +380,50 @@ def add_class_args(parser, prefix=None): "--part-idx", type=int, default=1, - help=("splits the list of files into num-parts and " - "processes part-idx"), + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), ) parser.add_argument( "--num-parts", type=int, default=1, - help=("splits the list of files into num-parts and " - "processes part-idx"), + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), ) except: pass if prefix is not None: outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), + "--" + prefix, action=ActionParser(parser=parser), ) add_argparse_args = add_class_args class RandomAccessAudioReader(AudioReader): + def __init__( + self, + recordings: Union[RecordingSet, PathLike], + segments: Union[SegmentSet, PathLike, None] = None, + wav_scale: float = 2 ** 15 - 1, + ): + super().__init__(recordings, segments, wav_scale) - def __init__(self, file_path, segments_path=None, wav_scale=2**15 - 1): - super().__init__(file_path, segments_path, wav_scale) - - def _read(self, keys, time_offset=0, time_durs=0): + def read( + self, + keys: Union[str, List, np.array], + time_offset: float = 0, + time_durs: float = 0, + ): """Reads the waveforms for the recordings in keys. Args: keys: List of recording/segment_ids names. + time_offset: float or float list with time-offsets + time_durs: float or float list with durations Returns: data: List of waveforms @@ -384,93 +442,92 @@ def _read(self, keys, time_offset=0, time_durs=0): dur_i = time_durs[i] if dur_is_list else time_durs if self.with_segments: - if not (key in self.segments): + if not (key in self.segments.index): raise Exception("Key %s not found" % key) - segment = self.segments[key] + segment = self.segments.loc[key] x_i, fs_i = self._read_segment(segment, offset_i, dur_i) else: - if not (key in self.scp): + if not (key in self.recordings.index): raise Exception("Key %s not found" % key) - file_path, _, _ = self.scp[key] - x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale, - offset_i, dur_i) + file_path = self.recordings.loc[key, "storage_path"] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) data.append(x_i) fs.append(fs_i) return data, fs - def read(self, keys, time_offset=0, time_durs=0): - """Reads the waveforms for the recordings in keys. - - Args: - keys: List of recording/segment_ids names. - - Returns: - data: List of waveforms - fs: List of sampling freq. 
- """ - try: - x, fs = self._read(keys, - time_offset=time_offset, - time_durs=time_durs) - except: - if isinstance(keys, str): - keys = [keys] - - if not isinstance(time_offset, (list, np.ndarray)): - time_offset = [time_offset] * len(keys) - if not isinstance(time_durs, (list, np.ndarray)): - time_durs = [time_durs] * len(keys) - - try: - # some files produce error in the fseek after reading the data, - # this seems an issue from pysoundfile or soundfile lib itself - # we try to read from - # time-offset to the end of the file, and remove the extra frames later, - # this solves the problem in most cases - logging.info(("error-1 reading at keys={} offset={} " - "retrying reading until end-of-file ...").format( - keys, time_offset)) - x, fs = self._read(keys, time_offset=time_offset) - for i in range(len(x)): - end_sample = int(time_durs[i] * fs[i]) - x[i] = x[i][:end_sample] - except: - # try to read the full file - logging.info(("error-2 reading at key={}, " - "retrying reading full file ...").format(keys)) - x, fs = self._read(keys) - for i in range(len(x)): - start_sample = int(time_offset[i] * fs[i]) - end_sample = start_sample + int(time_durs[i] * fs[i]) - x[i] = x[i][start_sample:end_sample] - - return x, fs + # def read(self, keys, time_offset=0, time_durs=0): + # """Reads the waveforms for the recordings in keys. + + # Args: + # keys: List of recording/segment_ids names. + + # Returns: + # data: List of waveforms + # fs: List of sampling freq. + # """ + # try: + # x, fs = self._read(keys, time_offset=time_offset, time_durs=time_durs) + # except: + # if isinstance(keys, str): + # keys = [keys] + + # if not isinstance(time_offset, (list, np.ndarray)): + # time_offset = [time_offset] * len(keys) + # if not isinstance(time_durs, (list, np.ndarray)): + # time_durs = [time_durs] * len(keys) + + # try: + # logging.info( + # ( + # "error-1 reading at keys={} offset={} " + # "retrying reading until end-of-file ..." + # ).format(keys, time_offset) + # ) + # x, fs = self._read(keys, time_offset=time_offset) + # for i in range(len(x)): + # end_sample = int(time_durs[i] * fs[i]) + # x[i] = x[i][:end_sample] + # except: + # # try to read the full file + # logging.info( + # ( + # "error-2 reading at key={}, " "retrying reading full file ..." 
+ # ).format(keys) + # ) + # x, fs = self._read(keys) + # for i in range(len(x)): + # start_sample = int(time_offset[i] * fs[i]) + # end_sample = start_sample + int(time_durs[i] * fs[i]) + # x[i] = x[i][start_sample:end_sample] + + # return x, fs @staticmethod def filter_args(**kwargs): - valid_args = ("wav_scale", ) + valid_args = ("wav_scale",) return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[str] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") parser.add_argument( "--wav-scale", - default=2**15 - 1, + default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) if prefix is not None: outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), + "--" + prefix, action=ActionParser(parser=parser), ) add_argparse_args = add_class_args diff --git a/hyperion/io/audio_writer.py b/hyperion/io/audio_writer.py index f98a3251..e416c209 100644 --- a/hyperion/io/audio_writer.py +++ b/hyperion/io/audio_writer.py @@ -8,12 +8,16 @@ import numpy as np import soundfile as sf +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from typing import Union, Optional, List +from pathlib import Path from ..hyp_defs import float_cpu from ..utils.kaldi_io_funcs import is_token -from ..utils.scp_list import SCPList +from ..utils import PathLike from .audio_reader import valid_ext + subtype_to_npdtype = { "PCM_32": "int32", "ALAW": "int16", @@ -37,25 +41,23 @@ class AudioWriter(object): Attributes: output_path: output data file path. - script_path: optional output scp file. + script_path: optional output kaldi .scp or pandas .csv file. audio_format: audio file format audio_subtype: subtype of audio in [PCM_16, PCM_32, FLOAT, DOUBLE, ...], if None, it uses soundfile defaults (recommended) - scp_sep: Separator for scp files (default ' '). """ def __init__( self, - output_path, - script_path=None, - audio_format="wav", - audio_subtype=None, - scp_sep=" ", + output_path: PathLike, + script_path: Optional[PathLike] = None, + audio_format: str = "wav", + audio_subtype: Optional[str] = None, ): - self.output_path = output_path - self.script_path = script_path + self.output_path = Path(output_path) + self.script_path = Path(script_path) if script_path is not None else None self.audio_format = audio_format - self.scp_sep = scp_sep + self.output_path.mkdir(exist_ok=True, parents=True) assert "." 
+ self.audio_format in valid_ext if audio_subtype is None: @@ -64,16 +66,23 @@ def __init__( self.subtype = audio_subtype assert sf.check_format(self.audio_format, self.subtype) - if not os.path.exists(output_path): - try: - os.makedirs(output_path) - except FileExistsError: - pass - + self.script_is_scp = False + self.script_sep = None + self.f_script = None if script_path is not None: - self.f_script = open(script_path, "w") - else: - self.f_script = None + self.script_path.parent.mkdir(exist_ok=True, parents=True) + script_ext = self.script_path.suffix + self.script_is_scp = script_ext == ".scp" + + if self.script_is_scp: + self.f_script = open(self.script_path, "w") + else: + self.script_sep = "," if script_ext == ".csv" else "\t" + self.f_script = open(self.script_path, "w", "utf-8") + row = self.script_sep.join( + ["id", "storage_path", "duration", "sample_freq"] + ) + self.f_script.write(f"{row}\n") def __enter__(self): """Function required when entering contructions of type @@ -96,7 +105,12 @@ def close(self): if self.f_script is not None: self.f_script.close() - def write(self, keys, data, fs): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + fs: Union[int, float, List[int], List[float], np.array], + ): """Writes waveform to audio file. Args: @@ -120,14 +134,21 @@ def write(self, keys, data, fs): file_basename, self.audio_format, ) - fs_i = fs[i] if fs_is_list else fs + fs_i = int(fs[i]) if fs_is_list else fs data_i = data[i].astype(dtype, copy=False) sf.write(output_file, data_i, fs_i, subtype=self.subtype) output_files.append(output_file) if self.f_script is not None: - self.f_script.write("%s%s%s\n" % (key_i, self.scp_sep, output_file)) + if self.script_is_scp: + self.f_script.write(f"{key_i} {output_file}\n") + else: + duration_i = data_i.shape[-1] / fs_i + row = self.script_sep.join( + [key_i, output_file, str(duration_i), str(fs_i)] + ) + self.f_script.write(f"{row}\n") self.f_script.flush() return output_files @@ -146,29 +167,30 @@ def filter_args(**kwargs): @staticmethod def add_class_args(parser, prefix=None): - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
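A sketch of the intended AudioWriter usage, assuming the module path from this diff; with a .csv script it writes the "id,storage_path,duration,sample_freq" header shown above:

import numpy as np
from hyperion.io.audio_writer import AudioWriter

fs = 16000
# 1 s tone scaled to the library's 2**15 - 1 waveform convention
tone = 0.5 * (2 ** 15 - 1) * np.sin(2 * np.pi * 440 * np.arange(fs) / fs)
with AudioWriter("exp/audio", script_path="exp/audio.csv",
                 audio_format="flac") as writer:
    writer.write(["utt1"], [tone], fs)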
+ if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") # parser.add_argument(p1+'output-wav-scale', default=1, type=float, # help=('scale to divide the waveform before writing')) parser.add_argument( - p1 + "output-audio-format", + "--output-audio-format", default="flac", choices=["flac", "ogg", "wav"], help=("ouput audio format"), ) parser.add_argument( - p1 + "output-audio-subtype", + "--output-audio-subtype", default=None, choices=["pcm_16", "pcm_24", "float", "double", "vorbis"], help=("coding format for audio file"), ) - # parser.add_argument(p1+'output-fs', default=16000, type=int, - # help=('output sample frequency')) + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) add_argparse_args = add_class_args diff --git a/hyperion/io/bin_vad_reader.py b/hyperion/io/bin_vad_reader.py index e4e64777..82e2a0c5 100644 --- a/hyperion/io/bin_vad_reader.py +++ b/hyperion/io/bin_vad_reader.py @@ -18,13 +18,12 @@ def __init__( self, rspecifier, path_prefix=None, - scp_sep=" ", frame_length=25, frame_shift=10, snip_edges=False, ): - r = DRF.create(rspecifier, path_prefix, scp_sep=scp_sep) + r = DRF.create(rspecifier, path_prefix) super().__init__(r.file_path, r.permissive) self.r = r self.frame_shift = frame_shift diff --git a/hyperion/io/data_reader.py b/hyperion/io/data_reader.py index bbefa62d..73c120b5 100644 --- a/hyperion/io/data_reader.py +++ b/hyperion/io/data_reader.py @@ -6,18 +6,24 @@ import logging import multiprocessing from abc import ABCMeta, abstractmethod +from typing import Union, Optional, List, Callable, Tuple import numpy as np from ..hyp_defs import float_cpu from ..np.transforms import TransformList -from ..utils.scp_list import SCPList +from ..utils import PathLike class DataReader(object): __metaclass__ = ABCMeta - def __init__(self, file_path, transform=None, permissive=False): + def __init__( + self, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + ): """Abstract base class to read Ark or hdf5 feature files. Attributes: @@ -57,7 +63,7 @@ def close(self): pass @staticmethod - def _squeeze(data, permissive=False): + def _squeeze(data: np.array, permissive: bool = False): """Converts list of matrices to 3D numpy array or list of vectors to 2D numpy array. @@ -121,7 +127,7 @@ def _combine_ranges(read_range, row_offset, num_rows): return row_offset, num_rows @staticmethod - def _apply_range_to_shape(shape, row_offset, num_rows): + def _apply_range_to_shape(shape: Tuple[int, int], row_offset: int, num_rows: int): """Modifies shape given the user defined row_offset and num_rows to read. If we are reading a matrix of shape (100,4) and row_offset=10, num_rows=20, it returns (20,4). @@ -158,25 +164,22 @@ class SequentialDataReader(DataReader): part_idx: It splits the input into num_parts and writes only part part_idx, where part_idx=1,...,num_parts. num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. 
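The row-range helpers referenced above reduce to simple arithmetic; a standalone sketch (not the library implementation) that reproduces the documented (100, 4) example:

def apply_range_to_shape(shape, row_offset, num_rows):
    # vectors are returned as stored; matrices get their row count trimmed
    if len(shape) != 2:
        return shape
    total_rows = shape[0]
    if num_rows == 0:  # 0 means "read until the end"
        num_rows = total_rows - row_offset
    assert row_offset + num_rows <= total_rows
    return (num_rows, shape[1])

print(apply_range_to_shape((100, 4), 10, 20))  # (20, 4)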
""" __metaclass__ = ABCMeta def __init__( self, - file_path, - transform=None, - permissive=False, - part_idx=1, - num_parts=1, - split_by_key=False, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + part_idx: int = 1, + num_parts: int = 1, ): super().__init__(file_path, transform, permissive) self.lock = multiprocessing.Lock() self.part_idx = part_idx self.num_parts = num_parts - self.split_by_key = split_by_key def __iter__(self): """Needed to build an iterator, e.g.: @@ -218,7 +221,7 @@ def eof(self): return False @abstractmethod - def read_num_rows(self, num_records=0, assert_same_dim=True): + def read_num_rows(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -234,7 +237,7 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): pass @abstractmethod - def read_dims(self, num_records=0, assert_same_dim=True): + def read_dims(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -250,7 +253,7 @@ def read_dims(self, num_records=0, assert_same_dim=True): pass @abstractmethod - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -266,7 +269,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): pass @abstractmethod - def read(self, num_records=0, squeeze=False, offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -290,7 +299,12 @@ def read(self, num_records=0, squeeze=False, offset=0, num_rows=0): class RandomAccessDataReader(DataReader): __metaclass__ = ABCMeta - def __init__(self, file_path, transform=None, permissive=False): + def __init__( + self, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + ): """Abstract base class to read Ark or hdf5 feature files in random order. @@ -305,7 +319,7 @@ def __init__(self, file_path, transform=None, permissive=False): super().__init__(file_path, transform, permissive) @abstractmethod - def read_num_rows(self, keys=None, assert_same_dim=True): + def read_num_rows(self, keys: Union[str, List[str]], assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -320,7 +334,7 @@ def read_num_rows(self, keys=None, assert_same_dim=True): pass @abstractmethod - def read_dims(self, keys=None, assert_same_dim=True): + def read_dims(self, keys: Union[str, List[str]], assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -335,7 +349,7 @@ def read_dims(self, keys=None, assert_same_dim=True): pass @abstractmethod - def read_shapes(self, keys=None, assert_same_dim=True): + def read_shapes(self, keys: Union[str, List[str]], assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. 
Args: @@ -350,7 +364,13 @@ def read_shapes(self, keys=None, assert_same_dim=True): pass @abstractmethod - def read(self, keys, squeeze=False, offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str]], + squeeze: bool = False, + offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: diff --git a/hyperion/io/data_rw_factory.py b/hyperion/io/data_rw_factory.py index 7868baae..b56e8c27 100644 --- a/hyperion/io/data_rw_factory.py +++ b/hyperion/io/data_rw_factory.py @@ -4,10 +4,13 @@ """ import logging +from typing import Union, Optional, List, Callable, Tuple from jsonargparse import ActionParser, ArgumentParser +import numpy as np from ..utils.kaldi_matrix import compression_methods +from ..utils import PathLike from .ark_data_reader import RandomAccessArkDataReader as RADR from .ark_data_reader import SequentialArkFileDataReader as SAFDR from .ark_data_reader import SequentialArkScriptDataReader as SASDR @@ -17,8 +20,7 @@ from .h5_data_reader import SequentialH5FileDataReader as SH5FDR from .h5_data_reader import SequentialH5ScriptDataReader as SH5SDR from .h5_data_writer import H5DataWriter as H5DW -from .rw_specifiers import (ArchiveType, RSpecifier, RSpecType, WSpecifier, - WSpecType) +from .rw_specifiers import ArchiveType, RSpecifier, RSpecType, WSpecifier, WSpecType class DataWriterFactory(object): @@ -27,7 +29,9 @@ class DataWriterFactory(object): """ @staticmethod - def create(wspecifier, compress=False, compression_method="auto", scp_sep=" "): + def create( + wspecifier: PathLike, compress: bool = False, compression_method: str = "auto" + ): if isinstance(wspecifier, str): wspecifier = WSpecifier.create(wspecifier) @@ -43,7 +47,6 @@ def create(wspecifier, compress=False, compression_method="auto", scp_sep=" "): flush=wspecifier.flush, compress=compress, compression_method=compression_method, - scp_sep=scp_sep, ) else: return ADW( @@ -53,21 +56,19 @@ def create(wspecifier, compress=False, compression_method="auto", scp_sep=" "): flush=wspecifier.flush, compress=compress, compression_method=compression_method, - scp_sep=scp_sep, ) @staticmethod def filter_args(**kwargs): - valid_args = ("scp_sep", "compress", "compression_method") + valid_args = ("compress", "compression_method") return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[PathLike] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) parser.add_argument("--compress", default=False, action="store_true") parser.add_argument( "--compression-method", default="auto", choices=compression_methods @@ -80,7 +81,7 @@ def add_class_args(parser, prefix=None): class SequentialDataReaderFactory(object): @staticmethod - def create(rspecifier, path_prefix=None, scp_sep=" ", **kwargs): + def create(rspecifier: PathLike, path_prefix: Optional[PathLike] = None, **kwargs): if isinstance(rspecifier, str): rspecifier = RSpecifier.create(rspecifier) @@ -92,27 +93,21 @@ def create(rspecifier, path_prefix=None, scp_sep=" ", **kwargs): return SAFDR(rspecifier.archive, **kwargs) else: if rspecifier.archive_type == ArchiveType.H5: - return SH5SDR(rspecifier.script, path_prefix, scp_sep=scp_sep, **kwargs) + return SH5SDR(rspecifier.script, path_prefix, **kwargs) else: - return SASDR(rspecifier.script, path_prefix, scp_sep=scp_sep, **kwargs) + return 
SASDR(rspecifier.script, path_prefix, **kwargs) @staticmethod def filter_args(**kwargs): - valid_args = ("scp_sep", "path_prefix", "part_idx", "num_parts") + valid_args = ("path_prefix", "part_idx", "num_parts") return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[PathLike] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") - try: - parser.add_argument( - "--scp-sep", default=" ", help=("scp file field separator") - ) - except: - pass parser.add_argument( "--path-prefix", default=None, help=("scp file_path prefix") ) @@ -139,7 +134,11 @@ def add_class_args(parser, prefix=None): class RandomAccessDataReaderFactory(object): @staticmethod - def create(rspecifier, path_prefix=None, transform=None, scp_sep=" "): + def create( + rspecifier: PathLike, + path_prefix: Optional[PathLike] = None, + transform: Optional[Callable[[np.array], np.array]] = None, + ): if isinstance(rspecifier, str): rspecifier = RSpecifier.create(rspecifier) logging.debug(rspecifier.__dict__) @@ -162,7 +161,6 @@ def create(rspecifier, path_prefix=None, transform=None, scp_sep=" "): path_prefix, transform=transform, permissive=rspecifier.permissive, - scp_sep=scp_sep, ) else: return RADR( @@ -170,26 +168,19 @@ def create(rspecifier, path_prefix=None, transform=None, scp_sep=" "): path_prefix, transform=transform, permissive=rspecifier.permissive, - scp_sep=scp_sep, ) @staticmethod def filter_args(**kwargs): - valid_args = ("scp_sep", "path_prefix") + valid_args = "path_prefix" return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[PathLike] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") - try: - parser.add_argument( - "--scp-sep", default=" ", help=("scp file field separator") - ) - except: - pass parser.add_argument( "--path-prefix", default=None, help=("scp file_path prefix") ) diff --git a/hyperion/io/data_writer.py b/hyperion/io/data_writer.py index cf2bb4f9..8adbf87a 100644 --- a/hyperion/io/data_writer.py +++ b/hyperion/io/data_writer.py @@ -5,9 +5,13 @@ import os from abc import ABCMeta, abstractmethod +from typing import Union, Optional, List +from pathlib import Path +import numpy as np +from ..utils import PathLike -class DataWriter(object): +class DataWriter: """Abstract base class to write Ark or hdf5 feature files. Attributes: @@ -19,35 +23,42 @@ class DataWriter(object): {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 1byte-unsigned-integer, 1byte-0-1}. - scp_sep: Separator for scp files (default ' '). 
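A sketch of creating readers through the factories with Kaldi-style read specifiers; whether the "scp:" specifier accepts the .csv FeatureSet script written earlier is an assumption here:

from hyperion.io.data_rw_factory import (
    RandomAccessDataReaderFactory,
    SequentialDataReaderFactory,
)

seq_reader = SequentialDataReaderFactory.create("ark:exp/feats.ark")
rnd_reader = RandomAccessDataReaderFactory.create("scp:exp/feats.csv")
x = rnd_reader.read(["utt1"])[0]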
""" __metaclass__ = ABCMeta def __init__( self, - archive_path, - script_path=None, - flush=False, - compress=False, - compression_method="auto", - scp_sep=" ", + archive_path: PathLike, + script_path: Optional[PathLike] = None, + flush: bool = False, + compress: bool = False, + compression_method: str = "auto", ): - self.archive_path = archive_path - self.script_path = script_path + self.archive_path = Path(archive_path) + self.script_path = Path(script_path) if script_path is not None else None self._flush = flush self.compress = compress self.compression_method = compression_method - self.scp_sep = scp_sep - archive_dir = os.path.dirname(archive_path) - if not os.path.exists(archive_dir): - os.makedirs(archive_dir) + archive_dir = self.archive_path.parent + archive_dir.mkdir(exist_ok=True, parents=True) + self.script_is_scp = False + self.script_sep = None + self.f_script = None if script_path is not None: - script_dir = os.path.dirname(script_path) - if not os.path.exists(script_dir): - os.makedirs(script_dir) + self.script_path.parent.mkdir(exist_ok=True, parents=True) + script_ext = self.script_path.suffix + self.script_is_scp = script_ext == ".scp" + + if self.script_is_scp: + self.f_script = open(self.script_path, "w") + else: + self.script_sep = "," if script_ext == ".csv" else "\t" + self.f_script = open(self.script_path, "w", "utf-8") + row = self.script_sep.join(["id", "storage_path"]) + self.f_script.write(f"{row}\n") def __enter__(self): """Function required when entering contructions of type @@ -77,7 +88,11 @@ def flush(self): pass @abstractmethod - def write(self, key, data): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + ): """Writes data to file. Args: diff --git a/hyperion/io/h5_data_reader.py b/hyperion/io/h5_data_reader.py index dfefbec3..d509504d 100644 --- a/hyperion/io/h5_data_reader.py +++ b/hyperion/io/h5_data_reader.py @@ -6,8 +6,8 @@ """ import multiprocessing -import sys import time +from typing import Union, Optional, List, Callable, Tuple import h5py import numpy as np @@ -16,11 +16,18 @@ from ..utils.kaldi_io_funcs import is_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix from ..utils.list_utils import split_list, split_list_group_by_key -from ..utils.scp_list import SCPList + +# from ..utils.scp_list import SCPList +from ..utils import FeatureSet, PathLike from .data_reader import RandomAccessDataReader, SequentialDataReader -def _read_h5_data(dset, row_offset=0, num_rows=0, transform=None): +def _read_h5_data( + dset, + row_offset: int = 0, + num_rows: int = 0, + transform: Optional[Callable[[np.array], np.array]] = None, +): """Auxiliary function to read the feature matrix from hdf5 dataset. It decompresses the data if it was compressed. @@ -74,7 +81,7 @@ class SequentialH5DataReader(SequentialDataReader): split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, **kwargs) self.f = None self.cur_file = None @@ -86,7 +93,7 @@ def close(self): self.f.close() self.f = None - def _open_archive(self, file_path): + def _open_archive(self, file_path: PathLike): """Opens the hdf5 file where the next matrix/vector is if it is not open. If there was another hdf5 file open, it closes it. 
@@ -96,7 +103,7 @@ def _open_archive(self, file_path): self.cur_file = file_path self.f = h5py.File(file_path, "r") - def read_num_rows(self, num_records=0, assert_same_dim=True): + def read_num_rows(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -113,7 +120,7 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return keys, num_rows - def read_dims(self, num_records=0, assert_same_dim=True): + def read_dims(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -147,7 +154,7 @@ class SequentialH5FileDataReader(SequentialH5DataReader): split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, permissive=False, **kwargs) self._open_archive(self.file_path) self._keys = list(self.f.keys()) @@ -172,7 +179,7 @@ def eof(self): """Returns True when it reaches the end of the ark file.""" return self.cur_item == len(self._keys) - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -204,7 +211,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -225,12 +238,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if num_records == 0: num_records = len(self._keys) - self.cur_item - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] with self.lock: @@ -268,7 +277,6 @@ class SequentialH5ScriptDataReader(SequentialH5DataReader): the scp file. This is useful when data is read from a different directory of that it was created. - scp_sep: Separator for scp files (default ' '). transform: TransformList object, applies a transformation to the features after reading them from disk. part_idx: It splits the input into num_parts and writes only @@ -277,20 +285,20 @@ class SequentialH5ScriptDataReader(SequentialH5DataReader): split_by_key: If True, all the elements with the same key go to the same part. 
""" - def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): + def __init__( + self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs + ): super().__init__(file_path, permissive=False, **kwargs) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if self.num_parts > 1: - self.scp = self.scp.split( - self.part_idx, self.num_parts, group_by_key=self.split_by_key - ) + self.feature_set = self.feature_set.split(self.part_idx, self.num_parts) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) @property def keys(self): - return self.scp.key + return self.feature_set["id"] def reset(self): """Closes all the open hdf5 files and puts the read pointer pointing @@ -300,9 +308,9 @@ def reset(self): def eof(self): """Returns True when all the elements in the scp have been read.""" - return self.cur_item == len(self.scp) + return self.cur_item == len(self.feature_set) - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -316,7 +324,7 @@ def read_shapes(self, num_records=0, assert_same_dim=True): List of tuples with num_records shapes. """ if num_records == 0: - num_records = len(self.scp) - self.cur_item + num_records = len(self.feature_set) - self.cur_item keys = [] shapes = [] @@ -324,14 +332,15 @@ def read_shapes(self, num_records=0, assert_same_dim=True): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) - - self._open_archive(file_path) + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + self._open_archive(feature_spec["storage_path"]) shape_i = self.f[key].shape - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) keys.append(key) shapes.append(shape_i) @@ -343,7 +352,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -362,14 +377,10 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): data: List of feature matrices/vectors or 3D/2D numpy array. 
""" if num_records == 0: - num_records = len(self.scp) - self.cur_item + num_records = len(self.feature_set) - self.cur_item - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] @@ -378,7 +389,13 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + file_path = feature_spec["storage_path"] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows @@ -413,11 +430,18 @@ class RandomAccessH5DataReader(RandomAccessDataReader): it returns an empty matrix, if False it raises an exception. """ - def __init__(self, file_path, transform=None, permissive=False): + def __init__( + self, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + ): super().__init__(file_path, transform, permissive) self.f = None - def read_num_rows(self, keys, assert_same_dim=True): + def read_num_rows( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -433,7 +457,9 @@ def read_num_rows(self, keys, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return num_rows - def read_dims(self, keys, assert_same_dim=True): + def read_dims( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -463,7 +489,7 @@ class RandomAccessH5FileDataReader(RandomAccessH5DataReader): it returns an empty matrix, if False it raises an exception. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, **kwargs) self.lock = multiprocessing.Lock() self._open_archive(file_path) @@ -474,7 +500,7 @@ def close(self): self.f.close() self.f = None - def _open_archive(self, file_path): + def _open_archive(self, file_path: PathLike): """Open the hdf5 file it it is not open.""" if self.f is None: self.close() @@ -484,7 +510,9 @@ def _open_archive(self, file_path): def keys(self): return list(self.f.keys()) - def read_shapes(self, keys, assert_same_dim=True): + def read_shapes( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the shapes in the feature matrices of the dataset. Args: @@ -518,7 +546,13 @@ def read_shapes(self, keys, assert_same_dim=True): return shapes - def read(self, keys, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str], np.array], + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. 
Args: @@ -539,12 +573,8 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: @@ -589,17 +619,20 @@ class RandomAccessH5ScriptDataReader(RandomAccessH5DataReader): features after reading them from disk. permissive: If True, if the data that we want to read is not in the file it returns an empty matrix, if False it raises an exception. - scp_sep: Separator for scp files (default ' '). """ - def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): + def __init__( + self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs + ): super().__init__(file_path, **kwargs) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) - archives, archive_idx = np.unique(self.scp.file_path, return_inverse=True) + archives, archive_idx = np.unique( + self.feature_set["storage_path"], return_inverse=True + ) self.archives = archives self.archive_idx = archive_idx self.f = [None] * len(self.archives) @@ -614,9 +647,9 @@ def close(self): @property def keys(self): - return self.scp.key + return self.feature_set["id"] - def _open_archive(self, key_idx): + def _open_archive(self, key_idx: int): """Opens the hdf5 file correspoding to a given feature/matrix if it is not already open. @@ -633,7 +666,9 @@ def _open_archive(self, key_idx): return self.f[archive_idx], self.locks[archive_idx] - def read_shapes(self, keys, assert_same_dim=True): + def read_shapes( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the shapes in the feature matrices of the dataset. 
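Both H5 readers ultimately reduce to plain h5py access with one dataset per utterance id; a standalone sketch of that pattern:

import os
import h5py
import numpy as np

os.makedirs("exp", exist_ok=True)
with h5py.File("exp/feats.h5", "w") as f:
    f.create_dataset("utt1", data=np.random.randn(100, 40).astype("float32"))

with h5py.File("exp/feats.h5", "r") as f:
    print(f["utt1"].shape)    # (100, 40), read without loading the data
    chunk = f["utt1"][10:30]  # rows 10..29, as done for row_offset/num_rows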
Args: @@ -651,18 +686,15 @@ def read_shapes(self, keys, assert_same_dim=True): shapes = [] for key in keys: - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: shapes.append((0,)) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) - + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] f, lock = self._open_archive(index) if not (key in f): if self.permissive: @@ -673,8 +705,12 @@ def read_shapes(self, keys, assert_same_dim=True): with lock: shape_i = f[key].shape - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) - # print('%s %d %.2f' % (key,time.time()-t1, len(shapes)/len(keys)*100.)) + + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + shapes.append(shape_i) if assert_same_dim: @@ -683,7 +719,13 @@ def read_shapes(self, keys, assert_same_dim=True): return shapes - def read(self, keys, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str], np.array], + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: @@ -704,12 +746,8 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: @@ -718,15 +756,19 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): data = [] for i, key in enumerate(keys): - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: data.append(np.array([], dtype=float_cpu())) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows diff --git a/hyperion/io/h5_data_writer.py b/hyperion/io/h5_data_writer.py index fed91d1e..c34aa0ca 100644 --- a/hyperion/io/h5_data_writer.py +++ b/hyperion/io/h5_data_writer.py @@ -3,7 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +from typing import Union, Optional, List import h5py import numpy as np @@ -11,7 +11,7 @@ from ..hyp_defs import float_save from ..utils.kaldi_io_funcs import is_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix -from ..utils.scp_list import SCPList +from ..utils import PathLike from .data_writer import DataWriter @@ -27,18 +27,18 @@ class H5DataWriter(DataWriter): {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 
1byte-unsigned-integer, 1byte-0-1}. - scp_sep: Separator for scp files (default ' '). """ - def __init__(self, archive_path, script_path=None, **kwargs): + def __init__( + self, archive_path: PathLike, script_path: Optional[PathLike] = None, **kwargs + ): super().__init__(archive_path, script_path, **kwargs) self.f = h5py.File(archive_path, "w") - if script_path is None: - self.f_script = None - else: - self.f_script = open(script_path, "w") + if script_path is not None and not self.script_is_scp: + row = self.script_sep.join(["id", "storage_path"]) + self.f_script.write(f"{row}\n") def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type @@ -64,7 +64,7 @@ def flush(self): if self.f_script is not None: self.f_script.flush() - def _convert_data(self, data): + def _convert_data(self, data: np.array): """Converts data to the format for saving. Compresses the data it needed. Args: @@ -85,7 +85,11 @@ def _convert_data(self, data): else: raise ValueError("Data is not ndarray") - def write(self, keys, data): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + ): """Writes data to file. Args: @@ -108,9 +112,11 @@ def write(self, keys, data): dset.attrs[k] = v if self.f_script is not None: - self.f_script.write( - "%s%s%s\n" % (key_i, self.scp_sep, self.archive_path) - ) + if self.script_is_scp: + self.f_script.write(f"{key_i} {self.archive_path}\n") + else: + row = self.script_sep.join([key_i, self.archive_path]) + self.f_script.write(f"{row}\n") if self._flush: self.flush() diff --git a/hyperion/io/old_audio_reader.py b/hyperion/io/old_audio_reader.py new file mode 100644 index 00000000..341f04a4 --- /dev/null +++ b/hyperion/io/old_audio_reader.py @@ -0,0 +1,477 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import io +import logging +import math +import os +import subprocess + +import numpy as np +import soundfile as sf +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ..hyp_defs import float_cpu +from ..utils import SCPList, SegmentList + +valid_ext = [ + ".wav", + ".flac", + ".ogg", + ".au", + ".avr", + ".caf", + ".htk", + ".iff", + ".mat", + ".mpc", + ".oga", + ".pvf", + ".rf64", + ".sd2", + ".sds", + ".sf", + ".voc", + "w64", + ".wve", + ".xi", +] + + +class AudioReader(object): + """Class to read audio files from wav, flac or pipe + + Attributes: + file_path: scp file with formant file_key wavspecifier (audio_file/pipe) or SCPList object. 
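A sketch of the intended H5DataWriter usage, assuming the module path from this diff; with a .csv script it writes the "id,storage_path" header shown above:

import numpy as np
from hyperion.io.h5_data_writer import H5DataWriter

xvectors = [np.random.randn(256).astype("float32") for _ in range(3)]
with H5DataWriter("exp/xvectors.h5", script_path="exp/xvectors.csv") as writer:
    writer.write(["spk1", "spk2", "spk3"], xvectors)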
+ segments_path: segments file with format: segment_id file_id tbeg tend + wav_scale: multiplies signal by scale factor + """ + + def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1): + self.file_path = file_path + if isinstance(file_path, SCPList): + self.scp = file_path + else: + self.scp = SCPList.load(file_path, sep=" ", is_wav=True) + + self.segments_path = segments_path + if segments_path is None: + self.segments = None + self.with_segments = False + else: + self.with_segments = True + if isinstance(file_path, SegmentList): + self.segments = segments_path + else: + self.segments = SegmentList.load( + segments_path, sep=" ", index_by_file=False + ) + + self.wav_scale = wav_scale + + @property + def keys(self): + if self.with_segments: + return np.asarray(self.segments["segment_id"]) + return self.scp.key + + def __enter__(self): + """Function required when entering contructions of type + + with AudioReader('file.h5') as f: + keys, data = f.read() + """ + return self + + def __exit__(self, exc_type, exc_value, traceback): + """Function required when exiting from contructions of type + + with AudioReader('file.h5') as f: + keys, data = f.read() + """ + pass + + @staticmethod + def read_wavspecifier(wavspecifier, scale=2 ** 15, time_offset=0, time_dur=0): + """Reads an audiospecifier (audio_file/pipe) + It reads from pipe or from all the files that can be read + by `libsndfile ` + + Args: + wavspecifier: A pipe, wav, flac, ogg file etc. + scale: Multiplies signal by scale factor + time_offset: float indicating the start time to read in the utterance. + time_durs: floats indicating the number of seconds to read from the utterance, + if 0 it reads untils the end + + """ + wavspecifier = wavspecifier.strip() + if wavspecifier[-1] == "|": + wavspecifier = wavspecifier[:-1] + x, fs = AudioReader.read_pipe(wavspecifier, scale) + if time_offset == 0 and time_dur == 0: + return x, fs + + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + if num_samples == 0: + return x[start_sample:], fs + + end_sample = start_sample + num_samples + assert end_sample <= len(x) + return x[start_sample:end_sample], fs + + ext = os.path.splitext(wavspecifier)[1] + if ext in valid_ext: + if time_offset == 0 and time_dur == 0: + x, fs = sf.read(wavspecifier, dtype=float_cpu()) + x *= scale + return x, fs + + with sf.SoundFile(wavspecifier, "r") as f: + fs = f.samplerate + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + f.seek(start_sample) + if num_samples > 0: + x = scale * f.read(num_samples, dtype=float_cpu()) + else: + x = scale * f.read(dtype=float_cpu()) + return x, fs + + raise Exception("Unknown format for %s" % (wavspecifier)) + + @staticmethod + def read_pipe(wavspecifier, scale=2 ** 15): + """Reads wave file from a pipe + Args: + wavspecifier: Shell command with pipe output + scale: Multiplies signal by scale factor + """ + # proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE) + pipe = proc.communicate()[0] + if proc.returncode != 0: + raise Exception( + "Wave read pipe command %s returned code %d" + % (wavspecifier, proc.returncode) + ) + x, fs = sf.read(io.BytesIO(pipe), dtype=float_cpu()) + x *= scale + return x, fs + + def _read_segment(self, segment, time_offset=0, time_dur=0): + """Reads a wave segment + + Args: + segment: pandas DataFrame (segment_id , file_id, 
tbeg, tend) + Returns: + Wave, sampling frequency + """ + file_id = segment["file_id"] + t_beg = segment["tbeg"] + time_offset + t_end = segment["tend"] + if time_dur > 0: + t_end_new = t_beg + time_dur + assert t_end_new <= t_end + t_end = t_end_new + + file_path, _, _ = self.scp[file_id] + x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale) + num_samples_i = len(x_i) + s_beg = int(t_beg * fs_i) + if s_beg >= num_samples_i: + raise Exception( + "segment %s tbeg=%.2f (num_sample=%d) longer that wav file %s (num_samples=%d)" + % (file_id, t_beg, s_beg, file_id, num_samples_i) + ) + + s_end = int(t_end * fs_i) + if s_end > num_samples_i or t_end < 0: + s_end = num_samples_i + + x_i = x_i[s_beg:s_end] + return x_i, fs_i + + def read(self): + pass + + +class SequentialAudioReader(AudioReader): + def __init__( + self, + file_path, + segments_path=None, + wav_scale=2 ** 15 - 1, + part_idx=1, + num_parts=1, + ): + super().__init__(file_path, segments_path, wav_scale=wav_scale) + self.cur_item = 0 + self.part_idx = part_idx + self.num_parts = num_parts + if self.num_parts > 1: + if self.with_segments: + self.segments = self.segments.split(self.part_idx, self.num_parts) + else: + self.scp = self.scp.split( + self.part_idx, self.num_parts, group_by_key=False + ) + + def __iter__(self): + """Needed to build an iterator, e.g.: + r = SequentialAudioReader(...) + for key, s, fs in r: + print(key) + process(s) + """ + return self + + def __next__(self): + """Needed to build an iterator, e.g.: + r = SequentialAudioReader(...) + for key , s, fs in r: + process(s) + """ + key, x, fs = self.read(1) + if len(key) == 0: + raise StopIteration + return key[0], x[0], fs[0] + + def next(self): + """__next__ for Python 2""" + return self.__next__() + + def reset(self): + """Returns the file pointer to the begining of the dataset, + then we can start reading the features again. + """ + self.cur_item = 0 + + def eof(self): + """End of file. + + Returns: + True, when we have read all the recordings in the dataset. + """ + if self.with_segments: + return self.cur_item == len(self.segments) + return self.cur_item == len(self.scp) + + def read(self, num_records=0, time_offset=0, time_durs=0): + """Reads next num_records audio files + + Args: + num_records: Number of audio files to read. + time_offset: List of floats indicating the start time to read in the utterance. + time_durs: List of floats indicating the number of seconds to read from each utterance + + Returns: + key: List of recording names. 
+ data: List of waveforms + fs: list of sample freqs + """ + if num_records == 0: + if self.with_segments: + num_records = len(self.segments) - self.cur_item + else: + num_records = len(self.scp) - self.cur_item + + offset_is_list = isinstance(time_offset, (list, np.ndarray)) + dur_is_list = isinstance(time_durs, (list, np.ndarray)) + + keys = [] + data = [] + fs = [] + for i in range(num_records): + if self.eof(): + break + + offset_i = time_offset[i] if offset_is_list else time_offset + dur_i = time_durs[i] if dur_is_list else time_durs + + if self.with_segments: + segment = self.segments[self.cur_item] + key = segment["segment_id"] + x_i, fs_i = self._read_segment(segment, offset_i, dur_i) + else: + key, file_path, _, _ = self.scp[self.cur_item] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) + + keys.append(key) + data.append(x_i) + fs.append(fs_i) + self.cur_item += 1 + + return keys, data, fs + + @staticmethod + def filter_args(**kwargs): + valid_args = ("part_idx", "num_parts", "wav_scale") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--wav-scale", + default=2 ** 15 - 1, + type=float, + help=("multiplicative factor for waveform"), + ) + try: + parser.add_argument( + "--part-idx", + type=int, + default=1, + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), + ) + parser.add_argument( + "--num-parts", + type=int, + default=1, + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), + ) + except: + pass + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + + add_argparse_args = add_class_args + + +class RandomAccessAudioReader(AudioReader): + def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1): + super().__init__(file_path, segments_path, wav_scale) + + def _read(self, keys, time_offset=0, time_durs=0): + """Reads the waveforms for the recordings in keys. + + Args: + keys: List of recording/segment_ids names. + + Returns: + data: List of waveforms + """ + if isinstance(keys, str): + keys = [keys] + + offset_is_list = isinstance(time_offset, (list, np.ndarray)) + dur_is_list = isinstance(time_durs, (list, np.ndarray)) + + data = [] + fs = [] + for i, key in enumerate(keys): + + offset_i = time_offset[i] if offset_is_list else time_offset + dur_i = time_durs[i] if dur_is_list else time_durs + + if self.with_segments: + if not (key in self.segments): + raise Exception("Key %s not found" % key) + + segment = self.segments[key] + x_i, fs_i = self._read_segment(segment, offset_i, dur_i) + else: + if not (key in self.scp): + raise Exception("Key %s not found" % key) + + file_path, _, _ = self.scp[key] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) + + data.append(x_i) + fs.append(fs_i) + + return data, fs + + def read(self, keys, time_offset=0, time_durs=0): + """Reads the waveforms for the recordings in keys. + + Args: + keys: List of recording/segment_ids names. + + Returns: + data: List of waveforms + fs: List of sampling freq. 
+ """ + try: + x, fs = self._read(keys, time_offset=time_offset, time_durs=time_durs) + except: + if isinstance(keys, str): + keys = [keys] + + if not isinstance(time_offset, (list, np.ndarray)): + time_offset = [time_offset] * len(keys) + if not isinstance(time_durs, (list, np.ndarray)): + time_durs = [time_durs] * len(keys) + + try: + # some files produce error in the fseek after reading the data, + # this seems an issue from pysoundfile or soundfile lib itself + # we try to read from + # time-offset to the end of the file, and remove the extra frames later, + # this solves the problem in most cases + logging.info( + ( + "error-1 reading at keys={} offset={} " + "retrying reading until end-of-file ..." + ).format(keys, time_offset) + ) + x, fs = self._read(keys, time_offset=time_offset) + for i in range(len(x)): + end_sample = int(time_durs[i] * fs[i]) + x[i] = x[i][:end_sample] + except: + # try to read the full file + logging.info( + ( + "error-2 reading at key={}, " "retrying reading full file ..." + ).format(keys) + ) + x, fs = self._read(keys) + for i in range(len(x)): + start_sample = int(time_offset[i] * fs[i]) + end_sample = start_sample + int(time_durs[i] * fs[i]) + x[i] = x[i][start_sample:end_sample] + + return x, fs + + @staticmethod + def filter_args(**kwargs): + valid_args = ("wav_scale",) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--wav-scale", + default=2 ** 15 - 1, + type=float, + help=("multiplicative factor for waveform"), + ) + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + + add_argparse_args = add_class_args diff --git a/hyperion/io/vad_rw_factory.py b/hyperion/io/vad_rw_factory.py index 32032d1d..fff1ab4a 100644 --- a/hyperion/io/vad_rw_factory.py +++ b/hyperion/io/vad_rw_factory.py @@ -6,8 +6,7 @@ import logging from .bin_vad_reader import BinVADReader as BVR -from .rw_specifiers import (ArchiveType, RSpecifier, RSpecType, WSpecifier, - WSpecType) +from .rw_specifiers import ArchiveType, RSpecifier, RSpecType, WSpecifier, WSpecType from .segment_vad_reader import SegmentVADReader as SVR @@ -16,7 +15,6 @@ class VADReaderFactory(object): def create( rspecifier, path_prefix=None, - scp_sep=" ", frame_length=25, frame_shift=10, snip_edges=False, @@ -33,7 +31,6 @@ def create( return BVR( rspecifier, path_prefix, - scp_sep, frame_length=frame_length, frame_shift=frame_shift, snip_edges=snip_edges, @@ -48,7 +45,6 @@ def create( return BVR( rspecifier, path_prefix, - scp_sep, frame_length=frame_length, frame_shift=frame_shift, snip_edges=snip_edges, @@ -57,7 +53,6 @@ def create( @staticmethod def filter_args(**kwargs): valid_args = ( - "scp_sep", "path_prefix", "frame_shift", "frame_length", @@ -72,9 +67,6 @@ def add_class_args(parser, prefix=None): else: p1 = "--" + prefix + "." 
- parser.add_argument( - p1 + "scp-sep", default=" ", help=("scp file field separator") - ) parser.add_argument( p1 + "path-prefix", default=None, help=("scp file_path prefix") ) diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 1e42a1c3..fa675fdb 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -9,7 +9,8 @@ import numpy as np import pandas as pd -#import k2 + +# import k2 import sentencepiece as spm import torchaudio.transforms as tat from jsonargparse import ActionParser, ActionYesNo, ArgumentParser @@ -25,16 +26,11 @@ from ...utils.text import read_text from ..torch_defs import floatstr_torch -#from torch.nn.utils.rnn import pad_sequence - - - class AudioDataset(Dataset): - def __init__( self, - audio_file, + recordings_file, segments_file, class_names=None, class_files=None, @@ -46,7 +42,7 @@ def __init__( return_segment_info=None, return_orig=False, target_sample_freq=None, - wav_scale=2**15 - 1, + wav_scale=2 ** 15 - 1, is_val=False, ): @@ -61,12 +57,6 @@ def __init__( self.rank = rank self.world_size = world_size self.epoch = 0 - - if rank == 0: - logging.info("opening audio reader %s", audio_file) - - self.r = AR(audio_file, wav_scale=wav_scale) - if rank == 0: logging.info("loading segments file %s", segments_file) @@ -74,17 +64,17 @@ def __init__( if rank == 0: logging.info("dataset contains %d seqs", len(self.seg_set)) + if rank == 0: + logging.info("opening audio reader %s", recordings_file) + + audio_seg_set = self.seg_set if self.seg_set.has_time_marks else None + self.r = AR(recordings_file, segments=audio_seg_set, wav_scale=wav_scale) + self.is_val = is_val if time_durs_file is not None: - if rank == 0: - logging.info("loading durations file %s", time_durs_file) + self._load_legacy_durations(time_durs_file) - time_durs = SegmentSet.load(time_durs_file) - self.seg_set["duration"] = time_durs.loc[ - self.seg_set["id"]].class_id.values.astype(np.float, - copy=False) - else: - assert "duration" in self.seg_set + assert "duration" in self.seg_set logging.info("loading class-info files") self._load_class_infos(class_names, class_files, is_val) @@ -96,8 +86,9 @@ def __init__( if text_file is not None: logging.info("loading text files") self._load_text_infos(text_file, is_val) - self.return_segment_info = ([] if return_segment_info is None else - return_segment_info) + self.return_segment_info = ( + [] if return_segment_info is None else return_segment_info + ) self.return_orig = return_orig self.num_augs = num_augs @@ -106,9 +97,18 @@ def __init__( self.target_sample_freq = target_sample_freq self.resamplers = {} + def _load_legacy_durations(self, time_durs_file): + if self.rank == 0: + logging.info("loading durations file %s", time_durs_file) + + time_durs = SegmentSet.load(time_durs_file) + self.seg_set["duration"] = time_durs.loc[ + self.seg_set["id"] + ].class_id.values.astype(np.float, copy=False) + def _load_bpe_model(self, bpe_model, is_val): if self.rank == 0: - logging.info("loading bpe file %s" % bpe_model) + logging.info("loading bpe file %s", bpe_model) self.sp = spm.SentencePieceProcessor() self.sp.load(bpe_model) blank_id = self.sp.piece_to_id("") @@ -118,7 +118,7 @@ def _load_text_infos(self, text_file, is_val): if text_file is None: return if self.rank == 0: - logging.info("loading text file %s" % text_file) + logging.info("loading text file %s", text_file) text = read_text(text_file) self.seg_set["text"] = text.loc[self.seg_set["id"]].text @@ -131,8 +131,9 @@ def 
_load_class_infos(self, class_names, class_files, is_val): assert len(class_names) == len(class_files) for name, file in zip(class_names, class_files): - assert (name in self.seg_set - ), f"class_name {name} not present in the segment set" + assert ( + name in self.seg_set + ), f"class_name {name} not present in the segment set" if self.rank == 0: logging.info("loading class-info file %s" % file) table = ClassInfo.load(file) @@ -143,8 +144,9 @@ def _load_class_infos(self, class_names, class_files, is_val): segment_class_ids = self.seg_set[name].unique() for c_id in class_ids: if c_id not in segment_class_ids: - logging.warning("%s class: %s not present in dataset", - name, c_id) + logging.warning( + "%s class: %s not present in dataset", name, c_id + ) def _create_augmenters(self, aug_cfgs): self.augmenters = [] @@ -154,12 +156,11 @@ def _create_augmenters(self, aug_cfgs): for aug_cfg in aug_cfgs: logging.info(f"loading augmentation={aug_cfg}") - augmenter = SpeechAugment.create(aug_cfg, - random_seed=112358 + - 1000 * self.rank) + augmenter = SpeechAugment.create( + aug_cfg, random_seed=112358 + 1000 * self.rank + ) self.augmenters.append(augmenter) - self.reverb_context = max(augmenter.max_reverb_context, - self.reverb_context) + self.reverb_context = max(augmenter.max_reverb_context, self.reverb_context) def set_epoch(self, epoch): self.epoch = epoch @@ -201,12 +202,13 @@ def _parse_segment_item(self, segment): assert duration <= self.seg_set.loc[seg_id].duration, ( f"{seg_id} with start={start} duration " f"({self.seg_set.loc[seg_id].duration}) < " - f"chunk duration ({duration})") + f"chunk duration ({duration})" + ) else: seg_id, start, duration = segment, 0, 0 - if "start" in self.seg_set: - start += self.seg_set.loc[seg_id].start + # if "start" in self.seg_set: + # start += self.seg_set.loc[seg_id].start return seg_id, start, duration @@ -217,14 +219,23 @@ def _read_audio(self, seg_id, start, duration): start -= reverb_context read_duration = duration + reverb_context + # read audio + x, fs = self.r.read([seg_id], time_offset=start, time_durs=read_duration) + return x[0].astype(floatstr_torch(), copy=False), fs[0] + + def _read_audio0(self, seg_id, start, duration): + # how much extra audio we need to load to + # calculate the reverb of the first part of the audio + reverb_context = min(self.reverb_context, start) + start -= reverb_context + read_duration = duration + reverb_context + # read audio recording_id = self.seg_set.recording_ids(seg_id) - x, fs = self.r.read([recording_id], - time_offset=start, - time_durs=read_duration) + x, fs = self.r.read([recording_id], time_offset=start, time_durs=read_duration) return x[0].astype(floatstr_torch(), copy=False), fs[0] - def _apply_augs(self, x, num_samples, reverb_context_samples): + def _apply_augs(self, x, reverb_context_samples): x_augs = {} # for each type of augmentation for i, augmenter in enumerate(self.augmenters): @@ -233,7 +244,7 @@ def _apply_augs(self, x, num_samples, reverb_context_samples): # augment x x_aug, aug_info = augmenter(x) # remove the extra left context used to compute the reverberation. 
- x_aug = x_aug[reverb_context_samples:len(x)] + x_aug = x_aug[reverb_context_samples : len(x)] x_aug = x_aug.astype(floatstr_torch(), copy=False) x_augs[f"x_aug_{i}_{j}"] = x_aug @@ -300,7 +311,7 @@ def __getitem__(self, segment): else: num_samples = int(duration * fs) reverb_context_samples = len(x) - num_samples - x_augs = self._apply_augs(x, num_samples, reverb_context_samples) + x_augs = self._apply_augs(x, reverb_context_samples) data.update(x_augs) # add original non augmented audio @@ -311,15 +322,6 @@ def __getitem__(self, segment): else: data["x"] = x - # try: - # import soundfile as sf - - # for i, z in enumerate(r): - # sf.write(f"file_{seg_id}.wav", z, fs, "PCM_16") - # except: - # print("soundfile failed", flush=True) - - # adds the segment labels seg_info = self._get_segment_info(seg_id) data.update(seg_info) return data @@ -329,7 +331,7 @@ def filter_args(**kwargs): ar_args = AR.filter_args(**kwargs) valid_args = ( - "audio_file", + "recordings_file", "segments_file", "aug_cfgs", "num_augs", @@ -352,48 +354,43 @@ def add_class_args(parser, prefix=None, skip=set()): outer_parser = parser parser = ArgumentParser(prog="") - if "audio_file" not in skip: + if "recordings_file" not in skip: parser.add_argument( - "--audio-file", + "--recordings-file", required=True, - help=("audio manifest file"), + help=("recordings manifest file (kaldi .scp or pandas .csv)"), ) if "segments_file" not in skip: parser.add_argument( "--segments-file", required=True, - help=("segments manifest file"), + help=("segments manifest file (kaldi .scp or pandas .csv)"), ) parser.add_argument( "--class-names", default=None, nargs="+", - help= - ("list with the names of the types of classes in the datasets, e.g., speaker, language" - ), + help=( + "list with the names of the types of classes in the datasets, e.g., speaker, language" + ), ) parser.add_argument( - "--class-files", - default=None, - nargs="+", - help=("list of class info files"), + "--class-files", default=None, nargs="+", help=("list of class info files"), ) parser.add_argument( "--time-durs-file", default=None, - help= - ("segment to duration in secs file, if durations are not in segments_file" - ), + help=( + "(deprecated) segment to duration in secs file, if durations are not in segments_file" + ), ) parser.add_argument( - "--bpe-model", - default=None, - help=("bpe model for the text label"), + "--bpe-model", default=None, help=("bpe model for the text label"), ) parser.add_argument( @@ -418,32 +415,31 @@ def add_class_args(parser, prefix=None, skip=set()): "--return-segment-info", default=None, nargs="+", - help= - ("list of columns of the segment file which should be returned as supervisions" - ), + help=( + "list of columns of the segment file which should be returned as supervisions" + ), ) parser.add_argument( "--return-orig", default=False, action=ActionYesNo, - help= - ("when using augmentation, whether or not to return also the original audio" - ), + help=( + "when using augmentation, whether or not to return also the original audio" + ), ) parser.add_argument( "--target-sample-freq", default=None, type=int, - help= - ("target sampling frequencey, if not None all audios are converted to this sample freq" - ), + help=( + "target sampling frequencey, if not None all audios are converted to this sample freq" + ), ) AR.add_class_args(parser) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) # help='audio 
dataset options') add_argparse_args = add_class_args diff --git a/hyperion/utils/feature_set.py b/hyperion/utils/feature_set.py index 2b2f0aaf..7e40dfd6 100644 --- a/hyperion/utils/feature_set.py +++ b/hyperion/utils/feature_set.py @@ -9,6 +9,7 @@ import pandas as pd from .info_table import InfoTable +from .misc import PathLike class FeatureSet(InfoTable): @@ -16,6 +17,9 @@ def __init__(self, df): super().__init__(df) assert "storage_path" in df + def add_prefix_to_storage_path(self, prefix: PathLike): + self.df["storge_path"] = self.df["storage_path"].apply(lambda x: f"{prefix}{x}") + def save(self, file_path, sep=None): """Saves info table to file @@ -31,14 +35,14 @@ def save(self, file_path, sep=None): from .scp_list import SCPList offset = self.df["storage_byte"] if "storage_byte" in self.df else None - range = None + range_spec = None if "start" and "num_frames" in self.df: - range = [ + range_spec = [ np.array([s, n], dtype=np.int64) for s, n in self.df[["start", "num_frames"]] ] scp = SCPList( - self.df["id"].values, self.df["storage_path"].values, offset, range + self.df["id"].values, self.df["storage_path"].values, offset, range_spec ) scp.save(file_path) return @@ -67,9 +71,9 @@ def load(cls, file_path, sep=None): if scp.offset is not None: df["storage_byte"] = scp.offset - if scp.range is not None: - df["start"] = [r[0] for r in scp.range] - df["num_frames"] = [r[0] for r in scp.range] + if scp.range_spec is not None: + df["start"] = [r[0] for r in scp.range_spec] + df["num_frames"] = [r[1] for r in scp.range_spec] return cls(df) diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index a3a1da27..5a4f27d2 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -22,6 +22,7 @@ class InfoTable: Attributes: df: pandas dataframe. """ + def __init__(self, df): self.df = df assert "id" in df, f"info_table={df}" @@ -137,10 +138,7 @@ def load(cls, file_path, sep=None, name="class_id"): sep=" ", header=None, names=["id", name], - dtype={ - "id": np.str, - name: np.str - }, + dtype={"id": np.str, name: np.str}, ) else: if sep is None: @@ -163,17 +161,16 @@ def split(self, idx, num_parts, group_by=None): Args: idx: Part to return from 1 to num_parts. num_parts: Number of parts to split the list. 
- group_by_field: All the lines with the same value in column + group_by: All the lines with the same value in column groub_by_field go to the same part Returns: - Sub Utt2Info object + Sub InfoTable object """ - if group_by is None: + if group_by is None or group_by == "id": _, idx1 = split_list(self.df["id"], idx, num_parts) else: - _, idx1 = split_list_group_by_key(self.df[group_by], idx, - num_parts) + _, idx1 = split_list_group_by_key(self.df[group_by], idx, num_parts) df = self.df.iloc[idx1] return self.__class__(df) @@ -192,14 +189,10 @@ def merge(cls, tables): df = pd.concat(df_list) return cls(df) - def filter(self, - items=None, - iindex=None, - columns=None, - by="id", - keep=True): - assert (items is None or iindex is None - ), "items and iindex cannot be not None at the same time" + def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): + assert ( + items is None or iindex is None + ), "items and iindex cannot be not None at the same time" df = self.df if not keep: diff --git a/hyperion/utils/segment_set.py b/hyperion/utils/segment_set.py index f9da69fa..d51edc34 100644 --- a/hyperion/utils/segment_set.py +++ b/hyperion/utils/segment_set.py @@ -9,9 +9,36 @@ class SegmentSet(InfoTable): def __init__(self, df): super().__init__(df) + if "start" in df and "recording_id" not in df: + df["recording_id"] = df["id"] + + if "start" not in df and "recording_id" in df: + df["start"] = 0.0 + + @property + def has_time_marks(self): + return ( + "recording_id" in self.df and "start" in self.df and "duration" in self.df + ) + + @property + def has_recording_ids(self): + return "recording_id" in self.df def recording_ids(self, ids): if "recording_id" in self.df: return self.df.loc[ids, "recording_id"] return ids + + def recording_time_marks(self, ids): + if "recording" in self.df: + rec_col = "recording_id" + else: + rec_col = "id" + + assert "duration" in self.df + if "start" not in self.df: + self.df["start"] = 0.0 + + return self.df.loc[ids, [rec_col, "start", "duration"]] diff --git a/hyperion/utils/utt2info.py b/hyperion/utils/utt2info.py index 9785d021..edf2c23a 100644 --- a/hyperion/utils/utt2info.py +++ b/hyperion/utils/utt2info.py @@ -142,7 +142,7 @@ def save(self, file_path, sep=" "): self.utt_info.to_csv(file_path, sep=sep, header=False, index=False) @classmethod - def load(cls, file_path, sep=" ", dtype={0: np.str, 1: np.str}): + def load(cls, file_path, sep=" ", dtype={0: np.str_, 1: np.str_}): """Loads utt2info list from text file. 
Args: From c408f7428b7443761a0142a7b010dacf16aeaf2b Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 11 May 2023 14:15:47 -0400 Subject: [PATCH 19/89] some fixes in sre21 --- ...rain_ecapatdnn2048x4_xvec_stage1_v1.0.yaml | 102 +++++++++++++ ...rain_ecapatdnn2048x4_xvec_stage2_v1.0.yaml | 66 +++++++++ ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 47 +----- .../v1.16k/local/make_voxceleb1cat_v2.pl | 4 +- egs/sre21-av-a/v1.16k/run_002_compute_evad.sh | 39 ----- .../v1.16k/run_011_train_xvector.sh | 53 ++++++- egs/voxceleb/v1.1/README.md | 52 ++++--- ...train_res2net50w26s4_xvec_stage1_v3.0.yaml | 72 +++++++++ ...train_res2net50w26s4_xvec_stage2_v3.0.yaml | 69 +++++++++ ...train_res2net50w26s8_xvec_stage1_v3.0.yaml | 72 +++++++++ ...train_res2net50w26s8_xvec_stage2_v3.0.yaml | 69 +++++++++ .../train_tseresnet34_xvec_stage1_v3.0.yaml | 4 +- .../config_fbank80_stmn_tseresnet34.v3.0.sh | 2 +- hyperion/np/classifiers/svmc.py | 138 +++++++++--------- hyperion/np/np_model.py | 5 + hyperion/torch/layer_blocks/res2net_blocks.py | 3 - hyperion/torch/trainers/xvector_trainer.py | 29 ---- 17 files changed, 608 insertions(+), 218 deletions(-) create mode 100644 egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..01cfa082 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -0,0 +1,102 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + multilayer: true + multilayer_concat: true + endpoint_channels: 8192 + dropout_rate: 0.0 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: 
exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 35000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..24b1c081 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index c8732c36..1b7c3764 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,72 +9,34 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 lr=0.02 nnet_type=resnet1d -block_type=seres2bn # squeeze-excitation res2net bottleneck -channels=2048 -ep_channels=8192 -width_factor=1 -scale=8 -se_r=16 dropout=0 -attstats_inner=128 embed_dim=256 s=30 margin_warmup=20 margin=0.3 -nnet_opt="--resnet_enc.in-feats 80 \ - --resnet_enc.in-conv-channels $channels \ - --resnet_enc.in-kernel-size 5 \ - --resnet_enc.in-stride 1 \ - --resnet_enc.resb-type $block_type \ - --resnet_enc.resb-repeats 1 1 1 1 \ - --resnet_enc.resb-channels $channels \ - --resnet_enc.resb-kernel-sizes 3 \ - --resnet_enc.resb-dilations 2 3 4 5 \ - --resnet_enc.resb-strides 1 \ - --resnet_enc.res2net-width-factor $width_factor \ - --resnet_enc.res2net-scale $scale \ - --resnet_enc.se-r $se_r \ - --resnet_enc.multilayer \ - --resnet_enc.multilayer-concat \ - --resnet_enc.endpoint-channels $ep_channels \ - --pool_net.pool-type ch-wise-att-mean+stddev \ - 
--pool_net.inner-feats $attstats_inner \ - --embed-dim $embed_dim" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 35000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - +nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml nnet_name=${feat_type}_ecapatdnn2048x4_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_num_epochs=75 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth - +nnet=$nnet_dir/model_ep0004.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=15 -ft_ipe=1 ft_lr=0.01 ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth @@ -88,7 +50,4 @@ else plda_data=voxceleb2cat_train_augx${plda_num_augs} fi plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl b/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl index 27b1f152..18b6d40c 100755 --- a/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl +++ b/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl @@ -31,7 +31,7 @@ my $meta_path = "$data_base/vox1_meta.csv"; if (! -e "$meta_path") { $meta_path = "$out_dir/vox1_meta.csv"; - system("wget -O $meta_path $meta_url"); + system("wget --no-check-certificate -O $meta_path $meta_url"); } open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path"; @@ -53,7 +53,7 @@ my $lid_path = "$data_base/lang_vox1_final.csv"; if (! -e "$lid_path") { $lid_path = "$out_dir/lang_vox1_final.csv"; - system("wget -O $lid_path $lid_url"); + system("wget --no-check-certificate -O $lid_path $lid_url"); } open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; my %utt2lang = (); diff --git a/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh b/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh index f7aa7828..08f655ea 100755 --- a/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh +++ b/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh @@ -9,7 +9,6 @@ set -e nodes=fs01 storage_name=$(date +'%m_%d_%H_%M') vaddir=`pwd`/exp/vad_e -vad_config=conf/vad_16k.yaml stage=1 config_file=default_config.sh @@ -75,41 +74,3 @@ if [ $stage -le 3 ];then done fi -# #Enroll multi-speaker Datasets with time marks -# if [ $stage -le 3 ];then -# for name in sre18_dev_enroll_vast sre18_eval_enroll_vast sre19_av_a_dev_enroll sre19_av_a_eval_enroll -# do -# num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') -# nj=$(($num_spk < 40 ? 
$num_spk:40)) -# # we just run energy vad to get the utt2num_frames file -# hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ -# --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ -# data/${name} exp/make_vad/$name $vaddir -# utils/fix_data_dir.sh data/${name} -# local/sre18_diar_to_vad.sh data/${name} exp/make_vad $vaddir -# utils/fix_data_dir.sh data/${name} -# done -# fi - -# #Dihard Datasets -# if [ $stage -le 4 ];then -# for name in dihard2_train_dev dihard2_train_eval -# do -# num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') -# nj=$(($num_spk < 40 ? $num_spk:40)) -# # we just run energy vad to get the utt2num_frames file -# hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ -# --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ -# data/${name} exp/make_vad/$name $vaddir -# hyp_utils/rttm_to_bin_vad.sh --nj 5 data/$name/vad.rttm data/$name $vaddir -# utils/fix_data_dir.sh data/${name} -# done - -# fi - -# if [ $stage -le 5 ];then -# utils/combine_data.sh --extra-files "utt2num_frames" data/dihard2_train data/dihard2_train_dev data/dihard2_train_eval -# utils/fix_data_dir.sh data/dihard2_train -# fi - - diff --git a/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh b/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh index 0608929c..7f405952 100755 --- a/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh +++ b/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh @@ -10,28 +10,67 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 +num_workers="" . parse_options.sh || exit 1; . $config_file . datapath.sh -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/${nnet_data}_proc_audio_no_sil args="" -if [ "$resume" == "true" ];then - args="--resume" +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$interactive" == "true" ];then export cuda_cmd=run.pl fi +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $nnet_type \ + --cfg $nnet_base_cfg $nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir \ + --num-gpus $ngpu \ + +fi + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + mkdir -p $ft_nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $ft_nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_xvector_from_wav.py $nnet_type \ + --cfg $ft_nnet_base_cfg $ft_nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + 
--in-model-file $nnet \ + --trainer.exp-path $ft_nnet_dir \ + --num-gpus $ngpu \ + +fi +exit + # Network Training if [ $stage -le 1 ]; then diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md index 23e0a26f..73b9bb4e 100644 --- a/egs/voxceleb/v1.1/README.md +++ b/egs/voxceleb/v1.1/README.md @@ -104,12 +104,12 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.48 | 0.077 | | | | | Cosine + AS-Norm | 0.68 | 0.040 | 0.062| | | | | Cosine + QMF | 0.62 | 0.036 | 0.063 | -| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | -| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.053 | 0.082 | +| | | | Cosine + AS-Norm | 0.70 | 0.043 | 0.076 | +| | | | Cosine + QMF | 0.63 | 0.042 | 0.071 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.051 | 0.095 | +| | | | Cosine + AS-Norm | 0.72 | 0.046 | 0.070 | +| | | | Cosine + QMF | 0.67 | 0.039 | 0.074 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.040 | 0.065 | | | | | Cosine + AS-Norm | 0.52 | 0.33 | 0.045 | | | | | Cosine + QMF | 0.45 | 0.027 | 0.043 | @@ -134,16 +134,18 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.83 | 0.053 | 0.098 | | | | | Cosine + AS-Norm | 0.78 | 0.047| 0.085 | | | | | Cosine + QMF | 0.74 | 0.045 | 0.081 | -| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | -| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.91 | 0.057 | 0.100 | +| | | | Cosine + AS-Norm | 0.85 | 0.052 | 0.089 | +| | | | Cosine + QMF | 0.81 | 0.049 | 0.085 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.94 | 0.059 | 0.105 | +| | | | Cosine + AS-Norm | 0.88 | 0.053 | 0.093 | +| | | | Cosine + QMF | 0.84 | 0.051 | 0.088 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.71 | 0.044 | 0.076| | | | | Cosine + AS-Norm | 0.66 | 0.040 | 0.069 | | | | | Cosine + QMF | 0.63 | 0.037 | 0.067 | + + ### VoxCeleb 1 Hard-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | @@ -163,16 +165,18 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.59 | 0.096 | 0.165 | | | | | Cosine + AS-Norm | 1.41 | 0.083 | 0.143 | | | | | Cosine + QMF | 1.34 | 0.079 | 0.136 | -| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | -| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.75 | 0.104 | 0.171 | +| | | | Cosine + AS-Norm | 1.56 | 0.091 | 0.152 | +| | | | Cosine + QMF | 1.50 | 0.087 | 0.145 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.76 | 0.104 | 0.174 | +| | | | Cosine + AS-Norm | 1.58 | 0.092 | 0.152 | +| | | | Cosine + QMF | 1.51 | 0.089 | 0.149 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.30 | 0.076 | 0.125 | | | | | Cosine + AS-Norm | 1.15 | 0.066 | 0.109 | | | | | Cosine + QMF | 1.11 | 0.065 | 0.103 | + + ### VoxSRC2022 dev | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | @@ -192,12 +196,12 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.239 | | | | | Cosine + AS-Norm | 1.99 | 0.127 | 0.232 | | | | | Cosine + QMF | 1.87 | 0.119 | 0.216 | -| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | -| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.36 | 0.153 | 0.259 | +| | | | Cosine + AS-Norm | 2.18 | 0.139 | 0.249 | +| | | | Cosine + QMF | 2.08 | 0.128 | 0.222 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.49 | 0.158 | 0.265 | +| | | | Cosine + AS-Norm | 2.29 | 0.145 | 0.251 | +| | | | Cosine + QMF | 2.17 | 0.133 | 0.230 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.92 | 0.124 | 0.208 | | | | | Cosine + AS-Norm | 1.71 | 0.109 | 0.212 | | | | | Cosine + QMF | 1.62 | 0.103 | 0.192 | diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..5dda7913 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..469e166b --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml 
b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..40fb362e --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..469e166b --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml index 1d864080..31dcaf9a 100644 --- a/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml @@ -47,7 +47,7 @@ model: dropout_rate: 0.1 
norm_before: false hid_act: swish - se_r: 128 + se_r: 256 trainer: optim: opt_type: adam @@ -67,5 +67,5 @@ trainer: grad_clip: 250 use_amp: true log_interval: 1000 - epochs: 35 + epochs: 25 eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh index 42af2d52..00622772 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh @@ -17,7 +17,7 @@ nnet_name=${feat_type}_tseresnet34.v3.0 nnet_s1_base_cfg=conf/train_tseresnet34_xvec_stage1_v3.0.yaml nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0035.pth +nnet_s1=$nnet_s1_dir/model_ep0025.pth nnet_s2_base_cfg=conf/train_tseresnet34_xvec_stage2_v3.0.yaml nnet_s2_name=${nnet_name}.s2 diff --git a/hyperion/np/classifiers/svmc.py b/hyperion/np/classifiers/svmc.py index 9311b8e8..6b54034b 100644 --- a/hyperion/np/classifiers/svmc.py +++ b/hyperion/np/classifiers/svmc.py @@ -9,20 +9,24 @@ import numpy as np from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from sklearn.svm import SVC as SVC +from sklearn.svm import SVC from ...hyp_defs import float_cpu from ...utils.math import softmax +from ...utils.misc import filter_func_args from ..np_model import NPModel -class GaussianSVMC(NPModel): +class SVMC(NPModel): """Gaussian Support Vector Machine for Classification.""" def __init__( self, C=1.0, + kernel="rbf", + degree=3, gamma="scale", + coef0=0.0, shrinking=True, probability=True, tol=0.0001, @@ -32,7 +36,6 @@ def __init__( class_weight=None, random_state=None, max_iter=100, - model=None, verbose=0, balance_class_weight=True, lr_seed=1024, @@ -48,25 +51,38 @@ def __init__( if random_state is None: random_state = np.random.RandomState(seed=lr_seed) + self.C = C + self.kernel = kernel + self.degree = degree + self.gamma = gamma + self.coef0 = coef0 + self.shrinking = shrinking + self.probability = probability + self.tol = tol + self.cache_size = cache_size + self.multi_class = multi_class + self.break_ties = break_ties + self.class_weight = class_weight + self.balance_class_weight = balance_class_weight - if model is None: - self.svm = SVC( - C=C, - kernel="rbf", - gamma=gamma, - shrinking=shrinking, - probability=probability, - tol=tol, - cache_size=cache_size, - class_weight=class_weight, - verbose=verbose, - max_iter=max_iter, - decision_function_shape=multi_class, - break_ties=break_ties, - random_state=random_state, - ) - else: - self.svm = model + self.svm = SVC( + C=C, + kernel=kernel, + gamma=gamma, + degree=degree, + coef0=coef0, + shrinking=shrinking, + probability=probability, + tol=tol, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=multi_class, + break_ties=break_ties, + random_state=random_state, + ) + self.set_labels(labels) @property @@ -84,6 +100,18 @@ def get_config(self): Dictionary with config hyperparams. 
""" config = { + "C": self.C, + "kernel": self.kernel, + "gamma": self.gamma, + "degree": self.degree, + "coef0": self.coef0, + "shrinking": self.shrinking, + "probability": self.probability, + "tol": self.tol, + "cache_size": self.cache_size, + "multi_class": self.multi_class, + "break_ties": self.break_ties, + "class_weight": self.class_weight, "balance_class_weight": self.balance_class_weight, "labels": self.labels, } @@ -135,7 +163,6 @@ def fit(self, x, class_ids, sample_weight=None): class_ids: class integer [0, num_classes-1] identifier (num_samples,) sample_weight: weight of each sample in the estimation (num_samples,) """ - print("--------------", type(x[3, 2]), type(class_ids[20]), "--------------") self.svm.fit(x, class_ids) if self.svm.fit_status_: logging.warning("SVM did not converge") @@ -153,9 +180,6 @@ def save(self, file_path): if not split_path[-1] == "sav": file_path = "".join(split_path[0] + ".sav") with open(file_path, "wb") as f: - # with h5py.File(file_path, "w") as f: - # config = self.to_json() - # f.create_dataset("config", data=np.array(config, dtype="S")) self.save_params(f) @classmethod @@ -169,27 +193,17 @@ def load(cls, file_path): Model object. """ split_path = os.path.splitext(file_path) - if not split_path[-1] == "sav": - file_path = "".join(split_path[0] + ".sav") + if not split_path[-1] == "pkl": + file_path = "".join(split_path[0] + ".pkl") - # with h5py.File(file_path, "r") as f: with open(file_path, "rb") as f: - # json_str = str(np.asarray(f["config"]).astype("U")) - # config = cls.load_config_from_json(json_str) - config = None - return cls.load_params(f, config) + return pickle.load(f) def save_params(self, f): - # params = {"A": self.A, "b": self.b} - # self._save_params_from_dict(f, params) pickle.dump(self, f) @classmethod - def load_params(cls, f, config): - # param_list = ["A", "b"] - # params = cls._load_params_to_dict(f, config["name"], param_list) - # kwargs = dict(list(config.items()) + list(params.items())) - # return cls(**kwargs) + def load_params(cls, f): svmc = pickle.load(f) return svmc @@ -200,27 +214,7 @@ def filter_class_args(**kwargs): Returns: Hyperparamter dictionary to initialize the class. 
""" - valid_args = ( - "nu", - "gamma", - "shrinking", - "probability", - "tol", - "cache_size", - "multi_class", - "break_ties", - "class_weight", - "random_state", - "max_iter", - "verbose", - "balance_class_weight", - "lr_seed", - "model", - "labels", - ) - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - filter_train_args = filter_class_args + return filter_func_args(SVMC.__init__, **kwargs) @staticmethod def add_class_args(parser, prefix=None): @@ -240,17 +234,27 @@ def add_class_args(parser, prefix=None): type=float, help="inverse of regularization strength", ) - # parser.add_argument( - # "--class_weight", - # default=None, - # help="Class weights", - # ) + parser.add_argument( + "--kernel", + default="rbf", + choices=["linear", "poly", "rbf", "sigmoid", "precomputed"], + help="kernel for svm", + ) + parser.add_argument( + "--degree", defaut=3, type=int, help="degree of polynomial kernel" + ) parser.add_argument( "--gamma", default="scale", choices=["scale", "auto"], help="Kernel coefficient for ‘rbf’", ) + parser.add_argument( + "--coef0", + default=0.0, + type=float, + help="independent term of poly and sigmoid kernels", + ) parser.add_argument( "--shrinking", default=True, @@ -264,7 +268,7 @@ def add_class_args(parser, prefix=None): help="Whether to enable probability estimates", ) parser.add_argument( - "--break_ties", + "--break-ties", default=True, type=bool, help="If true, predict will break ties according to the confidence values of decision_function; otherwise \ @@ -293,7 +297,7 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - "--cache_size", + "--cache-size", default=600, type=int, help="Specify the size of the kernel cache (in MB)", diff --git a/hyperion/np/np_model.py b/hyperion/np/np_model.py index ee464161..aa635fc5 100644 --- a/hyperion/np/np_model.py +++ b/hyperion/np/np_model.py @@ -99,6 +99,8 @@ def _save_params_from_dict(self, f, params, dtypes=None): """ if dtypes is None: dtypes = dict((k, float_save()) for k in params) + elif isinstance(dtypes, type): + dtypes = dict((k, dtypes) for k in params) if self.name is None: prefix = "" @@ -174,6 +176,9 @@ def _load_params_to_dict(f, name, params, dtypes=None): """ if dtypes is None: dtypes = dict((k, float_cpu()) for k in params) + elif isinstance(dtypes, type): + dtypes = dict((k, dtypes) for k in params) + if name is None: prefix = "" else: diff --git a/hyperion/torch/layer_blocks/res2net_blocks.py b/hyperion/torch/layer_blocks/res2net_blocks.py index 73255a24..8de700c4 100644 --- a/hyperion/torch/layer_blocks/res2net_blocks.py +++ b/hyperion/torch/layer_blocks/res2net_blocks.py @@ -410,9 +410,6 @@ def forward(self, x, x_mask=None): x += residual - if not self.norm_before: - x = self.bn3(x) - if self.dropout_rate > 0: x = self.dropout(x) diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index a9a9d98f..eddf47a7 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -88,35 +88,6 @@ def __init__( super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) - # super().__init__( - # model, - # loss, - # optim, - # epochs, - # exp_path, - # cur_epoch=cur_epoch, - # grad_acc_steps=grad_acc_steps, - # eff_batch_size=eff_batch_size, - # device=device, - # metrics=metrics, - # lrsched=lrsched, - # loggers=loggers, - # ddp=ddp, - # ddp_type=ddp_type, - # train_mode=train_mode, - # use_amp=use_amp, - # log_interval=log_interval, - # 
use_tensorboard=use_tensorboard, - # use_wandb=use_wandb, - # wandb=wandb, - # grad_clip=grad_clip, - # grad_clip_norm=grad_clip_norm, - # swa_start=swa_start, - # swa_lr=swa_lr, - # swa_anneal_epochs=swa_anneal_epochs, - # cpu_offload=cpu_offload, - # ) - @record def train_epoch(self, data_loader): """Training epoch loop From 9c28408d396340a4eb59086bcf62197b9887f900 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 13 May 2023 02:29:16 +0000 Subject: [PATCH 20/89] update lid configs and np.str to str --- ...c2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml | 65 ++++++++++++++++++ ...c2xlsr300m_ecapadnn1024x3_stage1_v4.1.yaml | 65 ++++++++++++++++++ ...c2xlsr300m_ecapatdnn512x3_stage1_v3.0.yaml | 67 +++++++++++++++++++ ...ec2xlsr300m_ecapatdnn1024x3_subcenter.yaml | 44 ++++++++++++ ...r300m_ecapatdnn1024x3_subcenter_do0.2.yaml | 44 ++++++++++++ .../v1/global_conf/config_lid_v3.0_13langs.sh | 44 ++++++++++++ .../v1/global_conf/config_lid_v4.0_13langs.sh | 44 ++++++++++++ .../v1/global_conf/config_lid_v4.1_13langs.sh | 44 ++++++++++++ egs/commonvoice/v1/run_011_train_asr.sh | 9 ++- hyperion/torch/data/audio_dataset.py | 2 +- .../data/class_weighted_seg_chunk_sampler.py | 4 +- .../torch/narchs/rnn_transducer_decoder.py | 20 ++++-- hyperion/utils/class_info.py | 2 +- hyperion/utils/info_table.py | 4 +- hyperion/utils/scp_list.py | 2 +- hyperion/utils/utt2info.py | 2 +- 16 files changed, 449 insertions(+), 13 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.1.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v3.0.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v3.0_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml new file mode 100644 index 00000000..56e08794 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml +trainer: + optim: + opt_type: sgd + lr: 0.15 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + 
#decay_steps: 4200 + #hold_steps: 1500 + decay_steps: 16000 + hold_steps: 32000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.1.yaml new file mode 100644 index 00000000..cf1a549f --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.1.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 4200 + #hold_steps: 1500 + decay_steps: 16000 + hold_steps: 32000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v3.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v3.0.yaml new file mode 100644 index 00000000..d409fb47 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v3.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + drop_last: false + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + drop_last: false + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 30000 + hold_steps: 16000 + min_lr: 4e-5 + warmup_steps: 3000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 
1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml new file mode 100644 index 00000000..27132c2d --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml new file mode 100644 index 00000000..63c914e3 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. 
+ dropout_rate: 0.2 + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_lid_v3.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v3.0_13langs.sh new file mode 100644 index 00000000..40516709 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v3.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v3.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v3.0_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0014.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v3.0.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v3.0_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh new file mode 100644 index 00000000..e6c3afda --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v4.0_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0014.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v4.0.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v4.0_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml 
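Note on the sampler settings used by the LID configs above: with weight_mode "data-prior" and weight_exponent 0.3, each language is drawn in proportion to a tempered version of its total training duration. A minimal numpy sketch of that weighting; the language names and durations below are made up for illustration, and the real sampler reads total_duration from the class-info table:

    import numpy as np

    # hypothetical total training duration per language, in hours
    langs = ["en", "de", "fr", "it", "ca"]
    dur = np.array([1500.0, 800.0, 700.0, 250.0, 100.0])

    weight_exponent = 0.3       # as in the configs above
    w = dur ** weight_exponent  # temper the data prior
    w /= w.sum()                # normalize to a sampling distribution

    for lang, p in zip(langs, w):
        print(f"{lang}: {p:.3f}")
    # an exponent below 1 flattens the distribution, so low-resource languages
    # are sampled more often than their raw share of the data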
+nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh new file mode 100644 index 00000000..7d0ed120 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v4.1_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0014.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v4.1.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v4.1_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/run_011_train_asr.sh b/egs/commonvoice/v1/run_011_train_asr.sh index e79de7af..284a68f5 100755 --- a/egs/commonvoice/v1/run_011_train_asr.sh +++ b/egs/commonvoice/v1/run_011_train_asr.sh @@ -18,7 +18,7 @@ set -e #export CONV_RSH=ssh #export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH - +export CUDA_VISIBLE_DEVICES=0,1 stage=1 ngpu=2 config_file=default_config.sh @@ -89,19 +89,24 @@ if [ $stage -le 2 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - finetune_wav2vec2transducer.py $nnet_type \ + finetune_wav2vec2rnn_transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ --in-model-file $nnet_s1 \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1236 \ --num-gpus $ngpu fi diff --git a/hyperion/torch/data/audio_dataset.py 
b/hyperion/torch/data/audio_dataset.py index 2b1f1cf6..9ffb964d 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -80,7 +80,7 @@ def __init__( time_durs = SegmentSet.load(time_durs_file) self.seg_set["duration"] = time_durs.loc[ - self.seg_set["id"]].class_id.values.astype(np.float, + self.seg_set["id"]].class_id.values.astype(float, copy=False) else: assert "duration" in self.seg_set diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index b551f342..8ec63b6f 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -204,7 +204,8 @@ def _set_class_weights(self): self.class_info.set_uniform_weights() elif self.weight_mode == "data-prior": weights = self.class_info["total_duration"].values - self.class_info.set_weights(self, weights) + logging.info(weights) + self.class_info.set_weights(weights) if self.weight_exponent != 1.0: self.class_info.exp_weights(self.weight_exponent) @@ -216,6 +217,7 @@ def _set_class_weights(self): self.var_weights = np.any( self.seg_set[self.length_name] < self.max_chunk_length ) + logging.info(f'updated weight:{self.class_info["weights"]}') @property def hard_prototype_mining(self): diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py index bf9189ee..efc11113 100644 --- a/hyperion/torch/narchs/rnn_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_transducer_decoder.py @@ -4,13 +4,14 @@ """ from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +import logging +from typing import Dict, List, Optional, Tuple, Union import torch import torch.nn as nn import torchaudio import torchaudio.functional -from jsonargparse import ActionParser, ArgumentParser +from jsonargparse import ActionParser, ArgumentParser, ActionYesNo try: import k2 @@ -74,6 +75,7 @@ def __init__( am_scale: float = 0.0, simple_loss_scale: float = 0.5, pruned_warmup_steps: int = 2000, + # film: bool=False, ): super().__init__() @@ -615,10 +617,13 @@ def change_config( override_dropouts=False, embed_dropout_rate: float = 0.0, rnn_dropout_rate: float = 0.0, + prune_range: Optional[int] = None, ): logging.info("changing decoder config") self.predictor.change_config(override_dropouts, embed_dropout_rate, rnn_dropout_rate) + if prune_range is not None: + self.prune_range = prune_range @staticmethod def filter_args(**kwargs): @@ -751,8 +756,8 @@ def add_class_args(parser, help="""type of reduction for rnn-t loss between sum or mean""") parser.add_argument( "--prune-range", - default=5, - type=int, + default=None, + type=Optional[int], help="""how many symbols to keep for each frame in k2 rnn-t pruned loss.""") parser.add_argument( @@ -804,6 +809,13 @@ def add_finetune_args(parser, prefix=None, skip=set()): type=float, help=("dropout prob for decoder RNN ")) + parser.add_argument( + "--prune-range", + default=5, + type=int, + help="""how many symbols to keep for each frame in k2 rnn-t + pruned loss.""") + if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py index 9e158d87..4e10fac2 100644 --- a/hyperion/utils/class_info.py +++ b/hyperion/utils/class_info.py @@ -66,7 +66,7 @@ def load(cls, file_path, sep=None): if ext == "": # if no extension we load as kaldi utt2spk file df = pd.read_csv( - file_path, sep=" 
", header=None, names=["id"], dtype={"id": np.str}, + file_path, sep=" ", header=None, names=["id"], dtype={"id": str}, ) return cls(df) diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index f76ba6af..5db7393e 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -138,8 +138,8 @@ def load(cls, file_path, sep=None, name="class_id"): header=None, names=["id", name], dtype={ - "id": np.str, - name: np.str + "id": str, + name: str }, ) else: diff --git a/hyperion/utils/scp_list.py b/hyperion/utils/scp_list.py index 5abf76f2..537102b4 100644 --- a/hyperion/utils/scp_list.py +++ b/hyperion/utils/scp_list.py @@ -36,7 +36,7 @@ def __init__(self, key, file_path, offset=None, range_spec=None): def validate(self): """Validates the attributes of the SCPList object.""" self.key = list2ndarray(self.key) - self.file_path = list2ndarray(self.file_path, dtype=np.object) + self.file_path = list2ndarray(self.file_path, dtype=object) assert len(self.key) == len(self.file_path) if self.offset is not None: if isinstance(self.offset, list): diff --git a/hyperion/utils/utt2info.py b/hyperion/utils/utt2info.py index edf2c23a..e0d27e64 100644 --- a/hyperion/utils/utt2info.py +++ b/hyperion/utils/utt2info.py @@ -142,7 +142,7 @@ def save(self, file_path, sep=" "): self.utt_info.to_csv(file_path, sep=sep, header=False, index=False) @classmethod - def load(cls, file_path, sep=" ", dtype={0: np.str_, 1: np.str_}): + def load(cls, file_path, sep=" ", dtype={0: str, 1: str}): """Loads utt2info list from text file. Args: From 7f43376d4976c00b885a66463475714beb90053e Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sat, 13 May 2023 14:33:07 -0400 Subject: [PATCH 21/89] FiLM transducer --- ...2base_rnnt_film_k2_pruned_stage1_v1.0.yaml | 86 ++ ...g_pruned_filmed_transducer_v1.0_13langs.sh | 50 ++ egs/commonvoice/v1/run_015_train_film_asr.sh | 142 +++ .../bin/train_wav2vec2rnn_film_transducer.py | 278 ++++++ hyperion/torch/layer_blocks/__init__.py | 5 + hyperion/torch/layer_blocks/film_blocks.py | 88 ++ .../layer_blocks/transducer_film_joiner.py | 79 ++ .../layer_blocks/transducer_film_predictor.py | 128 +++ hyperion/torch/models/__init__.py | 3 +- hyperion/torch/models/transducer/__init__.py | 1 + .../models/transducer/rnn_film_transducer.py | 255 ++++++ .../torch/models/wav2transducer/__init__.py | 1 + .../hf_wav2rnn_film_transducer.py | 372 ++++++++ .../hf_wav2vec2rnn_film_transducer.py | 145 +++ .../hf_wav2rnn_transducer_languageid.py | 122 +-- hyperion/torch/narchs/__init__.py | 1 + .../narchs/rnn_film_transducer_decoder.py | 843 ++++++++++++++++++ 17 files changed, 2542 insertions(+), 57 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh create mode 100755 egs/commonvoice/v1/run_015_train_film_asr.sh create mode 100755 hyperion/bin/train_wav2vec2rnn_film_transducer.py create mode 100644 hyperion/torch/layer_blocks/film_blocks.py create mode 100644 hyperion/torch/layer_blocks/transducer_film_joiner.py create mode 100644 hyperion/torch/layer_blocks/transducer_film_predictor.py create mode 100644 hyperion/torch/models/transducer/rnn_film_transducer.py create mode 100644 hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py create mode 100644 hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py create mode 100644 hyperion/torch/narchs/rnn_film_transducer_decoder.py diff --git 
a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml new file mode 100644 index 00000000..7e059b3b --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml @@ -0,0 +1,86 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh new file mode 100644 index 00000000..1fc49fdd --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh @@ -0,0 +1,50 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + +nnet_transducer_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe +nnet_s2_transducer_name=$nnet_transducer_name.s2 +nnet_s2_transducer_dir=exp/transducer_nnets/$nnet_s2_transducer_name 
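The trainer blocks in these configs share the same exp_lr fields (warmup_steps, hold_steps, decay_steps, decay_rate, min_lr). The sketch below is one plausible reading of those fields, with linear warmup, a hold phase, and exponential decay floored at min_lr; the authoritative behavior is defined by hyperion's scheduler implementation, which is not part of this patch, so treat this only as an illustration:

    def approx_lr(step, base_lr=0.005, warmup_steps=6000, hold_steps=30000,
                  decay_steps=45000, decay_rate=0.5, min_lr=4e-5):
        """Illustrative reading of the exp_lr fields used in the configs above."""
        if step < warmup_steps:
            return base_lr * step / warmup_steps       # linear warmup
        if step < warmup_steps + hold_steps:
            return base_lr                             # hold at the base lr
        n = (step - warmup_steps - hold_steps) / decay_steps
        return max(base_lr * decay_rate ** n, min_lr)  # exponential decay with a floor

    for s in [0, 6000, 30000, 80000, 200000]:
        print(s, f"{approx_lr(s):.2e}")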
+nnet_rnn_transducer=$nnet_s2_transducer_dir/model_ep0010.pth + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v1.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0016.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/run_015_train_film_asr.sh b/egs/commonvoice/v1/run_015_train_film_asr.sh new file mode 100755 index 00000000..ba1197a8 --- /dev/null +++ b/egs/commonvoice/v1/run_015_train_film_asr.sh @@ -0,0 +1,142 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +# export CUDA_VISIBLE_DEVICES=0 + +#ml purge +#module load namd/2.14-cuda-smp +#module load cuda/11.6.0 +#ml +#nvidia-smi +#export CUDA_VISIBLE_DEVICES=0,1,2,3 +#export CONV_RSH=ssh +#export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH + +# export CUDA_VISIBLE_DEVICES=0,1 +stage=1 +ngpu=1 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +train_dir=data/${nnet_data}/ +val_dir=data/${dev_data}/ + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" + extra_args="--data.val.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ + train_wav2vec2rnn_film_transducer.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --in-model-file $nnet_rnn_transducer \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + 
extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2rnn_transducer.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s2_dir $args \ + --in-model-file $nnet_s1 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + # --master-port 1236 \ + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + + .py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s3_dir $args \ + --in-model-file $nnet_s2 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu +fi + diff --git a/hyperion/bin/train_wav2vec2rnn_film_transducer.py b/hyperion/bin/train_wav2vec2rnn_film_transducer.py new file mode 100755 index 00000000..0239820f --- /dev/null +++ b/hyperion/bin/train_wav2vec2rnn_film_transducer.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2RNNFiLMTransducer) +from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + +model_dict = { + "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, + "hf_wav2vec2rnn_filmed_transducer": HFWav2Vec2RNNFiLMTransducer, + "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, + # "hf_hubert2rnn_transducer": HFWav2Vec2RNNTransducer, + # 
"hf_hubert2rnn_rnn_transducer": Hubert2RNNRNNTransducer, + # "hf_wavlm2rnn_transducer": HFHubert2RNNTransducer, + # "hf_wavlm2rnn_rnn_transducer": HFWavLM2RNNRNNTransducer, +} + + +def transducer_collate(batch): + audio = [] + audio_length = [] + target = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + # FiLM: add language ID to the input + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + "language": language, + } + return batch + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=sampler, + **largs, + collate_fn=transducer_collate) + return data_loader + + +# def init_model_from_transducer(in_model_file, rank, model_class, **kwargs): +# model_args = model_class.filter_finetune_args(**kwargs["model"]) +# # model_args = model_class.filter_args(**kwargs["model"]) +# if rank == 0: +# logging.info("model network ft args={}".format(model_args)) +# model = TML.load(in_model_file) +# model.change_config(**model_args) +# if rank == 0: +# logging.info("model={}".format(model)) +# return model + + +def init_model(blank_id, vocab_size, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + # TODO: check model_args + model_args["transducer"]["decoder"]["blank_id"] = blank_id + model_args["transducer"]["decoder"]["vocab_size"] = vocab_size + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + #torch.backends.cudnn.deterministic = True + #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.enabled = False + + # ddp_args = ddp.filter_ddp_args(**kwargs) + # device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + # kwargs["rank"] = rank + + # for Debug + rank = 0 + kwargs["rank"] = 0 + device = "cpu" + 
world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + # model = init_model_from_transducer(**kwargs) + model = init_model(train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} #{"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + parser.link_arguments("data.train.dataset.bpe_model", + "data.val.dataset.bpe_model") + + parser.add_argument("--in-model-file", required=True) + model_class.add_class_args(parser, prefix="model") + + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Train Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/torch/layer_blocks/__init__.py b/hyperion/torch/layer_blocks/__init__.py index 0487ae4f..6e2f1eb9 100644 --- a/hyperion/torch/layer_blocks/__init__.py +++ 
b/hyperion/torch/layer_blocks/__init__.py @@ -9,6 +9,7 @@ from .dc2d_blocks import DC2dDecBlock, DC2dEncBlock from .etdnn_blocks import ETDNNBlock from .fc_blocks import FCBlock +from .film_blocks import FiLM, LSTMWithFiLM, initialize_lstm_with_film from .mbconv_blocks import MBConvBlock, MBConvInOutBlock from .res2net1d_blocks import Res2Net1dBasicBlock, Res2Net1dBNBlock from .res2net2d_blocks import Res2Net2dBasicBlock, Res2Net2dBNBlock @@ -32,6 +33,10 @@ from .tdnn_blocks import TDNNBlock from .transducer_joiner import TransducerJoiner from .transducer_predictor import TransducerRNNPredictor, TransducerConvPredictor + +from .transducer_film_joiner import TransducerFiLMJoiner +from .transducer_film_predictor import TransducerRNNFiLMPredictor + from .transformer_conv2d_subsampler import TransformerConv2dSubsampler from .transformer_encoder_v1 import TransformerEncoderBlockV1 from .transformer_feedforward import (Conv1dLinear, Conv1dx2, diff --git a/hyperion/torch/layer_blocks/film_blocks.py b/hyperion/torch/layer_blocks/film_blocks.py new file mode 100644 index 00000000..8370a42b --- /dev/null +++ b/hyperion/torch/layer_blocks/film_blocks.py @@ -0,0 +1,88 @@ +import torch +import torch.nn as nn + +class FiLM(nn.Module): + def __init__(self, input_size, condition_size): + # condition_size: the size of the language id vector + # input_size: the size of the RNN input to the FiLM layer + super(FiLM, self).__init__() + self.linear_scale = nn.Linear(condition_size, input_size) + self.linear_shift = nn.Linear(condition_size, input_size) + + def forward(self, x, condition): + gamma = self.linear_scale(condition).unsqueeze(2).expand_as(x) + beta = self.linear_shift(condition).unsqueeze(2).expand_as(x) + x = x * gamma + beta + return x + + + +class LSTMWithFiLM(nn.Module): + def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True): + super(LSTMWithFiLM, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dropout = dropout + self.batch_first = batch_first + + self.lstms = nn.ModuleList([nn.LSTM(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) + self.films = nn.ModuleList([FiLM(hidden_size, condition_size) for _ in range(num_layers)]) + self.dropout_layer = nn.Dropout(dropout) + + def forward(self, x, states, condition): + outputs = [] + h, c = states + new_h, new_c = [], [] + for i, (lstm, film) in enumerate(zip(self.lstms, self.films)): + x, (h_i, c_i) = lstm(x, (h[i].unsqueeze(0), c[i].unsqueeze(0))) + x = film(x, condition) + new_h.append(h_i) + new_c.append(c_i) + if i != self.num_layers - 1: + x = self.dropout_layer(x) + outputs.append(x) + new_h = torch.cat(new_h, dim=0) + new_c = torch.cat(new_c, dim=0) + return torch.cat(outputs, dim=0), (new_h, new_c) + + + +def initialize_lstm_with_film(lstm_with_film, pretrained_dict): + # Load pretrained LSTM state_dict + pretrained_lstm = pretrained_dict['lstm'] + pretrained_num_layers = pretrained_dict['num_layers'] + + # Copy weights from pretrained LSTM layers to LSTMWithFiLM + for i, (lstm, film) in enumerate(zip(lstm_with_film.lstms, lstm_with_film.films)): + if i < pretrained_num_layers: + lstm.weight_ih_l0.data.copy_(pretrained_lstm['weight_ih_l' + str(i)]) + lstm.weight_hh_l0.data.copy_(pretrained_lstm['weight_hh_l' + str(i)]) + lstm.bias_ih_l0.data.copy_(pretrained_lstm['bias_ih_l' + str(i)]) + lstm.bias_hh_l0.data.copy_(pretrained_lstm['bias_hh_l' + str(i)]) + else: + # For extra 
layers in LSTMWithFiLM, just reset the weights + nn.init.xavier_uniform_(lstm.weight_ih_l0) + nn.init.orthogonal_(lstm.weight_hh_l0) + nn.init.zeros_(lstm.bias_ih_l0) + nn.init.zeros_(lstm.bias_hh_l0) + + +# def initialize_lstm_with_film(lstm_with_film, pretrained_lstm): +# # Copy weights from pretrained LSTM layers to LSTMWithFiLM +# for i, (lstm, film) in enumerate(zip(lstm_with_film.lstms, lstm_with_film.films)): +# if i < pretrained_lstm.num_layers: +# lstm.weight_ih_l0.data.copy_(pretrained_lstm.weight_ih_l[i]) +# lstm.weight_hh_l0.data.copy_(pretrained_lstm.weight_hh_l[i]) +# lstm.bias_ih_l0.data.copy_(pretrained_lstm.bias_ih_l[i]) +# lstm.bias_hh_l0.data.copy_(pretrained_lstm.bias_hh_l[i]) +# else: +# # For extra layers in LSTMWithFiLM, just reset the weights +# nn.init.xavier_uniform_(lstm.weight_ih_l0) +# nn.init.orthogonal_(lstm.weight_hh_l0) +# nn.init.zeros_(lstm.bias_ih_l0) +# nn.init.zeros_(lstm.bias_hh_l0) + + + + # rnn = LSTMWithFiLM(embed_dim, hid_feats, num_layers, rnn_dropout_rate, batch_first=True) \ No newline at end of file diff --git a/hyperion/torch/layer_blocks/transducer_film_joiner.py b/hyperion/torch/layer_blocks/transducer_film_joiner.py new file mode 100644 index 00000000..22875258 --- /dev/null +++ b/hyperion/torch/layer_blocks/transducer_film_joiner.py @@ -0,0 +1,79 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba, Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from .film_blocks import FiLM + + +class TransducerFiLMJoiner(nn.Module): + """ RNN-T Joiner network. + Implementation based on + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/transducer_stateless7/joiner.py + + Attributes: + in_feats: input feature dimension. + vocab_size: vocabulary size + """ + + def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: int, condition_size: int): + + super().__init__() + self.enc_feats = enc_feats + self.pred_feats = pred_feats + self.hid_feats = hid_feats + self.vocab_size = vocab_size + + self.enc_proj = nn.Linear(enc_feats, hid_feats) + self.pred_proj = nn.Linear(pred_feats, hid_feats) + self.output = nn.Linear(hid_feats, vocab_size) + + self.FiLM_encoder = FiLM(hid_feats, condition_size) + self.FiLM_joiner = FiLM(hid_feats, condition_size) + + def get_config(self): + config = { + "joiner_type": "basic", + "hid_feats": self.hid_feats, + } + return config + + def forward(self, + enc_out: torch.Tensor, + pred_out: torch.Tensor, + condition: torch.Tensor, + project_input: bool = True) -> torch.Tensor: + + """ + Args: + enc_out: output from the encoder with shape = (N, T, C) or (N, T, s_range, C) + pred_out: output from the predictor with shape = (N, U, C) or (N, T, s_range, C) + project_input: if True projects the encoder and predictor features + in the forward founction, if False it expects them outside. + Returns: + Symbols' logits of shape (N, T, U, C). 
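The FiLM blocks introduced in this patch (FiLM, LSTMWithFiLM, and the FiLM-conditioned joiner) all reduce to a feature-wise affine transform whose scale and shift are predicted from a condition vector, here a language embedding. Below is a self-contained sketch of the idea; the dimensions and tensor layout are illustrative only and are not the exact shapes the classes above expect:

    import torch
    import torch.nn as nn

    class FiLMSketch(nn.Module):
        """Feature-wise linear modulation: y = gamma(c) * x + beta(c)."""
        def __init__(self, feat_dim: int, condition_dim: int):
            super().__init__()
            self.to_gamma = nn.Linear(condition_dim, feat_dim)
            self.to_beta = nn.Linear(condition_dim, feat_dim)

        def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
            # x: (batch, time, feat_dim), c: (batch, condition_dim)
            gamma = self.to_gamma(c).unsqueeze(1)  # (batch, 1, feat_dim)
            beta = self.to_beta(c).unsqueeze(1)    # (batch, 1, feat_dim)
            return gamma * x + beta

    film = FiLMSketch(feat_dim=512, condition_dim=256)
    x = torch.randn(2, 50, 512)   # e.g. encoder or predictor activations
    c = torch.randn(2, 256)       # e.g. a per-utterance language embedding
    y = film(x, c)                # same shape as x, modulated per utterance
    print(y.shape)                # torch.Size([2, 50, 512])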
+ """ + assert enc_out.ndim == pred_out.ndim + assert enc_out.ndim in (3, 4) + + if enc_out.ndim == 3: + enc_out = enc_out.unsqueeze(2) # (N, T, 1, C) + pred_out = pred_out.unsqueeze(1) # (N, 1, U, C) + + enc_out = self.FiLM_encoder(enc_out, condition) + + if project_input: + x = self.enc_proj(enc_out) + self.pred_proj(pred_out) + else: + x = enc_out + pred_out + + x = self.FiLM_joiner(x, condition) + + x = torch.tanh(x) + logits = self.output(x) + return logits diff --git a/hyperion/torch/layer_blocks/transducer_film_predictor.py b/hyperion/torch/layer_blocks/transducer_film_predictor.py new file mode 100644 index 00000000..09fae3ec --- /dev/null +++ b/hyperion/torch/layer_blocks/transducer_film_predictor.py @@ -0,0 +1,128 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba, Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...utils.misc import filter_func_args +from ..layers import ActivationFactory as AF +from .film_blocks import FiLM, LSTMWithFiLM + +class TransducerRNNFiLMPredictor(nn.Module): + """ RNN-T prediction network with LSTM or GRU + Attributes: + vocab_size: Number of tokens of the modeling unit including blank. + embed_dim: Dimension of the input embedding. + num_layers: Number of LSTM layers. + hid_feats: Hidden dimension of LSTM layers. + out_feats: Output dimension of the predictor. + embed_dropout_rate: Dropout rate for the embedding layer. + rnn_dropout_rate: Dropout for LSTM layers. + rnn_type: between lstm and gru + blank_id: The ID of the blank symbol. + """ + + def __init__(self, + vocab_size: int, + embed_dim: int, + num_layers: int, + hid_feats: int, + condition_size: int, + out_feats: Optional[int] = None, + embed_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + rnn_type: str = "lstm", + blank_id: int = 0): + super().__init__() + self.embedding = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=embed_dim, + padding_idx=blank_id, + ) + self.embed_dropout = nn.Dropout(embed_dropout_rate) + if rnn_type == "lstm": + self.rnn = LSTMWithFiLM( + input_size=embed_dim, + hidden_size=hid_feats, + num_layers=num_layers, + dropout=rnn_dropout_rate, + condition_size=condition_size, + batch_first=True, + ) + else: + raise Exception(f"Unknown RNN type {rnn_type}") + + self.out_feats = out_feats + self.blank_id = blank_id + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.num_layers = num_layers + self.hid_feats = hid_feats + self.embed_dropout_rate = embed_dropout_rate + self.rnn_dropout_rate = rnn_dropout_rate + if out_feats is None: + out_feats = hid_feats + + self.out_feats = out_feats + if out_feats != hid_feats: + self.output_proj = nn.Linear(hid_feats, out_feats) + else: + self.output_proj = None + + def get_config(self): + config = { + "pred_type": "conv", + "vocab_size": self.vocab_size, + "embed_dim": self.embed_dim, + "num_layers": self.num_layers, + "hid_feats": self.hid_feats, + "out_feats": self.out_feats, + "embed_dropout_rate": self.embed_dropout_rate, + "rnn_dropout_rate": self.rnn_dropout_rate, + "rnn_type": self.rnn_type, + "blank_id": self.blank_id, + } + return config + + def forward( + self, + y: torch.Tensor, + condition: torch.Tensor, + states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + """ + Args: + y: previous y_{ prepended. 
+ states: tuple of tensors containing RNN layers states + Returns: + - rnn_output, a tensor of shape (N, U, C) + - (h, c), containing the states i for RNN layers with shape (num_layers, N, C). + """ + embed = self.embedding(y) + embed = self.embed_dropout(embed) + out, (h, c) = self.rnn(embed, states, condition) + if self.output_proj: + out = self.output_proj(out) + + return out, (h, c) + + def change_config( + self, + override_dropouts=False, + embed_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + ): + logging.info("changing decoder config") + + if override_dropouts: + logging.info("overriding decoder dropouts") + self.rnn_dropout_rate = rnn_dropout_rate + self.rnn.p = self.rnn_dropout_rate + self.embed_dropout_rate = embed_dropout_rate + self.embed_dropout = nn.Dropout(self.embed_dropout_rate) diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index a8bb24d5..419ea742 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -9,7 +9,8 @@ from .transducer import RNNTransducer, RNNRNNTransducer from .wav2languageid import HFWav2Vec2ResNet1dLanguageID from .wav2transducer import (HFWav2Vec2RNNRNNTransducer, - HFWav2Vec2RNNTransducer, HFWav2Vec2Transducer) + HFWav2Vec2RNNTransducer, HFWav2Vec2Transducer, + HFWav2Vec2RNNFiLMTransducer) from .wav2xvectors import (HFHubert2ResNet1dXVector, HFWav2Vec2ResNet1dXVector, HFWavLM2ResNet1dXVector) from .wav2transducer_languageid import HFWav2Vec2RNNTransducerResnet1D diff --git a/hyperion/torch/models/transducer/__init__.py b/hyperion/torch/models/transducer/__init__.py index ee3c85f5..9d860a22 100644 --- a/hyperion/torch/models/transducer/__init__.py +++ b/hyperion/torch/models/transducer/__init__.py @@ -6,6 +6,7 @@ from .rnn_rnn_transducer import RNNRNNTransducer from .rnn_transducer import RNNTransducer, RNNTransducerOutput +from .rnn_film_transducer import RNNFiLMTransducer from .transducer import Transducer #from .conformer import Conformer diff --git a/hyperion/torch/models/transducer/rnn_film_transducer.py b/hyperion/torch/models/transducer/rnn_film_transducer.py new file mode 100644 index 00000000..0e8c2889 --- /dev/null +++ b/hyperion/torch/models/transducer/rnn_film_transducer.py @@ -0,0 +1,255 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple, Union + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +try: + import k2 +except ModuleNotFoundError: + from ...utils import dummy_k2 as k2 + +import torch + +from ....utils import HypDataClass +from ....utils.misc import filter_func_args +from ...narchs import RNNFiLMTransducerDecoder +from ...torch_model import TorchModel + + +@dataclass +class RNNTransducerOutput(HypDataClass): + + loss: torch.Tensor + loss_simple: Optional[torch.Tensor] = None + loss_pruned: Optional[torch.Tensor] = None + h_feats: Optional[List[torch.Tensor]] = None + + +class RNNFiLMTransducer(TorchModel): + """ Base-class for RNN-T in + "Sequence Transduction with Recurrent Neural Networks" + https://arxiv.org/pdf/1211.3711.pdf + + Attributes: + encoder: Encoder network module + decoder: RNN-T Decoder config. dictionary or module. 
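The decoder argument may be given as a plain config dictionary, which is instantiated internally as RNNFiLMTransducerDecoder(**decoder). A hedged sketch of such a dictionary follows, using only keys that appear in the stage-1 FiLM yaml and in the wav2vec wrapper elsewhere in this patch; the complete argument list, including the language-condition size, is defined by RNNFiLMTransducerDecoder and is not reproduced here:

    # illustrative only: keys taken from the stage-1 FiLM yaml and the wrapper code;
    # further required arguments may be omitted
    decoder_cfg = {
        "in_feats": 1024,        # set from hf_feats.hidden_size by the wav2vec wrapper
        "vocab_size": 8000,      # set from the BPE model by the training script
        "blank_id": 0,
        "rnnt_loss": "k2_pruned",
        "prune_range": 15,
        "simple_loss_scale": 0.2,
        "predictor": {"embed_dim": 1024, "num_layers": 2, "hid_feats": 512, "rnn_type": "lstm"},
        "joiner": {"hid_feats": 512},
    }
    # model = RNNFiLMTransducer(encoder=None, decoder=decoder_cfg)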
+ """ + + def __init__( + self, + encoder: Union[TorchModel, None], + decoder: Union[Dict, RNNFiLMTransducerDecoder], + ): + super().__init__() + if encoder is not None: + assert isinstance(encoder, TorchModel) + if isinstance(decoder, dict): + decoder = RNNFiLMTransducerDecoder(**decoder) + else: + assert isinstance(decoder, RNNFiLMTransducerDecoder) + + self.encoder = encoder + self.decoder = decoder + + def forward( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: Union[Dict, k2.RaggedTensor], + lang: torch.Tensor, + ) -> RNNTransducerOutput: + """ + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + y: ragged tensor with 2 axes [utt][label]. It contains labels of each + utterance. + Returns: + - Token logits with shape = (N, vocab_size) + - RNN-T loss. + """ + assert x.ndim == 3, x.shape + assert x_lengths.ndim == 1, x_lengths.shape + assert y.num_axes == 2, y.num_axes + + assert x.size(0) == x_lengths.size(0) == y.dim0 + assert torch.all( + x_lengths[:-1] >= x_lengths[1:] + ), f"x_lengths={x_lengths}" # check x_lengths are sorted + assert lang.size(0) == y.dim0 + assert lang.size(1) == 1 + + if self.encoder is not None: + x, x_lengths = self.encoder(x, x_lengths) + assert torch.all(x_lengths > 0) + + dec_output = self.decoder(x, x_lengths, y, lang) + output = RNNTransducerOutput(*dec_output) + return output + + def infer(self, + x: torch.Tensor, + x_lengths: torch.Tensor, + lang: torch.Tensor, + decoding_method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000) -> List[List[int]]: + """ + ASR tokens inference + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search + max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. + max_sym_per_utt: maximimum number of symbols in a single utterance. + Returns: + List of list of integer indexes of the recognizer's symbols. 
+ """ + assert x.ndim == 3, x.shape + assert x_lengths.ndim == 1, x_lengths.shape + assert x.size(0) == x_lengths.size(0) + + if self.encoder is not None: + x, x_lengths = self.encoder(x, x_lengths) + assert torch.all(x_lengths > 0) + + batch_size = x.size(0) + y = [] + for i in range(batch_size): + x_i = x[i:i + 1, :x_lengths[i]] + y_i = self.decoder.decode(x_i, + lang, + method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + y.append(y_i) + + return y + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + else: + raise ValueError(f"invalid train_mode={mode}") + + self._train_mode = mode + + def _train(self, train_mode: str): + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return ["full", "frozen"] + + def get_config(self): + if self.encoder is None: + enc_cfg = None + else: + enc_cfg = self.encoder.get_config() + del enc_cfg["class_name"] + + dec_cfg = self.decoder.get_config() + del dec_cfg["class_name"] + config = { + "encoder": enc_cfg, + "decoder": dec_cfg, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def filter_args(**kwargs): + # get arguments for pooling + args = {} + decoder_args = RNNFiLMTransducerDecoder.filter_args(**kwargs["decoder"]) + args["decoder"] = decoder_args + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNFiLMTransducerDecoder.add_class_args(parser, prefix="decoder") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + def change_config( + self, + decoder: Dict, + ): + logging.info("changing decoder config") + self.decoder.change_config(**decoder) + + @staticmethod + def filter_finetune_args(**kwargs): + args = {} + decoder_args = RNNFiLMTransducerDecoder.filter_finetune_args(**kwargs["decoder"]) + args["decoder"] = decoder_args + return args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNFiLMTransducerDecoder.add_finetune_args(parser, prefix="decoder") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def add_infer_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument("--decoding-method", + default="time_sync_beam_search", + choices=[ + "greedy", "time_sync_beam_search", + "align_length_sync_beam_search" + ]) + + parser.add_argument("--beam-width", + default=5, + type=int, + help="beam width for beam search") + parser.add_argument("--max-sym-per-frame", + default=3, + type=int, + help="max symbols RNN-T can emit in 1 frame") + parser.add_argument("--max-sym-per-utt", + default=1000, + type=int, + help="max symbols RNN-T can emit in 1 frame") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_infer_args(**kwargs): + return filter_func_args(RNNTransducer.infer, kwargs) diff --git a/hyperion/torch/models/wav2transducer/__init__.py 
b/hyperion/torch/models/wav2transducer/__init__.py index 79af6349..cd446982 100644 --- a/hyperion/torch/models/wav2transducer/__init__.py +++ b/hyperion/torch/models/wav2transducer/__init__.py @@ -7,3 +7,4 @@ from .hf_wav2vec2_transducer import HFWav2Vec2Transducer from .hf_wav2vec2rnn_rnn_transducer import HFWav2Vec2RNNRNNTransducer from .hf_wav2vec2rnn_transducer import HFWav2Vec2RNNTransducer +from .hf_wav2vec2rnn_film_transducer import HFWav2Vec2RNNFiLMTransducer diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py new file mode 100644 index 00000000..48d8084b --- /dev/null +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -0,0 +1,372 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import contextlib +import logging +from dataclasses import dataclass +from typing import Dict, List, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...torch_model import TorchModel +from ...utils import remove_silence +from ..transducer import RNNFiLMTransducer + + +class HFWav2RNNFiLMTransducer(TorchModel): + """Abstract Base class for x-vector models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + transducer: transducer model object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__(self, + hf_feats: TorchModel, + transducer: Union[Dict, TorchModel], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg"): + + super().__init__() + self.hf_feats = hf_feats + if isinstance(transducer, dict): + transducer["decoder"]["in_feats"] = hf_feats.hidden_size + #transducer["joiner"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + + transducer["encoder"] = None + transducer = RNNFiLMTransducer(**transducer) + else: + assert isinstance(transducer, RNNFiLMTransducer) + if transducer.encoder is None: + assert transducer.decoder.in_feats == hf_feats.hidden_size + #assert transducer.joiner.in_feats == hf_feats.hidden_size + + self.transducer = transducer + self.feat_fusion_start = feat_fusion_start + self.feat_fusion_method = feat_fusion_method + self._hf_context = contextlib.nullcontext() + self._make_fuser() + + def _make_fuser(self): + if self.feat_fusion_method == "last": + self.feat_fuser = None + return + + num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + layer_dim = self.hf_feats.hidden_size + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif self.feat_fusion_method == "linear": + self.feat_fuser = nn.Linear(num_layers, 1, bias=False) + self.feat_fuser.weight.data = torch.ones(1, + num_layers) / num_layers + elif self.feat_fusion_method == "cat": + self.feat_fuser = nn.Linear(num_layers * layer_dim, + layer_dim, + bias=False) + + def _fuse_hid_feats(self, hid_feats): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. 
+ + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + hid_feats = hid_feats[self.feat_fusion_start:] + if self.feat_fusion_method == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats) + elif self.feat_fusion_method == "last": + feats = hid_feats[-1] + + return feats + + def forward_feats(self, + x, + x_lengths, + return_feat_layers=None, + chunk_length=0, + detach_chunks=False): + return_hid_states = (False if return_feat_layers is None + and self.feat_fusion_method == "last" else True) + with self._hf_context: + hf_output = self.hf_feats( + x, + x_lengths, + return_hid_states=return_hid_states, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) + feat_lengths = hf_output["hidden_states_lengths"] + if return_hid_states: + hid_feats = hf_output["hidden_states"] + feats = self._fuse_hid_feats(hid_feats) + else: + hid_feats = None + feats = hf_output["last_hidden_state"] + + feats = feats.transpose(1, 2) + if return_feat_layers is not None: + # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) + # as the hidden features of the x-vector encoder. + hid_feats = [ + f.transpose(1, 2) for i, f in enumerate(hid_feats) + if i in return_feat_layers + ] + else: + hid_feats = None + + return feats, hid_feats, feat_lengths + + def forward( + self, + x, + languageid, + x_lengths=None, + text=None, + return_feat_layers=None, + # return_enc_layers=None, + return_logits=True, + ): + """Forward function. If returns the logits posteriors of the classes. + It can also returns the hidden representations in the wav2vec feature extractor, + the x-vector encoder and the + classification head. In this case the ouput variable is a dictionary. + + Args: + x: input features tensor with shape=(batch, in_feats, time) + x_lengths: time lengths of the features with shape=(batch,) + y: target classes torch.long tensor with shape=(batch,) + return_feat_layers: list of integers indicating, which wav2vec layers + we should return. If None, no wav2vec layers are returned. + return_enc_layers: list of integers indicating, which encoder layers + we should return. If None, no encoder layers are returned. + return_logits: if True, it adds the logits to the output dictionary. 
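For reference, a small standalone sketch of the "weighted-avg" fusion and the (batch, channels, time) layout that forward_feats hands to the transducer; the layer count and dimensions below are arbitrary:

import torch
import torch.nn as nn

num_layers, B, T, C = 13, 2, 100, 1024
hid_feats = [torch.randn(B, T, C) for _ in range(num_layers)]  # wav2vec hidden states

feat_fuser = nn.Parameter(torch.zeros(num_layers))   # learnable per-layer weights
stacked = torch.stack(hid_feats, dim=-1)              # (B, T, C, num_layers)
w = nn.functional.softmax(feat_fuser, dim=-1)         # normalized layer weights
feats = torch.sum(stacked * w, dim=-1)                # (B, T, C)
feats = feats.transpose(1, 2)                         # (B, C, T), as forward_feats returns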
+ Returns: + Dataclass with losses, "h_enc" (list of hidden encoder layers), + "h_feats" (wav2vec features) + """ + feats, hid_feats, feat_lengths = self.forward_feats( + x, x_lengths, return_feat_layers) + + feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + output = self.transducer( + feats, + feat_lengths, + text, + languageid, + ) + + if return_feat_layers: + output.h_feats = hid_feats + + return output + + def infer(self, + x: torch.Tensor, + x_lengths: torch.Tensor, + langugeid: torch.Tensor, + decoding_method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000): + """ + ASR tokens inference + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search + max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. + max_sym_per_utt: maximimum number of symbols in a single utterance. + Returns: + List of list of integer indexes of the recognizer's symbols. + """ + + feats, _, feat_lengths = self.forward_feats(x, x_lengths) + + feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + y = self.transducer.infer(feats, + feat_lengths, + langugeid, + decoding_method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + return y + + def freeze_feat_fuser(self): + if self.feat_fuser is None: + return + + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser.requires_grad = False + return + + for param in self.feat_fuser.parameters(): + param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode in ["ft-transducer", "ft-transducer-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_feat_fuser() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + else: + raise ValueError(f"invalid train_mode={mode}") + + logging.info("train mode set to %s", mode) + + if "nograd" in mode: + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext() + + self._train_mode = mode + + def _train(self, train_mode: str): + + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode in [ + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ]: + self.hf_feats.train() + self.transducer._train("full") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "hf_feats", + "transducer", + "feat_fusion_start", + "feat_fusion_method", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + def get_config(self): + hf_cfg = self.hf_feats.get_config() + tran_cfg = self.transducer.get_config() + del hf_cfg["class_name"] + 
del tran_cfg["class_name"] + config = { + "hf_feats": hf_cfg, + "transducer": tran_cfg, + "feat_fusion_start": self.feat_fusion_start, + "feat_fusion_method": self.feat_fusion_method, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def change_config(self, hf_feats, transducer): + logging.info("changing hf wav2transducer config") + self.hf_feats.change_config(**hf_feats) + self.transducer.change_config(**transducer) + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--feat-fusion-start", + default=0, + type=int, + help=""" + the input to x-vector model will fuse the wav2vec + layers from feat_fusion_start to + the wav2vec num_layers""", + ) + parser.add_argument( + "--feat-fusion-method", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]"), + ) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) + + @staticmethod + def add_infer_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNFiLMTransducer.add_infer_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_infer_args(**kwargs): + return RNNFiLMTransducer.filter_infer_args(**kwargs) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py new file mode 100644 index 00000000..e76867bc --- /dev/null +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py @@ -0,0 +1,145 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFWav2Vec2 +from .hf_wav2rnn_film_transducer import HFWav2RNNFiLMTransducer +from ..transducer import RNNFiLMTransducer +from ...layer_blocks import initialize_lstm_with_film + +class HFWav2Vec2RNNFiLMTransducer(HFWav2RNNFiLMTransducer): + """Class for RNN-T with Wav2Vec2 features + + Attributes: + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + transducer: Transducer configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. 
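As a quick sanity check on the fusion attributes described above, the number of fused hidden states follows num_encoder_layers + 1 - feat_fusion_start (the formula used in _make_fuser); assuming the 24-layer XLS-R 300M encoder used elsewhere in this recipe:

# Hypothetical numbers for the XLS-R 300M encoder used in the recipe configs.
num_encoder_layers = 24                                   # transformer blocks
feat_fusion_start = 2                                     # as set in the v1.0 configs
num_fused = num_encoder_layers + 1 - feat_fusion_start    # 23 hidden states are fused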
+ """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, RNNFiLMTransducer], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + # if isinstance(transducer, dict): + # transducer["decoder"]["in_feats"] = hf_feats.hidden_size + # transducer["joiner"]["in_feats"] = hf_feats.hidden_size + # if "class_name" in transducer: + # del transducer["class_name"] + # transducer = Transducer(**transducer) + # else: + # assert isinstance(transducer, Transducer) + # assert transducer.decoder.in_feats == hf_feats.hidden_size + # assert transducer.joiner.in_feats == hf_feats.hidden_size + + super().__init__(hf_feats, transducer, feat_fusion_start, + feat_fusion_method) + + + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2RNNFiLMTransducer.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNFiLMTransducer.filter_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + RNNFiLMTransducer.add_class_args(parser, prefix="transducer") + HFWav2RNNFiLMTransducer.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNFiLMTransducer.filter_finetune_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + RNNFiLMTransducer.add_finetune_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + + + @staticmethod + def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None): + model_data = None + if cfg is None or state_dict is None: + assert file_path is not None + model_data = torch.load(file_path) + if cfg is None: + cfg = model_data["model_cfg"] + if state_dict is None and model_data is not None: + state_dict = model_data["model_state_dict"] + + if "class_name" in cfg: + del cfg["class_name"] + + return cfg, state_dict + + # check again + + @classmethod + def load(cls, file_path=None, cfg=None, state_dict=None): + cfg, state_dict = TorchModel._load_cfg_state_dict( + file_path, cfg, state_dict) + + model = cls(**cfg) + if state_dict is not None: + # remove the lstm layers from the state_dict + # because the lstm are changed to lstm with film + state_dict = ODict( + [(k, v) for k, v in state_dict.items() + if not k.startswith("lstm")]) + # initialize the lstm with film with the pretrained lstm + initialize_lstm_with_film( + model.transducer.predictor.rnn, [(k, v) for k, v in state_dict.items() if k.startswith("lstm")]) + + # load the state_dict + model.load_state_dict(state_dict, strict=False) + return model \ No newline at end of file diff --git 
a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index b710655e..b9f39de8 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -40,8 +40,10 @@ class HFWav2RNNTransducerLanguageID(TorchModel): def __init__(self, hf_feats: TorchModel, - transducer: Union[Dict, TorchModel], - languageid: Union[Dict, TorchModel], + transducer: TorchModel, + languageid: TorchModel, + transducer_fuser: TorchModel, + languageid_fuser: TorchModel, feat_fusion_start: int = 0, feat_fusion_method: str = "weighted-avg", loss_weight_transducer: float = 0.005, @@ -49,46 +51,48 @@ def __init__(self, super().__init__() self.hf_feats = hf_feats - if isinstance(transducer, dict): - transducer["decoder"]["in_feats"] = hf_feats.hidden_size - #transducer["joiner"]["in_feats"] = hf_feats.hidden_size - if "class_name" in transducer: - del transducer["class_name"] - - transducer["encoder"] = None - transducer = RNNTransducer(**transducer) - else: - assert isinstance(transducer, RNNTransducer) - if transducer.encoder is None: - assert transducer.decoder.in_feats == hf_feats.hidden_size - #assert transducer.joiner.in_feats == hf_feats.hidden_size + # if isinstance(transducer, dict): + # transducer["decoder"]["in_feats"] = hf_feats.hidden_size + # #transducer["joiner"]["in_feats"] = hf_feats.hidden_size + # if "class_name" in transducer: + # del transducer["class_name"] + + # transducer["encoder"] = None + # transducer = RNNTransducer(**transducer) + # else: + # assert isinstance(transducer, RNNTransducer) + # if transducer.encoder is None: + # assert transducer.decoder.in_feats == hf_feats.hidden_size + # #assert transducer.joiner.in_feats == hf_feats.hidden_size self.transducer = transducer self.languageid = languageid + self.transducer_fuser = transducer_fuser + self.languageid_fuser = languageid_fuser + self.feat_fusion_start = feat_fusion_start self.feat_fusion_method = feat_fusion_method self.loss_weight_transducer = loss_weight_transducer self.loss_weight_lid = loss_weight_lid self._hf_context = contextlib.nullcontext() - self._make_fuser() - - def _make_fuser(self): - if self.feat_fusion_method == "last": - self.feat_fuser = None - return - num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start - layer_dim = self.hf_feats.hidden_size - if self.feat_fusion_method == "weighted-avg": - self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) - elif self.feat_fusion_method == "linear": - self.feat_fuser = nn.Linear(num_layers, 1, bias=False) - self.feat_fuser.weight.data = torch.ones(1, - num_layers) / num_layers - elif self.feat_fusion_method == "cat": - self.feat_fuser = nn.Linear(num_layers * layer_dim, - layer_dim, - bias=False) + # def _make_fuser(self, transducer_fuser, languageid_fuser): + # if self.feat_fusion_method == "last": + # self.feat_fuser = None + # return + + # num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + # layer_dim = self.hf_feats.hidden_size + # if self.feat_fusion_method == "weighted-avg": + # self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + # elif self.feat_fusion_method == "linear": + # self.feat_fuser = nn.Linear(num_layers, 1, bias=False) + # self.feat_fuser.weight.data = torch.ones(1, + # num_layers) / num_layers + # elif self.feat_fusion_method == "cat": + # self.feat_fuser = 
nn.Linear(num_layers * layer_dim, + # layer_dim, + # bias=False) def _fuse_hid_feats(self, hid_feats): """Fuses the hidden features from the Wav2Vec model. @@ -106,18 +110,23 @@ def _fuse_hid_feats(self, hid_feats): hid_feats = hid_feats[self.feat_fusion_start:] if self.feat_fusion_method == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) - norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) - feats = torch.sum(hid_feats * norm_weights, dim=-1) + norm_weights_transducer = nn.functional.softmax(self.transducer_fuser, dim=-1) + norm_weights_languageid = nn.functional.softmax(self.languageid_fuser, dim=-1) + feats_transducer = torch.sum(hid_feats * norm_weights_transducer, dim=-1) + feats_languageid = torch.sum(hid_feats * norm_weights_languageid, dim=-1) elif self.feat_fusion_method == "linear": hid_feats = torch.stack(hid_feats, dim=-1) - feats = self.feat_fuser(hid_feats).squeeze(dim=-1) + feats_transducer = self.transducer_fuser(hid_feats).squeeze(dim=-1) + feats_languageid = self.languageid_fuser(hid_feats).squeeze(dim=-1) elif self.feat_fusion_method == "cat": hid_feats = torch.cat(hid_feats, dim=-1) - feats = self.feat_fuser(hid_feats) + feats_transducer = self.transducer_fuser(hid_feats) + feats_languageid = self.languageid_fuser(hid_feats) elif self.feat_fusion_method == "last": - feats = hid_feats[-1] + feats_transducer = hid_feats[-1] + feats_languageid = hid_feats[-1] - return feats + return feats_transducer, feats_languageid def forward_feats(self, x, @@ -138,12 +147,14 @@ def forward_feats(self, feat_lengths = hf_output["hidden_states_lengths"] if return_hid_states: hid_feats = hf_output["hidden_states"] - feats = self._fuse_hid_feats(hid_feats) + feats_transducer, feats_languageid = self._fuse_hid_feats(hid_feats) else: hid_feats = None - feats = hf_output["last_hidden_state"] + feats_transducer = hf_output["last_hidden_state"] + feats_languageid = hf_output["last_hidden_state"] - feats = feats.transpose(1, 2) + feats_transducer = feats_transducer.transpose(1, 2) + feats_languageid = feats_languageid.transpose(1, 2) if return_feat_layers is not None: # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) # as the hidden features of the x-vector encoder. 
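The hunk above replaces the single feature fuser with one per task; a minimal sketch of the "weighted-avg" case, showing that the ASR and LID branches share the same hidden states and differ only in their learned layer weights (sizes are arbitrary):

import torch
import torch.nn as nn

num_layers, B, T, C = 13, 2, 100, 1024
hid_feats = torch.stack([torch.randn(B, T, C) for _ in range(num_layers)], dim=-1)

transducer_fuser = nn.Parameter(torch.zeros(num_layers))   # ASR branch weights
languageid_fuser = nn.Parameter(torch.zeros(num_layers))   # LID branch weights

w_asr = nn.functional.softmax(transducer_fuser, dim=-1)
w_lid = nn.functional.softmax(languageid_fuser, dim=-1)
feats_transducer = torch.sum(hid_feats * w_asr, dim=-1)    # (B, T, C) for the transducer
feats_languageid = torch.sum(hid_feats * w_lid, dim=-1)    # (B, T, C) for the LID head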
@@ -154,7 +165,7 @@ def forward_feats(self, else: hid_feats = None - return feats, hid_feats, feat_lengths + return feats_transducer, feats_languageid, hid_feats, feat_lengths def forward( self, @@ -185,13 +196,13 @@ def forward( Dataclass with losses, "h_enc" (list of hidden encoder layers), "h_feats" (wav2vec features) """ - feats, hid_feats, feat_lengths = self.forward_feats( + feats_transducer, feats_languageid, hid_feats, feat_lengths = self.forward_feats( x, x_lengths, return_feat_layers) - - + feats_transducer = feats_transducer.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + logits = self.languageid( - feats, + feats_languageid, None, languageid, return_enc_layers=return_enc_layers, @@ -200,21 +211,17 @@ def forward( ) loss_lid = nn.CrossEntropyLoss()(logits, languageid) - - - - feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + trans_output = self.transducer( - feats, + feats_transducer, feat_lengths, text, ) - if return_feat_layers: trans_output.h_feats = hid_feats - output = RNNTransducerLanguageIDOutput( self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid, trans_output.loss, loss_lid,trans_output.loss_simple, trans_output.loss_pruned,trans_output.h_feats) + output = RNNTransducerLanguageIDOutput(self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid, trans_output.loss, loss_lid,trans_output.loss_simple, trans_output.loss_pruned,trans_output.h_feats) return output def infer(self, @@ -236,9 +243,9 @@ def infer(self, List of list of integer indexes of the recognizer's symbols. """ - feats, _, feat_lengths = self.forward_feats(x, x_lengths) + feats_transducer, _, _, feat_lengths = self.forward_feats(x, x_lengths) - feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + feats = feats_transducer.permute(0, 2, 1) # (N, C, T) ->(N, T, C) y = self.transducer.infer(feats, feat_lengths, @@ -341,11 +348,14 @@ def filter_args(**kwargs): def get_config(self): hf_cfg = self.hf_feats.get_config() tran_cfg = self.transducer.get_config() + lid_cfg = self.languageid.get_config() del hf_cfg["class_name"] del tran_cfg["class_name"] + del lid_cfg["class_name"] config = { "hf_feats": hf_cfg, "transducer": tran_cfg, + "languageid": lid_cfg, "feat_fusion_start": self.feat_fusion_start, "feat_fusion_method": self.feat_fusion_method, "loss_weight_transducer": self.loss_weight_transducer, diff --git a/hyperion/torch/narchs/__init__.py b/hyperion/torch/narchs/__init__.py index 4fe8b4ed..049f5d23 100644 --- a/hyperion/torch/narchs/__init__.py +++ b/hyperion/torch/narchs/__init__.py @@ -22,6 +22,7 @@ from .resnet_factory import ResNetFactory from .rnn_encoder import RNNEncoder from .rnn_transducer_decoder import RNNTransducerDecoder +from .rnn_film_transducer_decoder import RNNFiLMTransducerDecoder from .spinenet import * from .spinenet_factory import SpineNetFactory from .tdnn import TDNNV1 diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py new file mode 100644 index 00000000..cf1652b5 --- /dev/null +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -0,0 +1,843 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from dataclasses import dataclass +import logging +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torchaudio +import torchaudio.functional +from jsonargparse import ActionParser, ArgumentParser, ActionYesNo + +try: + 
import k2 +except ModuleNotFoundError: + from ...utils import dummy_k2 as k2 + +from ...utils.misc import filter_func_args +from ...utils.text import add_sos +from ..layer_blocks import TransducerJoiner as Joiner +from ..layer_blocks import TransducerRNNPredictor as RNNPredictor, TransducerConvPredictor as ConvPredictor +from .net_arch import NetArch + + +@dataclass +class Hypothesis: + ys: List[int] # predicted sequences + log_prob: float # log prob of ys + + # Optional LSTM predictor state. + pred_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + + +class RNNFiLMTransducerDecoder(NetArch): + """ RNN-T Decoder composed of Predictor and Joiner networks + Implementation based on + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/transducer/transducer.py + + Attributes: + in_feats: input features dimension (encoder output) + vocab_size: Number of tokens of the modeling unit including blank. + predictor: Dictionary with the predictor options. + joiner: Dictionary with the joiner options. + blank_id: id of the null symbol. + rnnt_loss: type of rnn-t loss between torchaudio, k2 or k2_pruned. + rnnt_type: rnn-t variation between regular, modified or constrained. + delay_penalty: penalize symbol delay, which is used to make symbol + emit earlier. + reduction: type of reduction for rnn-t loss between sum or mean + prune_range: how many symbols to keep for each frame in k2 rnn-t + pruned loss. + lm_scale: language model scale in rnn-t smoothed loss. + am_scale: acoustic model scale in rnn-t smoothed loss. + simple_loss_scale: weight of rnn-t simple loss when using k2 pruned loss. + pruned_warmup_steps: number of steps to warm up the k2 rnn-t pruned loss + from 0.1 to 1. + """ + + def __init__( + self, + in_feats: int, + vocab_size: int, + predictor: Dict, + joiner: Dict, + blank_id: int = 0, + rnnt_loss: str = "k2_pruned", + rnnt_type: str = "regular", + delay_penalty: float = 0.0, + reduction: str = "sum", + prune_range: int = 5, + lm_scale: float = 0.25, + am_scale: float = 0.0, + simple_loss_scale: float = 0.5, + pruned_warmup_steps: int = 2000, + langs_size: int = 13, + condition_size: int = 64, + ): + + super().__init__() + self.in_feats = in_feats + self.vocab_size = vocab_size + self.predictor_args = predictor + self.joiner_args = joiner + self.blank_id = blank_id + self.rnnt_loss = rnnt_loss + self.rnnt_type = rnnt_type + self.delay_penalty = delay_penalty + self.reduction = reduction + self.prune_range = prune_range + self.lm_scale = lm_scale + self.am_scale = am_scale + self.simple_loss_scale = simple_loss_scale + self.pruned_warmup_steps = pruned_warmup_steps + self.condition_size = condition_size + + + self._make_predictor() + self._make_joiner() + # make embedding layer for language id + self.lang_embedding = nn.Embedding(langs_size, condition_size) + if self.rnnt_loss == "k2_pruned": + self.simple_am_proj = nn.Linear(in_feats, vocab_size) + self.simple_lm_proj = nn.Linear(self.predictor.out_feats, + vocab_size) + self.register_buffer("cur_step", torch.as_tensor(0, + dtype=torch.int)) + + def _make_predictor(self): + pred_type = self.predictor_args["pred_type"] + self.predictor_args["in_feats"] = self.in_feats + self.predictor_args["vocab_size"] = self.vocab_size + self.predictor_args["blank_id"] = self.blank_id + self.predictor_args["condition_size"] = self.condition_size + # Add FiLM args to the predictor args + if pred_type == "rnn": + pred_args = filter_func_args(RNNPredictor.__init__, + self.predictor_args) + self.predictor = RNNPredictor(**pred_args) + 
# elif pred_type == "conv": + # pred_args = filter_func_args(ConvPredictor.__init__, + # self.predictor_args) + # self.predictor = ConvPredictor(**pred_args) + else: + raise ValueError(f"Unknown predictor type {pred_type}") + + def _make_joiner(self): + joiner_type = self.joiner_args["joiner_type"] + # Add FiLM args to the joiner args + + if joiner_type == "basic": + pred_feats = self.predictor_args["out_feats"] + hid_feats = self.joiner_args["hid_feats"] + self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, + self.vocab_size, self.condition_size) + else: + raise ValueError(f"Unknown joiner type {joiner_type}") + + def get_config(self): + config = { + "in_feats": self.in_feats, + "vocab_size": self.vocab_size, + "predictor": self.predictor_args, + "joiner": self.joiner_args, + "blank_id": self.blank_id, + "rnnt_loss": self.rnnt_loss, + "rnnt_type": self.rnnt_type, + "delay_penalty": self.delay_penalty, + "reduction": self.reduction, + "prune_range": self.prune_range, + "lm_scale": self.lm_scale, + "am_scale": self.am_scale, + "simple_loss_scale": self.simple_loss_scale, + "pruned_warmup_steps": self.pruned_warmup_steps, + "condition_size": self.condition_size, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def _rnnt_loss_torchaudio(self, x: torch.Tensor, x_lengths: torch.Tensor, + y: torch.Tensor, y_lengths: torch.Tensor, + pred_out: torch.Tensor, lang_embedding: torch.Tensor): + logits = self.joiner(x, pred_out, lang_embedding) + # rnnt_loss requires 0 padded targets + # Note: y does not start with SOS + y_padded = y.pad(mode="constant", padding_value=0) + x_lengths = x_lengths.to(torch.int32) + loss = torchaudio.functional.rnnt_loss( + logits=logits, + targets=y_padded.to(torch.int32), + logit_lengths=x_lengths, + target_lengths=y_lengths, + blank=self.blank_id, + reduction=self.reduction, + ) + return loss + + def _rnnt_loss_k2(self, x: torch.Tensor, x_lengths: torch.Tensor, + y: torch.Tensor, y_lengths: torch.Tensor, + pred_out: torch.Tensor, lang_embedding: torch.Tensor): + y_padded = y.pad(mode="constant", padding_value=0) + y_padded = y_padded.to(torch.int64) + boundary = torch.zeros((x.size(0), 4), + dtype=torch.int64, + device=x.device) + boundary[:, 2] = y_lengths + boundary[:, 3] = x_lengths + + logits = self.joiner(x, pred_out, lang_embedding) + + with torch.cuda.amp.autocast(enabled=False): + loss = k2.rnnt_loss( + logits=logits.float(), + symbols=y_padded, + termination_symbol=self.blank_id, + boundary=boundary, + rnnt_type=self.rnnt_type, + delay_penalty=self.delay_penalty, + reduction=self.reduction, + ) + return loss + + def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, + y: torch.Tensor, y_lengths: torch.Tensor, + pred_out: torch.Tensor, lang_embedding: torch.Tensor): + + y_padded = y.pad(mode="constant", padding_value=0) + y_padded = y_padded.to(torch.int64) + boundary = torch.zeros((x.size(0), 4), + dtype=torch.int64, + device=x.device) + boundary[:, 2] = y_lengths + boundary[:, 3] = x_lengths + + am_simple = self.simple_am_proj(x) + lm_simple = self.simple_lm_proj(pred_out) + with torch.cuda.amp.autocast(enabled=False): + loss_simple, (px_grad, py_grad) = k2.rnnt_loss_smoothed( + lm=lm_simple.float(), + am=am_simple.float(), + symbols=y_padded, + termination_symbol=self.blank_id, + lm_only_scale=self.lm_scale, + am_only_scale=self.am_scale, + boundary=boundary, + rnnt_type=self.rnnt_type, + delay_penalty=self.delay_penalty, + reduction=self.reduction, + return_grad=True, + ) + + 
# ranges : [B, T, prune_range] + ranges = k2.get_rnnt_prune_ranges( + px_grad=px_grad, + py_grad=py_grad, + boundary=boundary, + s_range=self.prune_range, + ) + + # am_pruned : [B, T, prune_range, encoder_dim] + # lm_pruned : [B, T, prune_range, decoder_dim] + am_pruned, lm_pruned = k2.do_rnnt_pruning( + am=self.joiner.enc_proj(x), + lm=self.joiner.pred_proj(pred_out), + ranges=ranges, + ) + + # logits : [B, T, prune_range, vocab_size] + + # project_input=False since we applied the decoder's input projections + # prior to do_rnnt_pruning (this is an optimization for speed). + logits = self.joiner(am_pruned, lm_pruned, lang_embedding, project_input=False) + + + with torch.cuda.amp.autocast(enabled=False): + loss_pruned = k2.rnnt_loss_pruned( + logits=logits.float(), + symbols=y_padded, + ranges=ranges, + termination_symbol=self.blank_id, + boundary=boundary, + rnnt_type=self.rnnt_type, + delay_penalty=self.delay_penalty, + reduction=self.reduction, + ) + + if self.cur_step > self.pruned_warmup_steps: + simple_loss_scale = self.simple_loss_scale + pruned_loss_scale = 1.0 + else: + r = self.cur_step / self.pruned_warmup_steps + simple_loss_scale = 1.0 - r * (1.0 - self.simple_loss_scale) + pruned_loss_scale = 0.1 + 0.9 * r + self.cur_step += 1 + # print(simple_loss_scale, pruned_loss_scale) + + loss = simple_loss_scale * loss_simple + pruned_loss_scale * loss_pruned + + return loss, loss_simple, loss_pruned + + def forward( + self, x: torch.Tensor, x_lengths: torch.Tensor, y: k2.RaggedTensor, lang: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + # embed lang + lang_embedding = self.lang_embedding(lang) + # get y_lengths + row_splits = y.shape.row_splits(1) + y_lengths = row_splits[1:] - row_splits[:-1] + # shift y adding token + sos_y = add_sos(y, sos_id=self.blank_id) + sos_y_padded = sos_y.pad(mode="constant", padding_value=self.blank_id) + sos_y_padded = sos_y_padded.to(torch.int64) + # apply predictor and joiner + pred_out, _ = self.predictor(sos_y_padded, lang_embedding) + loss_simple = loss_pruned = None + if self.rnnt_loss == "k2_pruned": + loss, loss_simple, loss_pruned = self._rnnt_loss_k2_pruned( + x, x_lengths, y, y_lengths, pred_out, lang_embedding) + elif self.rnnt_loss == "k2": + loss = self._rnnt_loss_k2(x, x_lengths, y, y_lengths, pred_out, lang_embedding) + elif self.rnnt_loss == "torchaudio": + loss_simple = loss_pruned = None + loss = self._rnnt_loss_torchaudio(x, x_lengths, y, y_lengths, + pred_out, lang_embedding) + + return loss, loss_simple, loss_pruned + + def decode(self, + x: torch.Tensor, + lang: torch.Tensor, + x_lengths: torch.Tensor = None, + method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000, ) -> List[int]: + + # embed lang + lang_embedding = self.lang_embedding(lang) + if method == "time_sync_beam_search": + return self.decode_time_sync_beam_search(x, + lang_embedding, + x_lengths, + beam_width=beam_width) + elif method == "align_length_sync_beam_search": + return self.decode_align_length_sync_beam_search( + x, + x_lengths, + lang_embedding, + beam_width=beam_width, + max_sym_per_utt=max_sym_per_utt) + elif method == "greedy": + return self.decode_greedy(x, + lang_embedding, + x_lengths, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + + def decode_greedy(self, + x: torch.Tensor, + lang_embedding: torch.Tensor, + x_lengths: torch.Tensor = None, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000) -> List[int]: + """ + Args: + 
x: encoder embeddings with shape = (N, T, C) + Returns: + Decoded tokens + """ + assert x.ndim == 3 + + # support only batch_size == 1 for now + assert x.size(0) == 1, x.size(0) + blank_id = self.blank_id + device = x.device + + sos = torch.Tensor([blank_id], device=device, + dtype=torch.int64).reshape(1, 1) + pred_out, (h, c) = self.predictor(sos, lang_embedding) + T = x.size(1) + t = 0 + hyp = [] + + sym_per_frame = 0 + sym_per_utt = 0 + + while t < T and sym_per_utt < max_sym_per_utt: + x_t = x[:, t:t + 1, :] + logits = self.joiner(x_t, pred_out, lang_embedding) # (1, 1, 1, vocab_size) + # logits is + + log_prob = logits.log_softmax(dim=-1) # (1, 1, 1, vocab_size) + # TODO: Use logits.argmax() + y = log_prob.argmax() + if y != blank_id: + hyp.append(y.item()) + y = y.reshape(1, 1) + pred_out, (h, c) = self.predictor(y, lang_embedding, (h, c)) + + sym_per_utt += 1 + sym_per_frame += 1 + + if y == blank_id or sym_per_frame > max_sym_per_frame: + sym_per_frame = 0 + t += 1 + + return hyp + + def decode_time_sync_beam_search(self, + x: torch.Tensor, + lang_embedding: torch.Tensor, + x_lengths: torch.Tensor = None, + beam_width: int = 5) -> List[int]: + assert x.ndim == 3 + assert x.size(0) == 1, x.size(0) + + blank_id = self.blank_id + device = x.device + + sos = torch.Tensor([blank_id], device=device).reshape(1, 1) + pred_out, (h, c) = self.predictor(sos, lang_embedding) + T = x.size(1) + t = 0 + B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] + max_u = 20000 # terminate after this number of steps + u = 0 + + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, + torch.Tensor]]] = {} + + while t < T and u < max_u: + x_t = x[:, t:t + 1, :] + A = B + B = [] + + while u < max_u: + y_star = max(A, key=lambda hyp: hyp.log_prob) + A.remove(y_star) + + # Note: y_star.ys is unhashable, i.e., cannot be used + # as a key into a dict + cached_key = "_".join(map(str, y_star.ys)) + + if cached_key not in cache: + pred_in = torch.Tensor([y_star.ys[-1]], + device=device).reshape(1, 1) + + pred_out, pred_state = self.predictor( + pred_in, + lang, + y_star.pred_state, + ) + cache[cached_key] = (pred_out, pred_state) + else: + pred_out, pred_state = cache[cached_key] + + logits = self.joiner(x_t, pred_out, lang_embedding) + log_prob = logits.log_softmax(dim=-1) + # log_prob is (1, 1, 1, vocab_size) + log_prob = log_prob.squeeze() + # Now log_prob is (vocab_size,) + + # If we choose blank here, add the new hypothesis to B. 
+ # Otherwise, add the new hypothesis to A + + # First, choose blank + skip_log_prob = log_prob[blank_id] + new_y_star_log_prob = y_star.log_prob + skip_log_prob.item() + # print("tuAB0", t, u, len(y_star.ys), y_star.log_prob, + # skip_log_prob.item(), new_y_star_log_prob) + # ys[:] returns a copy of ys + new_y_star = Hypothesis( + ys=y_star.ys[:], + log_prob=new_y_star_log_prob, + # Caution: Use y_star.decoder_state here + pred_state=y_star.pred_state, + ) + B.append(new_y_star) + + topk_log_prob = log_prob.topk(beam_width, dim=-1) + + # Second, choose other labels + #for i, v in enumerate(log_prob.tolist()): + for v, i in zip(*topk_log_prob): + v = v.item() + i = i.item() + if i == blank_id: + continue + new_ys = y_star.ys + [i] + new_log_prob = y_star.log_prob + v + new_hyp = Hypothesis( + ys=new_ys, + log_prob=new_log_prob, + pred_state=pred_state, + ) + A.append(new_hyp) + + u += 1 + # check whether B contains more than "beam" elements more probable + # than the most probable in A + A_most_probable = max(A, key=lambda hyp: hyp.log_prob) + #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + B = sorted( + [ + hyp + for hyp in B if hyp.log_prob > A_most_probable.log_prob + ], + key=lambda hyp: hyp.log_prob, + reverse=True, + ) + # print("tuAB2", + # t, + # u, + # len(A), + # A_most_probable.log_prob, + # len(B), + # flush=True) + if len(B) >= beam_width: + B = B[:beam_width] + break + t += 1 + + try: + best_hyp = max(B, + key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + except: + return "" + ys = best_hyp.ys[1:] # [1:] to remove the blank + return ys + + def decode_align_length_sync_beam_search( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + lang_embedding: torch.Tensor, + beam_width: int = 5, + max_sym_per_utt: int = 1000) -> List[int]: + assert x.ndim == 3 + assert x.size(0) == 1, x.size(0) + + blank_id = self.blank_id + device = x.device + + sos = torch.Tensor([blank_id], device=device).reshape(1, 1) + pred_out, (h, c) = self.predictor(sos, lang_embedding) + T = x.size(1) + #t = 0 + B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] + #max_u = 20000 # terminate after this number of steps + #u = 0 + + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, + torch.Tensor]]] = {} + F = [] + #for t < T and u < max_u: + for i in range(T + max_sym_per_utt): + A = [] + for y_star in B: + #while u < max_u: + u = len(y_star.ys) - 1 + t = i - u + if t >= T: + continue + + #y_star = max(A, key=lambda hyp: hyp.log_prob) + #A.remove(y_star) + x_t = x[:, t:t + 1, :] + # Note: y_star.ys is unhashable, i.e., cannot be used + # as a key into a dict + cached_key = "_".join(map(str, y_star.ys)) + + if cached_key not in cache: + pred_in = torch.Tensor([y_star.ys[-1]], + device=device).reshape(1, 1) + + pred_out, pred_state = self.predictor( + pred_in, + lang_embedding, + y_star.pred_state, + ) + cache[cached_key] = (pred_out, pred_state) + else: + pred_out, pred_state = cache[cached_key] + + logits = self.joiner(x_t, pred_out, lang_embedding) + log_prob = logits.log_softmax(dim=-1) # (1, 1, 1, vocab_size) + log_prob = log_prob.squeeze() # (vocab_size,) + + # First, choose blank + skip_log_prob = log_prob[blank_id] + new_y_star_log_prob = y_star.log_prob + skip_log_prob.item() + # print("tuAB0", t, u, len(y_star.ys), y_star.log_prob, + # skip_log_prob.item(), new_y_star_log_prob) + # ys[:] returns a copy of ys + new_y_star = Hypothesis( + ys=y_star.ys[:], + log_prob=new_y_star_log_prob, + # Caution: Use y_star.pred_state here + pred_state=y_star.pred_state, + 
) + A.append(new_y_star) + if t == T - 1: + F.append(y_star) + + topk_log_prob = log_prob.topk(beam_width, dim=-1) + + # Second, choose other labels + #for i, v in enumerate(log_prob.tolist()): + for v, i in zip(*topk_log_prob): + v = v.item() + i = i.item() + if i == blank_id: + continue + new_ys = y_star.ys + [i] + new_log_prob = y_star.log_prob + v + new_hyp = Hypothesis( + ys=new_ys, + log_prob=new_log_prob, + pred_state=pred_state, + ) + A.append(new_hyp) + + # check whether B contains more than "beam_width" elements more probable + # than the most probable in A + #A_most_probable = max(A, key=lambda hyp: hyp.log_prob) + #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + B0 = sorted( + [hyp for hyp in A], + key=lambda hyp: hyp.log_prob, + reverse=True, + ) + B = [] + B_ys = set() + for hyp in B0: + hyp_ys = tuple(hyp.ys) # to make ys hashable + if hyp_ys not in B_ys: + B.append(hyp) + B_ys.add(hyp_ys) + # print("tuAB2", + # t, + # u, + # len(A), + # A_most_probable.log_prob, + # len(B), + # flush=True) + if len(B) >= beam_width: + B = B[:beam_width] + break + + best_hyp = max(F, + key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + ys = best_hyp.ys[1:] # [1:] to remove the blank + return ys + + def change_config( + self, + override_dropouts=False, + embed_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + prune_range: Optional[int] = None, + ): + logging.info("changing decoder config") + self.predictor.change_config(override_dropouts, embed_dropout_rate, + rnn_dropout_rate) + if prune_range is not None: + self.prune_range = prune_range + + @staticmethod + def filter_args(**kwargs): + args = filter_func_args(RNNFiLMTransducerDecoder.__init__, kwargs) + return args + + @staticmethod + def filter_finetune_args(**kwargs): + args = filter_func_args(RNNFiLMTransducerDecoder.change_config, kwargs) + return args + + @staticmethod + def add_pred_args(parser): + + pred_parser = ArgumentParser(prog="") + pred_parser.add_argument( + "--pred-type", + default="rnn", + choices=["rnn", "conv"], + help= + """type of predictor between RNN and Convolutional [rnn, conv]""") + pred_parser.add_argument("--embed-dim", + default=1024, + type=int, + help=("token embedding dimension")) + pred_parser.add_argument( + "--embed-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for predictor input embeddings")) + pred_parser.add_argument("--rnn-dropout-rate", + default=0.0, + type=float, + help="""dropout prob for decoder RNN """) + pred_parser.add_argument( + "--rnn-type", + default="lstm", + choices=["lstm", "gru"], + help= + """type of recurrent network for thep predictor in [lstm, gru]""") + + pred_parser.add_argument("--num-layers", + default=2, + type=int, + help="""number of layers of the predictor """) + + pred_parser.add_argument("--hid-feats", + default=512, + type=int, + help="""hidden features of the predictor""") + pred_parser.add_argument("--out-feats", + default=512, + type=int, + help="""output features of the predictor""") + pred_parser.add_argument("--context-size", + default=2, + type=int, + help="""context length of the convolutional + predictor, 1->bigram, 2-> trigram,...""") + + parser.add_argument("--predictor", + action=ActionParser(parser=pred_parser)) + + @staticmethod + def add_joiner_args(parser): + + pred_parser = ArgumentParser(prog="") + pred_parser.add_argument( + "--joiner-type", + default="basic", + choices=["basic"], + help= + """type of joiner network, there is only basic joiner for now""") + pred_parser.add_argument("--hid-feats", 
+ default=512, + type=int, + help="""hidden features of the joiner""") + parser.add_argument("--joiner", + action=ActionParser(parser=pred_parser)) + + @staticmethod + def add_class_args(parser, + prefix=None, + skip=set(["in_feats", "blank_id", "vocab_size"])): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + if "in_feats" not in skip: + parser.add_argument("--in-feats", + type=int, + required=True, + help=("input feature dimension")) + if "blank_id" not in skip: + parser.add_argument("--blank-id", + type=int, + default=0, + help=("blank id from tokenizer model")) + if "vocab_size" not in skip: + parser.add_argument("--vocab-size", + type=int, + required=True, + help=("output prediction dimension")) + + RNNFiLMTransducerDecoder.add_pred_args(parser) + RNNFiLMTransducerDecoder.add_joiner_args(parser) + parser.add_argument( + "--rnnt-loss", + default="k2_pruned", + choices=["torchaudio", "k2", "k2_pruned"], + help="""type of rnn-t loss between torchaudio, k2 or k2_pruned.""") + parser.add_argument( + "--rnnt-type", + default="regular", + choices=["regular", "modified", "constrained"], + help= + """type of rnn-t loss between regular, modified or constrained.""") + parser.add_argument( + "--delay-penalty", + default=0.0, + type=float, + help= + """penalize symbol delay, which is used to make symbol emit earlier + for streaming models.""") + parser.add_argument( + "--reduction", + default="sum", + choices=["sum", "mean"], + help="""type of reduction for rnn-t loss between sum or mean""") + parser.add_argument( + "--prune-range", + default=None, + type=Optional[int], + help="""how many symbols to keep for each frame in k2 rnn-t + pruned loss.""") + parser.add_argument( + "--lm-scale", + default=0.25, + type=float, + help="""language model scale in rnn-t smoothed loss""") + parser.add_argument( + "--am-scale", + default=0.0, + type=float, + help="""acoustic model scale in rnn-t smoothed loss""") + parser.add_argument( + "--simple-loss-scale", + default=0.5, + type=float, + help="""weight of rnn-t simple loss when using k2 pruned loss""") + parser.add_argument( + "--pruned-warmup-steps", + default=2000, + type=int, + help="""number of steps to warm up the k2 rnn-t pruned loss + from 0.1 to 1""") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def add_finetune_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model.")) + parser.add_argument("--embed-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder input embeddings")) + parser.add_argument("--rnn-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder RNN ")) + + parser.add_argument( + "--prune-range", + default=5, + type=int, + help="""how many symbols to keep for each frame in k2 rnn-t + pruned loss.""") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) From 20c13e7abb532a3453142124fde7e240fa455ae5 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 14 May 2023 00:35:47 +0000 Subject: [PATCH 22/89] Add FiLMed Transducer --- ...2base_rnnt_film_k2_pruned_stage1_v1.0.yaml | 6 +- ...c2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml | 67 
+++++++++++++++++++ .../v1/global_conf/config_lid_v4.0_13langs.sh | 5 +- .../bin/train_wav2vec2rnn_film_transducer.py | 17 ++--- hyperion/torch/layer_blocks/film_blocks.py | 65 +++++------------- .../layer_blocks/transducer_film_joiner.py | 10 ++- .../layer_blocks/transducer_film_predictor.py | 4 +- .../models/transducer/rnn_film_transducer.py | 1 - .../hf_wav2rnn_film_transducer.py | 33 +++++++-- .../narchs/rnn_film_transducer_decoder.py | 4 +- .../trainers/transducer_languageid_trainer.py | 4 +- 11 files changed, 135 insertions(+), 81 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml index 7e059b3b..7110b50e 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml @@ -6,6 +6,7 @@ data: - conf/reverb_noise_aug.yaml return_segment_info: - text + - language sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' @@ -17,7 +18,7 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.3 + num_chunks_per_seg_epoch: 0.1 data_loader: num_workers: 1 @@ -28,6 +29,7 @@ data: wav_scale: 1 return_segment_info: - text + - language sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' @@ -59,7 +61,7 @@ model: rnn_type: lstm joiner: hid_feats: 512 - feat_fusion_method: weighted-avg + feat_fusion_method: film-weighted-avg feat_fusion_start: 2 trainer: optim: diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml new file mode 100644 index 00000000..d270d62c --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml @@ -0,0 +1,67 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + languageid: + cos_scale: 32.0 +trainer: + optim: + opt_type: sgd + lr: 0.0005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 4200 + #hold_steps: 1500 + decay_steps: 16000 + hold_steps: 32000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 1024 + train_mode: full + + \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh 
b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh index e6c3afda..ba42ad38 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs.sh @@ -28,9 +28,8 @@ nnet_name=${hf_model_name}_resnet1d_v4.0_13_langs nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0014.pth - -nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v4.0.yaml +nnet_s1=$nnet_s1_dir/model_ep0008.pth +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml nnet_s2_args="" nnet_s2_name=${hf_model_name}_resnet1d_v4.0_13_langs.s2 nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name diff --git a/hyperion/bin/train_wav2vec2rnn_film_transducer.py b/hyperion/bin/train_wav2vec2rnn_film_transducer.py index 0239820f..f06cc684 100755 --- a/hyperion/bin/train_wav2vec2rnn_film_transducer.py +++ b/hyperion/bin/train_wav2vec2rnn_film_transducer.py @@ -144,15 +144,15 @@ def train_model(gpu_id, args): #torch.backends.cudnn.benchmark = False # torch.backends.cudnn.enabled = False - # ddp_args = ddp.filter_ddp_args(**kwargs) - # device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - # kwargs["rank"] = rank + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank - # for Debug - rank = 0 - kwargs["rank"] = 0 - device = "cpu" - world_size=1 + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = torch.device("cuda:{}".format(gpu_id)) + # world_size=1 train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) @@ -172,6 +172,7 @@ def train_model(gpu_id, args): **trn_args, ) trainer.load_last_checkpoint() + # import pdb; pdb.set_trace() trainer.fit(train_loader, val_loader) ddp.ddp_cleanup() diff --git a/hyperion/torch/layer_blocks/film_blocks.py b/hyperion/torch/layer_blocks/film_blocks.py index 8370a42b..5caeab76 100644 --- a/hyperion/torch/layer_blocks/film_blocks.py +++ b/hyperion/torch/layer_blocks/film_blocks.py @@ -9,10 +9,15 @@ def __init__(self, input_size, condition_size): self.linear_scale = nn.Linear(condition_size, input_size) self.linear_shift = nn.Linear(condition_size, input_size) - def forward(self, x, condition): - gamma = self.linear_scale(condition).unsqueeze(2).expand_as(x) - beta = self.linear_shift(condition).unsqueeze(2).expand_as(x) - x = x * gamma + beta + def forward(self, x, lang_condition): + if x.ndim == 3: + gamma = self.linear_scale(lang_condition).unsqueeze(1).expand_as(x) + beta = self.linear_shift(lang_condition).unsqueeze(1).expand_as(x) + x = x * gamma + beta + elif x.ndim == 4: + gamma = self.linear_scale(lang_condition).unsqueeze(1).unsqueeze(2).expand_as(x) + beta = self.linear_shift(lang_condition).unsqueeze(1).unsqueeze(2).expand_as(x) + x = x * gamma + beta return x @@ -30,13 +35,15 @@ def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, self.films = nn.ModuleList([FiLM(hidden_size, condition_size) for _ in range(num_layers)]) self.dropout_layer = nn.Dropout(dropout) - def forward(self, x, states, condition): + def forward(self, x, states, lang_condition): outputs = [] - h, c = states new_h, new_c = [], [] for i, (lstm, film) in enumerate(zip(self.lstms, self.films)): - x, (h_i, c_i) = lstm(x, (h[i].unsqueeze(0), c[i].unsqueeze(0))) - x = film(x, condition) + if states: + x, (h_i, c_i) = lstm(x, (states[0][i].unsqueeze(0), states[1][i].unsqueeze(0))) + else: + x, (h_i, c_i) = lstm(x) + x 
= film(x, lang_condition) new_h.append(h_i) new_c.append(c_i) if i != self.num_layers - 1: @@ -44,45 +51,5 @@ def forward(self, x, states, condition): outputs.append(x) new_h = torch.cat(new_h, dim=0) new_c = torch.cat(new_c, dim=0) - return torch.cat(outputs, dim=0), (new_h, new_c) + return x, (new_h, new_c) - - -def initialize_lstm_with_film(lstm_with_film, pretrained_dict): - # Load pretrained LSTM state_dict - pretrained_lstm = pretrained_dict['lstm'] - pretrained_num_layers = pretrained_dict['num_layers'] - - # Copy weights from pretrained LSTM layers to LSTMWithFiLM - for i, (lstm, film) in enumerate(zip(lstm_with_film.lstms, lstm_with_film.films)): - if i < pretrained_num_layers: - lstm.weight_ih_l0.data.copy_(pretrained_lstm['weight_ih_l' + str(i)]) - lstm.weight_hh_l0.data.copy_(pretrained_lstm['weight_hh_l' + str(i)]) - lstm.bias_ih_l0.data.copy_(pretrained_lstm['bias_ih_l' + str(i)]) - lstm.bias_hh_l0.data.copy_(pretrained_lstm['bias_hh_l' + str(i)]) - else: - # For extra layers in LSTMWithFiLM, just reset the weights - nn.init.xavier_uniform_(lstm.weight_ih_l0) - nn.init.orthogonal_(lstm.weight_hh_l0) - nn.init.zeros_(lstm.bias_ih_l0) - nn.init.zeros_(lstm.bias_hh_l0) - - -# def initialize_lstm_with_film(lstm_with_film, pretrained_lstm): -# # Copy weights from pretrained LSTM layers to LSTMWithFiLM -# for i, (lstm, film) in enumerate(zip(lstm_with_film.lstms, lstm_with_film.films)): -# if i < pretrained_lstm.num_layers: -# lstm.weight_ih_l0.data.copy_(pretrained_lstm.weight_ih_l[i]) -# lstm.weight_hh_l0.data.copy_(pretrained_lstm.weight_hh_l[i]) -# lstm.bias_ih_l0.data.copy_(pretrained_lstm.bias_ih_l[i]) -# lstm.bias_hh_l0.data.copy_(pretrained_lstm.bias_hh_l[i]) -# else: -# # For extra layers in LSTMWithFiLM, just reset the weights -# nn.init.xavier_uniform_(lstm.weight_ih_l0) -# nn.init.orthogonal_(lstm.weight_hh_l0) -# nn.init.zeros_(lstm.bias_ih_l0) -# nn.init.zeros_(lstm.bias_hh_l0) - - - - # rnn = LSTMWithFiLM(embed_dim, hid_feats, num_layers, rnn_dropout_rate, batch_first=True) \ No newline at end of file diff --git a/hyperion/torch/layer_blocks/transducer_film_joiner.py b/hyperion/torch/layer_blocks/transducer_film_joiner.py index 22875258..7fdae60d 100644 --- a/hyperion/torch/layer_blocks/transducer_film_joiner.py +++ b/hyperion/torch/layer_blocks/transducer_film_joiner.py @@ -33,8 +33,7 @@ def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: self.pred_proj = nn.Linear(pred_feats, hid_feats) self.output = nn.Linear(hid_feats, vocab_size) - self.FiLM_encoder = FiLM(hid_feats, condition_size) - self.FiLM_joiner = FiLM(hid_feats, condition_size) + self.film = FiLM(hid_feats, condition_size) def get_config(self): config = { @@ -46,7 +45,7 @@ def get_config(self): def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor, - condition: torch.Tensor, + lang_condition: torch.Tensor, project_input: bool = True) -> torch.Tensor: """ @@ -60,19 +59,18 @@ def forward(self, """ assert enc_out.ndim == pred_out.ndim assert enc_out.ndim in (3, 4) - if enc_out.ndim == 3: enc_out = enc_out.unsqueeze(2) # (N, T, 1, C) pred_out = pred_out.unsqueeze(1) # (N, 1, U, C) - enc_out = self.FiLM_encoder(enc_out, condition) + # enc_out = self.FiLM_encoder(enc_out, lang_condition) if project_input: x = self.enc_proj(enc_out) + self.pred_proj(pred_out) else: x = enc_out + pred_out - x = self.FiLM_joiner(x, condition) + x = self.film(x, lang_condition) x = torch.tanh(x) logits = self.output(x) diff --git a/hyperion/torch/layer_blocks/transducer_film_predictor.py 
b/hyperion/torch/layer_blocks/transducer_film_predictor.py index 09fae3ec..dbb93218 100644 --- a/hyperion/torch/layer_blocks/transducer_film_predictor.py +++ b/hyperion/torch/layer_blocks/transducer_film_predictor.py @@ -93,7 +93,7 @@ def get_config(self): def forward( self, y: torch.Tensor, - condition: torch.Tensor, + lang_condition: torch.Tensor, states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ @@ -106,7 +106,7 @@ def forward( """ embed = self.embedding(y) embed = self.embed_dropout(embed) - out, (h, c) = self.rnn(embed, states, condition) + out, (h, c) = self.rnn(embed, states, lang_condition) if self.output_proj: out = self.output_proj(out) diff --git a/hyperion/torch/models/transducer/rnn_film_transducer.py b/hyperion/torch/models/transducer/rnn_film_transducer.py index 0e8c2889..444c4521 100644 --- a/hyperion/torch/models/transducer/rnn_film_transducer.py +++ b/hyperion/torch/models/transducer/rnn_film_transducer.py @@ -83,7 +83,6 @@ def forward( x_lengths[:-1] >= x_lengths[1:] ), f"x_lengths={x_lengths}" # check x_lengths are sorted assert lang.size(0) == y.dim0 - assert lang.size(1) == 1 if self.encoder is not None: x, x_lengths = self.encoder(x, x_lengths) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py index 48d8084b..7f6b9ba7 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -13,6 +13,7 @@ from ...torch_model import TorchModel from ...utils import remove_silence +from ...layer_blocks import FiLM from ..transducer import RNNFiLMTransducer @@ -63,7 +64,13 @@ def _make_fuser(self): num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start layer_dim = self.hf_feats.hidden_size - if self.feat_fusion_method == "weighted-avg": + if self.feat_fusion_method == "film-weighted-avg": + self.films = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size) for _ in range(num_layers)]) + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif self.feat_fusion_method == "weighted-avg-film": + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + self.film = FiLM(layer_dim, self.transducer.decoder.condition_size) + elif self.feat_fusion_method == "weighted-avg": self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif self.feat_fusion_method == "linear": self.feat_fuser = nn.Linear(num_layers, 1, bias=False) @@ -74,11 +81,12 @@ def _make_fuser(self): layer_dim, bias=False) - def _fuse_hid_feats(self, hid_feats): + def _fuse_hid_feats(self, hid_feats, lang): """Fuses the hidden features from the Wav2Vec model. Args: hid_feats: list of hidden features Tensors from Wav2Vec model. + lang: language id Tensor. 
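For reference, the FiLM conditioning used by the blocks in this patch predicts a per-channel scale (gamma) and shift (beta) from the language embedding and applies them to the hidden features. A minimal, self-contained sketch of that operation (module name, feature sizes and the toy usage are illustrative, not the recipe's actual classes):

import torch
import torch.nn as nn

class FiLMSketch(nn.Module):
    """Feature-wise Linear Modulation: x -> gamma(cond) * x + beta(cond)."""

    def __init__(self, feat_dim, cond_dim):
        super().__init__()
        self.to_gamma = nn.Linear(cond_dim, feat_dim)
        self.to_beta = nn.Linear(cond_dim, feat_dim)

    def forward(self, x, cond):
        # x: (batch, time, feat_dim), cond: (batch, cond_dim)
        gamma = self.to_gamma(cond).unsqueeze(1)  # (batch, 1, feat_dim), broadcast over time
        beta = self.to_beta(cond).unsqueeze(1)
        return gamma * x + beta

# toy usage: 2 utterances, 50 frames, 8-dim features, 4-dim language embedding
film = FiLMSketch(feat_dim=8, cond_dim=4)
x = torch.randn(2, 50, 8)
lang_emb = torch.randn(2, 4)
print(film(x, lang_emb).shape)  # torch.Size([2, 50, 8])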
Returns: Tensor of fused features (batch, channels, time) @@ -87,8 +95,19 @@ def _fuse_hid_feats(self, hid_feats): # There is only one layer of features return hid_feats[0] + lang_condition = self.transducer.decoder.lang_embedding(lang) hid_feats = hid_feats[self.feat_fusion_start:] - if self.feat_fusion_method == "weighted-avg": + if self.feat_fusion_method == "film-weighted-avg": + film_hid_feats = tuple(self.films[i](hid_feats[i], lang_condition) for i in range(len(self.films))) + film_hid_feats = torch.stack(film_hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(film_hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method == "weighted-avg-film": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + feats = self.film(feats, lang_condition) + elif self.feat_fusion_method == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) feats = torch.sum(hid_feats * norm_weights, dim=-1) @@ -106,6 +125,7 @@ def _fuse_hid_feats(self, hid_feats): def forward_feats(self, x, x_lengths, + lang: torch.Tensor, return_feat_layers=None, chunk_length=0, detach_chunks=False): @@ -122,7 +142,7 @@ def forward_feats(self, feat_lengths = hf_output["hidden_states_lengths"] if return_hid_states: hid_feats = hf_output["hidden_states"] - feats = self._fuse_hid_feats(hid_feats) + feats = self._fuse_hid_feats(hid_feats, lang) else: hid_feats = None feats = hf_output["last_hidden_state"] @@ -168,8 +188,9 @@ def forward( Dataclass with losses, "h_enc" (list of hidden encoder layers), "h_feats" (wav2vec features) """ + feats, hid_feats, feat_lengths = self.forward_feats( - x, x_lengths, return_feat_layers) + x, x_lengths, languageid, return_feat_layers) feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) output = self.transducer( @@ -204,7 +225,7 @@ def infer(self, List of list of integer indexes of the recognizer's symbols. 
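The two fusion modes added above differ only in whether FiLM is applied to each wav2vec layer before the learned weighted average (film-weighted-avg) or once to the averaged features (weighted-avg-film). A rough sketch of the weighted-average step itself, with a made-up layer count and dimensions rather than the recipe's actual hyperparameters:

import torch
import torch.nn as nn

def weighted_avg_fusion(hid_feats, layer_weights):
    """Fuse a list of per-layer hidden states with softmax-normalized weights.

    hid_feats: list of tensors, each (batch, time, feat_dim)
    layer_weights: learnable tensor of shape (num_layers,)
    """
    stacked = torch.stack(hid_feats, dim=-1)        # (batch, time, feat_dim, num_layers)
    norm_w = torch.softmax(layer_weights, dim=-1)   # weights sum to 1
    return torch.sum(stacked * norm_w, dim=-1)      # (batch, time, feat_dim)

# toy usage: 5 hypothetical transformer layers
num_layers = 5
weights = nn.Parameter(torch.zeros(num_layers))     # uniform average at initialization
feats = [torch.randn(2, 50, 16) for _ in range(num_layers)]
print(weighted_avg_fusion(feats, weights).shape)    # torch.Size([2, 50, 16])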
""" - feats, _, feat_lengths = self.forward_feats(x, x_lengths) + feats, _, feat_lengths = self.forward_feats(x, x_lengths, languageid) feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index cf1652b5..9f42a09c 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -20,8 +20,8 @@ from ...utils.misc import filter_func_args from ...utils.text import add_sos -from ..layer_blocks import TransducerJoiner as Joiner -from ..layer_blocks import TransducerRNNPredictor as RNNPredictor, TransducerConvPredictor as ConvPredictor +from ..layer_blocks import TransducerFiLMJoiner as Joiner +from ..layer_blocks import TransducerRNNFiLMPredictor as RNNPredictor from .net_arch import NetArch diff --git a/hyperion/torch/trainers/transducer_languageid_trainer.py b/hyperion/torch/trainers/transducer_languageid_trainer.py index 238e8022..2e9df702 100644 --- a/hyperion/torch/trainers/transducer_languageid_trainer.py +++ b/hyperion/torch/trainers/transducer_languageid_trainer.py @@ -78,7 +78,7 @@ def __init__( swa_anneal_epochs=10, cpu_offload=False, input_key="x", - target_key=["text", "languageid"], + target_key=["text", "language"], ): loss = None @@ -214,7 +214,7 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): skip=super_skip) if "target_key" not in skip: parser.add_argument("--target-keys", - default=["text", "languageid"], + default=["text", "language"], help="list of dict. key for nnet targets") if prefix is not None: From f8c84a9977d61e65cc4bc2ab67ce4af792e73836 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 14 May 2023 01:17:23 +0000 Subject: [PATCH 23/89] remove unused function --- hyperion/torch/layer_blocks/__init__.py | 2 +- .../models/wav2transducer/hf_wav2vec2rnn_film_transducer.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/hyperion/torch/layer_blocks/__init__.py b/hyperion/torch/layer_blocks/__init__.py index 6e2f1eb9..62c096b2 100644 --- a/hyperion/torch/layer_blocks/__init__.py +++ b/hyperion/torch/layer_blocks/__init__.py @@ -9,7 +9,7 @@ from .dc2d_blocks import DC2dDecBlock, DC2dEncBlock from .etdnn_blocks import ETDNNBlock from .fc_blocks import FCBlock -from .film_blocks import FiLM, LSTMWithFiLM, initialize_lstm_with_film +from .film_blocks import FiLM, LSTMWithFiLM from .mbconv_blocks import MBConvBlock, MBConvInOutBlock from .res2net1d_blocks import Res2Net1dBasicBlock, Res2Net1dBNBlock from .res2net2d_blocks import Res2Net2dBasicBlock, Res2Net2dBNBlock diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py index e76867bc..6d1ea944 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py @@ -12,7 +12,6 @@ from ...tpm import HFWav2Vec2 from .hf_wav2rnn_film_transducer import HFWav2RNNFiLMTransducer from ..transducer import RNNFiLMTransducer -from ...layer_blocks import initialize_lstm_with_film class HFWav2Vec2RNNFiLMTransducer(HFWav2RNNFiLMTransducer): """Class for RNN-T with Wav2Vec2 features From 05474decc6016ecdf8521c2078dd2b7cc01c7dc1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 15 May 2023 03:13:47 +0000 Subject: [PATCH 24/89] Add decode script and configurations --- ...2base_rnnt_film_k2_pruned_stage1_v1.0.yaml | 8 +- 
.../global_conf/config_lid_v4.0_13langs_v3.sh | 43 ++++++++++ egs/commonvoice/v1/local/initailize_model.py | 55 ++++++++++++ .../decode_wav2vec2rnn_transducer.sh | 79 ++++++++++++++++++ .../decode_wav2vec2rnn_transducer_lid.sh | 83 +++++++++++++++++++ 5 files changed, 265 insertions(+), 3 deletions(-) create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh create mode 100644 egs/commonvoice/v1/local/initailize_model.py create mode 100755 hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh create mode 100755 hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml index 7110b50e..7d3d133e 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml @@ -11,6 +11,7 @@ data: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' max_batch_length: 50 + max_audio_length: 15. min_batch_size: 1 drop_last: false # for class_weighted_random_bucketing_seg_sampler @@ -34,6 +35,7 @@ data: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' max_batch_length: 50 + max_audio_length: 15. min_batch_size: 1 drop_last: true # for class_weighted_random_bucketing_seg_sampler @@ -41,7 +43,7 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 1.0 + num_chunks_per_seg_epoch: 0.5 data_loader: num_workers: 1 model: @@ -66,7 +68,7 @@ model: trainer: optim: opt_type: sgd - lr: 0.005 + lr: 0.002 momentum: 0.9 weight_decay: 4e-4 lrsched: @@ -85,4 +87,4 @@ trainer: eff_batch_size: 128 train_mode: hf-feats-frozen-nograd - \ No newline at end of file + diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh new file mode 100644 index 00000000..8d6cbc80 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh @@ -0,0 +1,43 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v4.0_13_langs +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v4.0_13_langs.s4 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + 
+nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/local/initailize_model.py b/egs/commonvoice/v1/local/initailize_model.py new file mode 100644 index 00000000..7ae9db8e --- /dev/null +++ b/egs/commonvoice/v1/local/initailize_model.py @@ -0,0 +1,55 @@ +import torch + +# arguments example +# pretrained_model = 'exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0010.pth' +# film_model = "exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned_film.v1.0_13_langs_weighted_8000_bpe.s1_initial/model_ep0000.pth" +# output_model = "model_initialized.pth" + +pretrained_model = torch.load(sys.argv[1]) +film_model = torch.load(sys.argv[2]) + +output_model = sys.argv[3] + + +def update_film_lstm_parameters(film_state_dict, pretrained_state_dict): + for i in range(2): + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".weight_ih_l0"] = pretrained_state_dict['module.transducer.decoder.predictor.rnn.weight_ih_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".weight_hh_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.weight_hh_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".bias_ih_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.bias_ih_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".bias_hh_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.bias_hh_l' + str(i)].clone() + return film_state_dict + + +def copy_model_parameters(pretrained_model, film_model): + pretrained_state_dict = pretrained_model["model_state_dict"] + film_state_dict = film_model["model_state_dict"] + + update_state_dict = {name: param for name, param in pretrained_state_dict.items() if name in film_state_dict and param.shape == film_state_dict[name].shape} + new_film_state_dict = film_state_dict.copy() + new_film_state_dict.update(update_state_dict) + + new_film_state_dict = update_film_lstm_parameters(new_film_state_dict, pretrained_state_dict) + + film_model["model_state_dict"] = new_film_state_dict + + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in film_state_dict.items(): + if torch.all(torch.eq(param, new_film_state_dict[name])): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + + for name, param in pretrained_state_dict.items(): + if name not in changed_parameters: + unloaded_parameters.append(name) + + print(f"Unchanged parameters: {unchanged_parameters}") + print(f"Unloaded parameters: {unloaded_parameters}") + print(f"Changed parameters: {changed_parameters}") + film_model["epoch"] =1 + torch.save(film_model, output_model) + + +unchanged_parameters = copy_model_parameters(pretrained_model, film_model) \ No newline at end of file diff --git a/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh new file mode 100755 index 00000000..986c8190 --- /dev/null +++ b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. 
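The initialization script above transfers weights from a plain RNN-T checkpoint into the FiLM variant: parameters whose names and shapes match are copied directly, and the stacked predictor LSTM is split across the per-layer LSTMs of LSTMWithFiLM. A condensed sketch of that idea, following the key layout shown in the script but with placeholder checkpoint paths:

import sys
import torch

def copy_matching_parameters(pretrained_sd, target_sd):
    """Overwrite target entries whose name and shape match the pretrained ones."""
    updated = dict(target_sd)
    for name, param in pretrained_sd.items():
        if name in updated and param.shape == updated[name].shape:
            updated[name] = param.clone()
    return updated

def split_stacked_lstm(pretrained_sd, target_sd, src_prefix, dst_prefix, num_layers=2):
    """Map layer i of a stacked nn.LSTM onto the i-th single-layer LSTM of LSTMWithFiLM."""
    for i in range(num_layers):
        for kind in ("weight_ih", "weight_hh", "bias_ih", "bias_hh"):
            src = f"{src_prefix}.{kind}_l{i}"      # e.g. ...predictor.rnn.weight_ih_l0
            dst = f"{dst_prefix}.{i}.{kind}_l0"    # e.g. ...predictor.rnn.lstms.0.weight_ih_l0
            if src in pretrained_sd and dst in target_sd:
                target_sd[dst] = pretrained_sd[src].clone()
    return target_sd

if __name__ == "__main__":
    # usage: python init_sketch.py pretrained.pth film_init.pth output.pth (paths are placeholders)
    pretrained = torch.load(sys.argv[1], map_location="cpu")
    film = torch.load(sys.argv[2], map_location="cpu")
    new_sd = copy_matching_parameters(pretrained["model_state_dict"], film["model_state_dict"])
    new_sd = split_stacked_lstm(
        pretrained["model_state_dict"], new_sd,
        src_prefix="module.transducer.decoder.predictor.rnn",
        dst_prefix="module.transducer.decoder.predictor.rnn.lstms")
    film["model_state_dict"] = new_sd
    torch.save(film, sys.argv[3])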
+nj=30 +cmd="run.pl" +set -e +use_gpu=false +#write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +extra_args="" +infer_cfg=conf/infer.yaml +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 4 ] && [ $# != 5 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --infer-cfg # decoding configuration" + echo " --use-gpu # If true, use GPU." + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +bpe_model=$4 + +for f in $data_dir/wav.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + extra_args="${extra_args} --use-gpu" +fi + +# if [ "$write_utt2num_frames" == "true" ];then +# write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +# fi + +if [ $stage -le 0 ];then + $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + decode_wav2vec2rnn_transducer.py \ + --infer-args $infer_cfg \ + --part-idx JOB --num-parts $nj \ + --input $data_dir/wav.scp \ + --model-path $nnet_file \ + --bpe-model $bpe_model \ + --output $output_dir/transducer.JOB.text $extra_args +fi + +if [ $stage -le 1 ];then + echo "compute wer" + cat $output_dir/transducer.*.text > $output_dir/transducer.text + + python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text + python steps_transducer/word2char.py $data_dir/text $data_dir/text_char + + python steps_transducer/word2bpe.py $output_dir/transducer.text $output_dir/transducer_bpe.text $bpe_model + python steps_transducer/word2bpe.py $data_dir/text $data_dir/text_bpe $bpe_model + + # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text > $output_dir/wer + # compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char + # compute-wer --text --mode=present ark:$data_dir/text_bpe ark:$output_dir/transducer_bpe.text > $output_dir/wer_bpe + +fi diff --git a/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh new file mode 100755 index 00000000..3bf84cbd --- /dev/null +++ b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +nj=30 +cmd="run.pl" +set -e +use_gpu=false +#write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +extra_args="" +infer_cfg=conf/infer.yaml +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 5 ] && [ $# != 6 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 
+ echo " --infer-cfg # decoding configuration" + echo " --use-gpu # If true, use GPU." + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +bpe_model=$4 +lang_file=$5 + +for f in $data_dir/wav.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + extra_args="${extra_args} --use-gpu" +fi + +# if [ "$write_utt2num_frames" == "true" ];then +# write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +# fi + +if [ $stage -le 0 ];then + $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + decode_wav2vec2rnn_transducer_languageid.py \ + --infer-args $infer_cfg \ + --part-idx JOB --num-parts $nj \ + --input $data_dir/wav.scp \ + --model-path $nnet_file \ + --bpe-model $bpe_model \ + --lang-file $lang_file \ + --output_transducer $output_dir/transducer.JOB.text \ + --output_languageid $output_dir/languageid.JOB $extra_args +fi + +if [ $stage -le 1 ];then + echo "compute wer" + cat $output_dir/transducer.*.text > $output_dir/transducer.text + cat $output_dir/languageid.* > $output_dir/langs + + python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text + python steps_transducer/word2char.py $data_dir/text $data_dir/text_char + + # python steps_transducer/word2bpe.py $output_dir/transducer.text $output_dir/transducer_bpe.text $bpe_model + # python steps_transducer/word2bpe.py $data_dir/text $data_dir/text_bpe $bpe_model + + compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text > $output_dir/wer + compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char + # compute-wer --text --mode=present ark:$data_dir/text_bpe ark:$output_dir/transducer_bpe.text > $output_dir/wer_bpe + +fi From cb4f20eb2b03b4243a9e750d9b0039de610eea0d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 16 May 2023 02:56:27 +0000 Subject: [PATCH 25/89] remove redundant code --- .../hf_wav2vec2rnn_film_transducer.py | 38 ------------------- 1 file changed, 38 deletions(-) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py index 6d1ea944..513d193c 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py @@ -104,41 +104,3 @@ def add_finetune_args(parser, prefix=None): action=ActionParser(parser=parser)) - - @staticmethod - def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None): - model_data = None - if cfg is None or state_dict is None: - assert file_path is not None - model_data = torch.load(file_path) - if cfg is None: - cfg = model_data["model_cfg"] - if state_dict is None and model_data is not None: - state_dict = model_data["model_state_dict"] - - if "class_name" in cfg: - del cfg["class_name"] - - return cfg, state_dict - - # check again - - @classmethod - def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = TorchModel._load_cfg_state_dict( - file_path, cfg, state_dict) - - model = cls(**cfg) - if state_dict is not None: - # remove the lstm layers from the state_dict - # because the lstm are changed to lstm with film - state_dict = ODict( - [(k, v) for k, v in 
state_dict.items() - if not k.startswith("lstm")]) - # initialize the lstm with film with the pretrained lstm - initialize_lstm_with_film( - model.transducer.predictor.rnn, [(k, v) for k, v in state_dict.items() if k.startswith("lstm")]) - - # load the state_dict - model.load_state_dict(state_dict, strict=False) - return model \ No newline at end of file From 2b61053496a9034c6ede99c4fac00d2e9472fc39 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 17 May 2023 19:51:58 +0000 Subject: [PATCH 26/89] update fine-tune and decoding scripts for rnnt_film_transducer --- ...2base_rnnt_film_k2_pruned_stage2_v1.0.yaml | 75 +++++ egs/commonvoice/v1/run_031_inference_film.sh | 49 ++++ .../decode_wav2vec2rnn_film_transducer.sh | 82 ++++++ .../bin/decode_wav2vec2rnn_film_transducer.py | 167 +++++++++++ .../finetune_wav2vec2rnn_film_transducer.py | 261 ++++++++++++++++++ .../models/transducer/rnn_film_transducer.py | 3 +- .../hf_wav2rnn_film_transducer.py | 9 +- .../narchs/rnn_film_transducer_decoder.py | 12 +- 8 files changed, 648 insertions(+), 10 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml create mode 100755 egs/commonvoice/v1/run_031_inference_film.sh create mode 100755 hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh create mode 100755 hyperion/bin/decode_wav2vec2rnn_film_transducer.py create mode 100755 hyperion/bin/finetune_wav2vec2rnn_film_transducer.py diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml new file mode 100644 index 00000000..a867f12a --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.5 + data_loader: + num_workers: 1 +model: + transducer: + decoder: + prune_range: 15 +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + diff --git a/egs/commonvoice/v1/run_031_inference_film.sh b/egs/commonvoice/v1/run_031_inference_film.sh new file mode 100755 index 00000000..7b796107 --- /dev/null +++ b/egs/commonvoice/v1/run_031_inference_film.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +config_file=default_config.sh +use_gpu=false +nnet_stage=1 +. parse_options.sh || exit 1; +. $config_file + +if [ "$use_gpu" == "true" ];then + transducer_args="--use-gpu true" + transducer_cmd="$cuda_eval_cmd --mem 6G" +else + transducer_cmd="$train_cmd --mem 12G" +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +fi + +transducer_dir=exp/transducer/$nnet_name + + +# test_data=test_clean + + +# Extracts x-vectors for evaluation +for name in $test_data +do + nj=40 + steps_transducer/decode_wav2vec2rnn_film_transducer.sh \ + --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ + $nnet data/$name \ + $transducer_dir/$name $bpe_model data/$nnet_data/langs +done + diff --git a/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh b/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh new file mode 100755 index 00000000..ebd6398d --- /dev/null +++ b/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +nj=30 +cmd="run.pl" +set -e +use_gpu=false +#write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +extra_args="" +infer_cfg=conf/infer.yaml +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 5 ] && [ $# != 6 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --infer-cfg # decoding configuration" + echo " --use-gpu # If true, use GPU." + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +bpe_model=$4 +lang_file=$5 + +for f in $data_dir/wav.scp ; do + [ ! 
-f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + extra_args="${extra_args} --use-gpu" +fi + +# if [ "$write_utt2num_frames" == "true" ];then +# write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +# fi + +if [ $stage -le 0 ];then + $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + decode_wav2vec2rnn_film_transducer.py \ + --infer-args $infer_cfg \ + --part-idx JOB --num-parts $nj \ + --input $data_dir/wav.scp \ + --lang_input $data_dir/utt2lang \ + --model-path $nnet_file \ + --bpe-model $bpe_model \ + --lang-file $lang_file \ + --output $output_dir/transducer.JOB.text $extra_args +fi + +if [ $stage -le 1 ];then + echo "compute wer" + cat $output_dir/transducer.*.text > $output_dir/transducer.text + + python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text + python steps_transducer/word2char.py $data_dir/text $data_dir/text_char + + python steps_transducer/word2bpe.py $output_dir/transducer.text $output_dir/transducer_bpe.text $bpe_model + python steps_transducer/word2bpe.py $data_dir/text $data_dir/text_bpe $bpe_model + + # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text > $output_dir/wer + # compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char + # compute-wer --text --mode=present ark:$data_dir/text_bpe ark:$output_dir/transducer_bpe.text > $output_dir/wer_bpe + +fi diff --git a/hyperion/bin/decode_wav2vec2rnn_film_transducer.py b/hyperion/bin/decode_wav2vec2rnn_film_transducer.py new file mode 100755 index 00000000..17cb0c3f --- /dev/null +++ b/hyperion/bin/decode_wav2vec2rnn_film_transducer.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time +from typing import Dict, List, Tuple + +import numpy as np +import pandas as pd +import sentencepiece as spm +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.np.augment import SpeechAugment +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.models import HFWav2Vec2RNNFiLMTransducer +from hyperion.torch.models.wav2transducer.beam_search import (beam_search, + greedy_search) +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info +from hyperion.utils.class_info import ClassInfo +from hyperion.utils.segment_set import SegmentSet +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("transducer-film-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def decode_transducer(input_spec, lang_input_spec, output_spec, 
scp_sep, model_path, bpe_model, lang_file, + infer_args, use_gpu, **kwargs): + + device = init_device(use_gpu) + model = load_model(model_path, device) + + # load language dict form langfile by row number + lang_info = ClassInfo.load(lang_file) + utt2lang = SegmentSet.load(lang_input_spec) + + + logging.info("bpe-model=%s", bpe_model) + sp = spm.SentencePieceProcessor() + sp.load(bpe_model) + + infer_args = HFWav2Vec2RNNFiLMTransducer.filter_infer_args(**infer_args) + logging.info(f"infer-args={infer_args}") + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output: %s", output_spec) + with open(output_spec, "w") as writer: + logging.info(f"opening input stream: {input_spec} with args={ar_args}") + with AR(input_spec, **ar_args) as reader: + while not reader.eof(): + t1 = time.time() + key, x, fs = reader.read(1) + lang = utt2lang.loc[key, "class_id"] + lang_id = torch.tensor([lang_info.loc[lang, "class_idx"]]).to(torch.int64) + if len(key) == 0: + break + + x, key, fs = x[0], key[0], fs[0] + t2 = time.time() + logging.info("processing utt %s", key) + with torch.no_grad(): + x = torch.tensor( + x[None, :], dtype=torch.get_default_dtype()).to(device) + + tot_frames = x.shape[1] + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, + ) + + if x.shape[1] == 0: + y = [""] + else: + #y = decode_one_batch(model=model, sp=sp, x=x) + x_lengths = torch.tensor((x.shape[1], ), + dtype=torch.long, + device=device) + + y = model.infer(x=x, x_lengths=x_lengths, languageid=lang_id, **infer_args) + + y = sp.decode(y[0]) + logging.info(f"utt: {key} hyps: {y}") + t3 = time.time() + writer.write(f"{key} {y}\n") + + t4 = time.time() + tot_time = t4 - t1 + infer_time = t3 - t2 + logging.info( + ("utt %s total-time=%.3f read-time=%.3f " + "infer-time=%.3f " + "write-time=%.3f " + "infer-rt-factor=%.2f tot-rt-factor=%.2f"), + key, + tot_time, + t2 - t1, + infer_time, + t4 - t3, + x.shape[1] / fs / infer_time, + x.shape[1] / fs / tot_time, + ) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description=("ASR decoding for RNN-T with Wav2vec features")) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--lang_input", dest="lang_input_spec", required=True) + parser.add_argument("--scp-sep", + default=" ", + help=("scp file field separator")) + + AR.add_class_args(parser) + parser.add_argument("--model-path", required=True) + parser.add_argument("--bpe-model", required=True) + parser.add_argument("--lang-file", required=True) + + HFWav2Vec2RNNFiLMTransducer.add_infer_args(parser, "infer-args") + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--use-gpu", + default=False, + action="store_true", + help="extract xvectors in gpu") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + decode_transducer(**namespace_to_dict(args)) diff --git a/hyperion/bin/finetune_wav2vec2rnn_film_transducer.py b/hyperion/bin/finetune_wav2vec2rnn_film_transducer.py new file mode 100755 index 00000000..5ff51348 --- /dev/null +++ b/hyperion/bin/finetune_wav2vec2rnn_film_transducer.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2RNNFiLMTransducer) +from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + +model_dict = { + "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, + "hf_wav2vec2rnn_filmed_transducer": HFWav2Vec2RNNFiLMTransducer, + "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, + # "hf_hubert2rnn_transducer": HFWav2Vec2RNNTransducer, + # "hf_hubert2rnn_rnn_transducer": Hubert2RNNRNNTransducer, + # "hf_wavlm2rnn_transducer": HFHubert2RNNTransducer, + # "hf_wavlm2rnn_rnn_transducer": HFWavLM2RNNRNNTransducer, +} + + +def transducer_collate(batch): + audio = [] + audio_length = [] + target = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + # FiLM: add language ID to the input + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + "language": language, + } + return batch + + + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate) + return data_loader + + +def init_model(in_model_file, rank, model_class, **kwargs): + model_args = model_class.filter_finetune_args(**kwargs["model"]) + # model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network ft 
args={}".format(model_args)) + model = TML.load(in_model_file) + model.change_config(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + + + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model(**kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + parser.link_arguments( + "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" + ) + + + parser.add_argument("--in-model-file", required=True) + model_class.add_finetune_args(parser, prefix="model") + # model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser(description="Fine-tune Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = 
Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/torch/models/transducer/rnn_film_transducer.py b/hyperion/torch/models/transducer/rnn_film_transducer.py index 444c4521..68066442 100644 --- a/hyperion/torch/models/transducer/rnn_film_transducer.py +++ b/hyperion/torch/models/transducer/rnn_film_transducer.py @@ -105,6 +105,7 @@ def infer(self, Args: x: input features with shape = (N, T, C) x_lengths: feature number for frames with shape = (N,) + lang: language id for each utterance with shape = (N,) decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. max_sym_per_utt: maximimum number of symbols in a single utterance. @@ -251,4 +252,4 @@ def add_infer_args(parser, prefix=None): @staticmethod def filter_infer_args(**kwargs): - return filter_func_args(RNNTransducer.infer, kwargs) + return filter_func_args(RNNFiLMTransducer.infer, kwargs) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py index 7f6b9ba7..dc28abb7 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -179,6 +179,7 @@ def forward( x: input features tensor with shape=(batch, in_feats, time) x_lengths: time lengths of the features with shape=(batch,) y: target classes torch.long tensor with shape=(batch,) + languageid: language id torch.long tensor with shape=(batch,) return_feat_layers: list of integers indicating, which wav2vec layers we should return. If None, no wav2vec layers are returned. return_enc_layers: list of integers indicating, which encoder layers @@ -208,7 +209,7 @@ def forward( def infer(self, x: torch.Tensor, x_lengths: torch.Tensor, - langugeid: torch.Tensor, + languageid: torch.Tensor, decoding_method="time_sync_beam_search", beam_width: int = 5, max_sym_per_frame: int = 3, @@ -218,20 +219,22 @@ def infer(self, Args: x: input features with shape = (N, T, C) x_lengths: feature number for frames with shape = (N,) + languageid: language id torch.long tensor with shape=(batch,) decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. max_sym_per_utt: maximimum number of symbols in a single utterance. Returns: List of list of integer indexes of the recognizer's symbols. 
""" - + # import pdb; pdb.set_trace() + languageid = languageid[0] feats, _, feat_lengths = self.forward_feats(x, x_lengths, languageid) feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) y = self.transducer.infer(feats, feat_lengths, - langugeid, + languageid, decoding_method=decoding_method, beam_width=beam_width, max_sym_per_frame=max_sym_per_frame, diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 9f42a09c..91a30caf 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -356,7 +356,7 @@ def decode_greedy(self, blank_id = self.blank_id device = x.device - sos = torch.Tensor([blank_id], device=device, + sos = torch.tensor([blank_id], device=device, dtype=torch.int64).reshape(1, 1) pred_out, (h, c) = self.predictor(sos, lang_embedding) T = x.size(1) @@ -399,7 +399,7 @@ def decode_time_sync_beam_search(self, blank_id = self.blank_id device = x.device - sos = torch.Tensor([blank_id], device=device).reshape(1, 1) + sos = torch.tensor([blank_id], device=device).reshape(1, 1) pred_out, (h, c) = self.predictor(sos, lang_embedding) T = x.size(1) t = 0 @@ -424,12 +424,12 @@ def decode_time_sync_beam_search(self, cached_key = "_".join(map(str, y_star.ys)) if cached_key not in cache: - pred_in = torch.Tensor([y_star.ys[-1]], + pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) pred_out, pred_state = self.predictor( pred_in, - lang, + lang_embedding, y_star.pred_state, ) cache[cached_key] = (pred_out, pred_state) @@ -523,7 +523,7 @@ def decode_align_length_sync_beam_search( blank_id = self.blank_id device = x.device - sos = torch.Tensor([blank_id], device=device).reshape(1, 1) + sos = torch.tensor([blank_id], device=device).reshape(1, 1) pred_out, (h, c) = self.predictor(sos, lang_embedding) T = x.size(1) #t = 0 @@ -552,7 +552,7 @@ def decode_align_length_sync_beam_search( cached_key = "_".join(map(str, y_star.ys)) if cached_key not in cache: - pred_in = torch.Tensor([y_star.ys[-1]], + pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) pred_out, pred_state = self.predictor( From ca5327a9d1ae66b54340bc1c372ac670284236ef Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 18 May 2023 00:23:40 +0000 Subject: [PATCH 27/89] update language id trainer to use chunk for training --- egs/commonvoice/v1/conf/infer.yaml | 2 + ...c2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml | 65 +++++++++++++++++++ .../wav2vec2xlsr300m_ecapatdnn1024x3.yaml | 43 ++++++++++++ .../v1/global_conf/config_lid_v4.2_13langs.sh | 43 ++++++++++++ egs/commonvoice/v1/run_015_train_film_asr.sh | 10 +-- hyperion/torch/trainers/languageid_trainer.py | 12 ++-- hyperion/torch/trainers/torch_trainer.py | 1 + 7 files changed, 167 insertions(+), 9 deletions(-) create mode 100644 egs/commonvoice/v1/conf/infer.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh diff --git a/egs/commonvoice/v1/conf/infer.yaml b/egs/commonvoice/v1/conf/infer.yaml new file mode 100644 index 00000000..1f0ebfa7 --- /dev/null +++ b/egs/commonvoice/v1/conf/infer.yaml @@ -0,0 +1,2 @@ +beam_width: 5 +decoding_method: time_sync_beam_search \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml 
b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml new file mode 100644 index 00000000..12b8c371 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.15 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 4200 + #hold_steps: 1500 + decay_steps: 16000 + hold_steps: 32000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3.yaml new file mode 100644 index 00000000..08964a38 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3.yaml @@ -0,0 +1,43 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. 
+ dropout_rate: 0.3 + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh new file mode 100644 index 00000000..49721635 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh @@ -0,0 +1,43 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_resnet1d_v4.2_13_langs +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.2.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v2_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/run_015_train_film_asr.sh b/egs/commonvoice/v1/run_015_train_film_asr.sh index ba1197a8..fbf30558 100755 --- a/egs/commonvoice/v1/run_015_train_film_asr.sh +++ b/egs/commonvoice/v1/run_015_train_film_asr.sh @@ -14,13 +14,12 @@ set -e #module load cuda/11.6.0 #ml #nvidia-smi -#export CUDA_VISIBLE_DEVICES=0,1,2,3 +# export CUDA_VISIBLE_DEVICES=0,1,2,3 #export CONV_RSH=ssh #export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH -# export CUDA_VISIBLE_DEVICES=0,1 stage=1 -ngpu=1 +ngpu=2 config_file=default_config.sh interactive=false num_workers="" @@ -76,6 +75,7 @@ if [ $stage -le 1 ]; then --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ --in-model-file $nnet_rnn_transducer \ + --master-port 1237 \ --num-gpus $ngpu fi @@ -90,7 +90,7 @@ if [ $stage -le 2 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - finetune_wav2vec2rnn_transducer.py $nnet_type \ + finetune_wav2vec2rnn_film_transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ @@ -107,8 +107,8 @@ if [ $stage -le 2 ]; then --in-model-file $nnet_s1 \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1237 \ --num-gpus $ngpu - # --master-port 1236 \ fi diff --git a/hyperion/torch/trainers/languageid_trainer.py 
b/hyperion/torch/trainers/languageid_trainer.py index 0770cb8f..add56c1e 100644 --- a/hyperion/torch/trainers/languageid_trainer.py +++ b/hyperion/torch/trainers/languageid_trainer.py @@ -93,7 +93,7 @@ def train_epoch(self, data_loader): data_loader: pytorch data loader returning features and class labels. """ batch_keys = [ - self.input_key, f"{self.input_key}_lengths", self.target_key + self.input_key, self.target_key ] self.model.update_loss_margin(self.cur_epoch) @@ -107,8 +107,10 @@ def train_epoch(self, data_loader): if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - input_data, input_lengths, target = tensors_subset( + input_data, target = tensors_subset( data, batch_keys, self.device) + # input_data, input_lengths, target = tensors_subset( + # data, batch_keys, self.device) batch_size = input_data.shape[0] with self.amp_autocast(): @@ -152,7 +154,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): sw_update_bn: wheter or not, update batch-norm layers in SWA. """ batch_keys = [ - self.input_key, f"{self.input_key}_lengths", self.target_key + self.input_key, self.target_key ] metric_acc = MetricAcc(self.device) batch_metrics = ODict() @@ -165,8 +167,10 @@ def validation_epoch(self, data_loader, swa_update_bn=False): self.model.eval() for batch, data in enumerate(data_loader): - input_data, input_lengths, target = tensors_subset( + input_data, target = tensors_subset( data, batch_keys, self.device) + # input_data, input_lengths, target = tensors_subset( + # data, batch_keys, self.device) batch_size = input_data.shape[0] # data, target = data.to(self.device), target.to(self.device) # batch_size = data.shape[0] diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 00a218f9..7ae7c50e 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -626,6 +626,7 @@ def load_last_checkpoint(self): for epoch in range(self.epochs, 0, -1): file_path = "%s/model_ep%04d.pth" % (self.exp_path, epoch) if os.path.isfile(file_path): + logging.info("Loading checkpoint %s" % file_path) return self.load_checkpoint(file_path) return None From 27d579cb7247bba1983cd4abac42e836e796355a Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Sat, 20 May 2023 17:42:13 -0400 Subject: [PATCH 28/89] sre21 16k recipe finished --- egs/sre21-av-a/v1.16k/README.md | 51 ++- .../v1.16k/conf/lresnet34_lid_v1.yaml | 59 --- ...rain_ecapatdnn2048x4_xvec_stage1_v1.0.yaml | 2 + ...rain_ecapatdnn2048x4_xvec_stage2_v1.0.yaml | 2 +- .../v1.16k/conf/train_lresnet34_lid_v1.yaml | 78 ++++ ...train_res2net50w26s4_xvec_stage1_v1.0.yaml | 80 ++++ ...train_res2net50w26s4_xvec_stage2_v1.0.yaml | 66 +++ ...train_res2net50w26s8_xvec_stage1_v1.0.yaml | 82 ++++ ...train_res2net50w26s8_xvec_stage2_v1.0.yaml | 66 +++ ...in_tseres2net50w26s4_xvec_stage1_v1.0.yaml | 83 ++++ ...in_tseres2net50w26s4_xvec_stage2_v1.0.yaml | 66 +++ ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 2 +- ...cs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh | 32 +- ...cs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh | 67 ---- ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 71 +--- ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 25 +- ...et50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh | 0 ...cs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh | 49 +++ ...et50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh | 0 .../v1.16k/run_011_train_xvector.sh | 1 - .../v1.16k/run_012_finetune_xvector.sh | 61 --- egs/sre21-av-a/v1.16k/run_014_train_lid.sh | 34 +- egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh 
| 2 +- ...rain_ecapatdnn2048x4_xvec_stage1_v1.0.yaml | 104 +++++ ...rain_ecapatdnn2048x4_xvec_stage2_v1.0.yaml | 66 +++ ...train_res2net50w26s8_xvec_stage1_v1.0.yaml | 82 ++++ ...train_res2net50w26s8_xvec_stage2_v1.0.yaml | 65 +++ ...in_tseres2net50w26s4_xvec_stage1_v1.0.yaml | 83 ++++ ...in_tseres2net50w26s4_xvec_stage2_v1.0.yaml | 66 +++ egs/sre21-av-a/v1.8k/default_config.sh | 2 +- ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 50 +-- ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 48 +++ ...statsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh | 68 ---- ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 58 +++ ...statsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh | 76 ---- egs/sre21-av-a/v1.8k/run_011_train_xvector.sh | 54 ++- .../v1.8k/run_012_finetune_xvector.sh | 61 --- egs/voxceleb/v1.1/run_002_compute_evad.sh | 1 - egs/voxceleb/v1.2/run_001_prepare_data.sh | 14 +- hyp_utils/create_data_link.pl | 132 ++++++ hyp_utils/create_data_split_dirs.sh | 46 +++ hyp_utils/create_data_split_links.sh | 23 ++ hyp_utils/create_split_dir.pl | 92 +++++ ...l_xvec_cosine_scoring_from_adv_test_wav.py | 8 +- ...osine_scoring_from_adv_test_wav_wavegan.py | 14 +- ...l_xvec_cosine_scoring_from_art_test_wav.py | 2 +- .../eval_xvec_cosine_scoring_from_test_wav.py | 4 +- ...sine_scoring_from_transfer_adv_test_wav.py | 2 +- ...sine_scoring_from_transfer_art_test_wav.py | 3 +- hyperion/bin/eval_xvec_logits_from_wav.py | 48 +-- hyperion/bin/extract_xvectors_from_wav.py | 2 +- .../bin/extract_xvectors_slidwin_from_wav.py | 14 +- hyperion/bin/finetune_xvector_from_wav.py | 77 +--- hyperion/data_prep/__init__.py | 2 +- hyperion/data_prep/voxceleb1.py | 338 ++++++++++++++++ hyperion/data_prep/voxceleb2.py | 12 +- hyperion/np/classifiers/__init__.py | 2 +- hyperion/torch/layers/global_pool.py | 7 +- hyperion/torch/models/xvectors/xvector.py | 6 +- hyperion/torch/narchs/audio_feats_mvn.py | 9 +- hyperion/torch/narchs/classif_head.py | 2 +- hyperion/torch/narchs/dc1d_decoder.py | 4 +- hyperion/torch/narchs/dc1d_encoder.py | 4 +- hyperion/torch/narchs/dc2d_decoder.py | 4 +- hyperion/torch/narchs/dc2d_encoder.py | 4 +- hyperion/torch/narchs/fcnet.py | 2 +- hyperion/torch/narchs/resnet.py | 16 +- hyperion/torch/narchs/resnet1d_decoder.py | 14 +- hyperion/torch/narchs/resnet1d_encoder.py | 20 +- hyperion/torch/narchs/resnet2d_decoder.py | 14 +- hyperion/torch/narchs/resnet2d_encoder.py | 19 +- hyperion/torch/narchs/resnet_factory.py | 4 +- hyperion/torch/narchs/spinenet.py | 16 +- hyperion/torch/narchs/spinenet_factory.py | 4 +- hyperion/torch/narchs/tdnn_factory.py | 4 +- .../torch/narchs/transformer_encoder_v1.py | 4 +- .../trainers/xvector_trainer_from_wav.py | 8 +- hyperion/torch/utils/masking.py | 7 +- hyperion/utils/__init__.py | 2 + hyperion/utils/dataset.py | 379 +++++++++++++++--- hyperion/utils/enrollment_map.py | 86 ++++ hyperion/utils/info_table.py | 12 +- hyperion/utils/segment_set.py | 4 + hyperion/utils/sparse_trial_key.py | 58 +++ hyperion/utils/trial_key.py | 82 +++- 85 files changed, 2625 insertions(+), 868 deletions(-) delete mode 100644 egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml create mode 
100644 egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml delete mode 100644 egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh rename egs/sre21-av-a/v1.16k/global_conf/{ => deprecated}/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh (100%) create mode 100644 egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh rename egs/sre21-av-a/v1.16k/global_conf/{ => deprecated}/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh (100%) delete mode 100755 egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh create mode 100644 egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh delete mode 100644 egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh create mode 100644 egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh delete mode 100644 egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh delete mode 100755 egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh create mode 100755 hyp_utils/create_data_link.pl create mode 100755 hyp_utils/create_data_split_dirs.sh create mode 100755 hyp_utils/create_data_split_links.sh create mode 100755 hyp_utils/create_split_dir.pl create mode 100644 hyperion/data_prep/voxceleb1.py create mode 100644 hyperion/utils/enrollment_map.py diff --git a/egs/sre21-av-a/v1.16k/README.md b/egs/sre21-av-a/v1.16k/README.md index e35577d7..0f5d09ad 100644 --- a/egs/sre21-av-a/v1.16k/README.md +++ b/egs/sre21-av-a/v1.16k/README.md @@ -88,8 +88,6 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs - `run_011_train_xvector.sh` - Trains the x-vector network on 4sec chunks - - - `run_012_finetune_xvector.sh` - Fine-tune x-vector network on 10-15 secs utts - `run_013_prepare_langid_train_data.sh` @@ -110,8 +108,8 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs - `run_040_eval_be_v1.sh, run_041_eval_be_v2.sh, run_042_eval_be_v3.sh, run_042b_eval_be_v3.sh` - Evals different back-end versions: - V1: Back-end trained on all data without adaptation - - V2: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and langauge, global PLDA adapted to SRE-Vox-CHN - - V3: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and langauge, source dependent PLDA adapted to SRE-CHN or Vox-CHN + - V2: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and language, global PLDA adapted to SRE-Vox-CHN + - V3: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and language, source dependent PLDA adapted to SRE-CHN 
or Vox-CHN - V3b: V3 with hyperparameters tuned for x-vectors trained on VoxCeleb only - `run_fus*.sh` @@ -120,4 +118,47 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs ## Results -TODO +The back-end used for these results is: +- back-end V2 (run_041_eval_be_v2.sh) +- Without S-Norm +- Scores are calibrated as indicated in the paper. + +## SRE16 Eval40% YUE + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.57 | 0.135 | 0.237 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.23 | 0.136 | 0.187 | +| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.38 | 0.147 | 0.189 | + +## SRE-CTS Superset dev set + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.37 | 0.076 | 0.106 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.19 | 0.64 | 0.089 | +| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.15 | 0.61 | 0.102 | + +## SRE21 Audio Dev (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 5.91 | 0.393 | 0.409 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 5.22 | 0.370 | 0.377 | +| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.79 | 0.309 | 0.325 | + +## SRE21 Audio Eval (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 5.68 | 0.395 | 0.401 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.92 | 0.405 | 0.412 | +| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.80 | 0.357 | 0.360 | diff --git a/egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml b/egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml deleted file mode 100644 index 5451702f..00000000 --- a/egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml +++ /dev/null @@ -1,59 +0,0 @@ -min_chunk_length: 4.0 -max_chunk_length: 4.0 -return_fullseqs: false -wav_scale: 32767 -batch_size: 512 -var_batch_size: false -iters_per_epoch: 6.0 -train_aug_cfg: conf/reverb_noise_aug.yaml -val_aug_cfg: conf/reverb_noise_aug.yaml -feats: fbank64_stmn_nb_16k.yaml -pool_net: - pool_type: ch-wise-att-mean+stddev - inner_feats: 32 -embed_dim: 32 -num_embed_layers: 1 -hid_act: relu6 -loss_type: arc-softmax -s: 30.0 -margin: 0.3 -margin_warmup_epochs: 30.0 -dropout_rate: 0.0 -in_feats: 64 -resnet_type: lresnet34 -in_channels: 1 -conv_channels: 64 -base_channels: 64 -in_kernel_size: 3 -in_stride: 1 -in_norm: false -no_maxpool: true -optim: - opt_type: adam - lr: 0.02 - # lr: 0.01 - beta1: 0.9 - beta2: 0.95 - amsgrad: true - weight_decay: 1e-5 -lrsched: - lrsch_type: exp_lr - decay_rate: 0.5 - decay_steps: 8000 - hold_steps: 10000 - min_lr: 1.0e-05 - warmup_steps: 1000 - update_lr_on_opt_step: true -grad_acc_steps: 1 -epochs: 70 -log_interval: 100 -use_tensorboard: false -use_wandb: false -wandb: - mode: online -ddp_type: ddp -use_amp: true -swa_start: 0 -swa_lr: 0.001 -swa_anneal_epochs: 10 -num_gpus: 4 diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml index 01cfa082..d68ea26e 100644 --- a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -68,6 +68,7 @@ model: multilayer_concat: true endpoint_channels: 8192 dropout_rate: 0.0 + hid_act: relu6 pool_net: pool_type: ch-wise-att-mean+stddev inner_feats: 128 @@ -76,6 +77,7 @@ model: margin: 0.3 margin_warmup_epochs: 20.0 dropout_rate: 0.0 + hid_act: relu6 trainer: optim: opt_type: adam diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml index 24b1c081..e7f9969b 100644 --- a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -46,7 +46,7 @@ model: trainer: optim: opt_type: sgd - lr: 1e-3 + lr: 0.01 momentum: 0.9 weight_decay: 1e-5 lrsched: diff --git a/egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml b/egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml new file mode 100644 index 00000000..c46365db --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml @@ -0,0 +1,78 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + data_loader: + num_workers: 8 +feats: 
fbank64_stmn_nb_16k.yaml +model: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 64 + conv_channels: 64 + in_kernel_size: 3 + in_stride: 1 + in_norm: false + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 32 + embed_dim: 32 + num_embed_layers: 1 + hid_act: relu6 + loss_type: arc-softmax + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 30.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.02 + beta1: 0.9 + beta2: 0.95 + amsgrad: true + weight_decay: 1e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 10000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + epochs: 70 + log_interval: 100 + use_amp: true + diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..7a9234b6 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml @@ -0,0 +1,80 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + pool_net: + pool_type: mean+stddev + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.05 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 50 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..9884bb4c --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + 
min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 21 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..4c427202 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml @@ -0,0 +1,82 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..f34b4896 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + 
seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..10607607 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml @@ -0,0 +1,83 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: tseres2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + se_r: 256 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..f34b4896 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 
+ data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index 1b7c3764..1da68697 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -27,7 +27,7 @@ nnet_num_epochs=75 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth -nnet=$nnet_dir/model_ep0004.pth + # xvector full net finetuning with out-of-domain ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh index 1903369e..6d14f27d 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh @@ -1,4 +1,4 @@ -# LResNet34 x-vector with mixed precision training +# Res2Net50 w26s4 x-vector with mixed precision training # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,50 +9,33 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=24 eff_batch_size=512 # effective batch size -ipe=1 min_chunk=4 max_chunk=4 lr=0.05 -nnet_type=res2net50 +nnet_type=resnet dropout=0 embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 s=30 margin_warmup=20 margin=0.3 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 50 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1.$nnet_data -nnet_num_epochs=60 +nnet_base_cfg=conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_res2net50w26s4_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1.$nnet_data nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0071.pth - +nnet=$nnet_dir/model_ep0061.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # 
effective batch size ft_min_chunk=10 ft_max_chunk=15 -ft_ipe=1 ft_lr=0.01 -ft_nnet_num_epochs=21 ft_margin=0.5 -ft_margin_warmup=5 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0021.pth @@ -61,7 +44,4 @@ ft_nnet=$ft_nnet_dir/model_ep0021.pth plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh deleted file mode 100644 index 344e1288..00000000 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh +++ /dev/null @@ -1,67 +0,0 @@ -# LResNet34 x-vector with mixed precision training - -# acoustic features -feat_config=conf/fbank80_stmn_16k.yaml -feat_type=fbank80_stmn - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxcelebcat -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=24 -eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 -lr=0.02 - -nnet_type=res2net50 -dropout=0 -embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 - -s=30 -margin_warmup=20 -margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 50 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1.$nnet_data -nnet_num_epochs=60 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth -#nnet=$nnet_dir/swa_model_ep0061.pth - -# xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 -ft_eff_batch_size=128 # effective batch size -ft_min_chunk=10 -ft_max_chunk=10 -ft_ipe=1 -ft_lr=0.01 -ft_nnet_num_epochs=15 -ft_margin=0.5 -ft_margin_warmup=3 - -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 -ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name -ft_nnet=$ft_nnet_dir/model_ep0007.pth - -# back-end 
-plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index cae32b57..0b62008e 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,103 +9,40 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=1 min_chunk=4 max_chunk=4 lr=0.02 -nnet_type=res2net50 +nnet_type=resnet dropout=0 embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 s=30 margin_warmup=20 margin=0.3 attstats_inner=128 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 +nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_res2net50w26s8_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=10 -ft_ipe=1 ft_lr=0.01 -ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth - -# xvector last-layer finetuning in-domain -reg_layers_classif=0 -reg_layers_enc="0 1 2 3 4" -nnet_adapt_data=voxcelebcat_sre_alllangs_mixfs_chnspks - -# ft2_batch_size_1gpu=4 -# ft2_eff_batch_size=128 # effective batch size -# ft2_ipe=4 -# ft2_lr=0.01 -# ft2_nnet_num_epochs=12 -# ft2_margin_warmup=3 -# ft2_reg_weight_embed=0.1 -# ft2_min_chunk=10 -# ft2_max_chunk=60 - -# ft2_opt_opt="--optim.opt-type sgd --optim.lr $ft2_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 
--use-amp --var-batch-size" -# ft2_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -# ft2_nnet_name=${ft_nnet_name}.ft_eaffine_rege_w${ft2_reg_weigth_embed}_${ft2_min_chunk}_${ft2_max_chunk}_sgdcos_lr${ft2_lr}_b${ft2_eff_batch_size}_amp.v2 -# ft2_nnet_dir=exp/xvector_nnets/$ft2_nnet_name -# ft2_nnet=$ft2_nnet_dir/model_ep0010.pth - - -# xvector full nnet finetuning -ft2_batch_size_1gpu=6 -ft2_eff_batch_size=128 # effective batch size -ft2_ipe=1 -ft2_lr=0.01 -ft2_nnet_num_epochs=15 -ft2_margin=0.5 -ft2_margin_warmup=3 -ft2_reg_weight_embed=0.1 -ft2_reg_weight_enc=0.1 -ft2_min_chunk=10 -ft2_max_chunk=10 - -ft2_opt_opt="--optim.opt-type sgd --optim.lr $ft2_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft2_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft2_nnet_name=${ft_nnet_name}.ft_reg_wenc${ft2_reg_weight_enc}_we${ft2_reg_weight_embed}_${ft2_min_chunk}_${ft2_max_chunk}_sgdcos_lr${ft2_lr}_b${ft2_eff_batch_size}_amp.v1 -ft2_nnet_dir=exp/xvector_nnets/$ft2_nnet_name -ft2_nnet=$ft2_nnet_dir/model_ep0012.pth - - # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index 96475c53..a57f16d9 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,21 +9,15 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=24 eff_batch_size=512 # effective batch size -ipe=1 min_chunk=4 max_chunk=4 lr=0.02 -nnet_type=tseres2net50 +nnet_type=resnet dropout=0 embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 se_r=256 s=30 @@ -31,13 +25,8 @@ margin_warmup=20 margin=0.3 attstats_inner=128 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 +nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml 
+nnet_name=${feat_type}_tseres2net50w26s4_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0075.pth nnet=$nnet_dir/swa_model_ep0076.pth @@ -49,12 +38,9 @@ ft_min_chunk=10 ft_max_chunk=15 ft_ipe=1 ft_lr=0.01 -ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth @@ -69,7 +55,4 @@ else plda_data=voxceleb2cat_train_augx${plda_num_augs} fi plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh similarity index 100% rename from egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh rename to egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh diff --git a/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh new file mode 100644 index 00000000..b5863308 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh @@ -0,0 +1,49 @@ +# Res2Net50 w26s8 x-vector with mixed precision training + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxcelebcat + +eff_batch_size=512 # effective batch size +min_chunk=4 +max_chunk=4 +lr=0.02 + +nnet_type=resnet +dropout=0 +embed_dim=256 + +s=30 +margin_warmup=20 +margin=0.3 +attstats_inner=128 + +nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml +nnet_args="--model.pool_net.pool-type mean+stddev" +nnet_name=${feat_type}_res2net50w26s8_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0070.pth +#nnet=$nnet_dir/swa_model_ep0076.pth + +# xvector full net finetuning with out-of-domain +ft_eff_batch_size=128 # effective batch size +ft_min_chunk=10 +ft_max_chunk=10 +ft_lr=0.01 +ft_margin=0.5 + +ft_nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml +ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 +ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name +ft_nnet=$ft_nnet_dir/model_ep0007.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +plda_type=splda diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh 
b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh similarity index 100% rename from egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh rename to egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh diff --git a/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh b/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh index 7f405952..d7ea8ed0 100755 --- a/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh +++ b/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh @@ -19,7 +19,6 @@ num_workers="" list_dir=data/${nnet_data}_proc_audio_no_sil -args="" if [ -n "$num_workers" ];then extra_args="--data.train.data_loader.num-workers $num_workers" fi diff --git a/egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh b/egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh deleted file mode 100755 index 58a3fdc9..00000000 --- a/egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# Copyright -# 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -ngpu=4 -config_file=default_config.sh -resume=false -interactive=false -num_workers=3 - -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -batch_size=$(($ft_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $ft_eff_batch_size $ft_min_chunk $ft_max_chunk | awk '{ print int($2/($1*$4*2/($3+$4))+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${nnet_data}_proc_audio_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi - -if [ "$interactive" == "true" ];then - export cuda_cmd=run.pl -fi - -# Network Training -if [ $stage -le 1 ]; then - mkdir -p $ft_nnet_dir/log - $cuda_cmd --gpu $ngpu $ft_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-finetune-xvec-from-wav.py --feats $feat_config $aug_opt \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --min-chunk-length $ft_min_chunk --max-chunk-length $ft_max_chunk \ - --iters-per-epoch $ft_ipe \ - --batch-size $batch_size \ - --num-workers $num_workers $ft_opt_opt $ft_lrs_opt \ - --grad-acc-steps $grad_acc_steps \ - --epochs $ft_nnet_num_epochs \ - --cos-scale $s --margin $ft_margin --margin-warmup-epochs $ft_margin_warmup \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --in-model-path $nnet \ - --train-mode ft-full \ - --exp-path $ft_nnet_dir $args - -fi - - diff --git a/egs/sre21-av-a/v1.16k/run_014_train_lid.sh b/egs/sre21-av-a/v1.16k/run_014_train_lid.sh index 6251de97..35d2c0bc 100755 --- a/egs/sre21-av-a/v1.16k/run_014_train_lid.sh +++ b/egs/sre21-av-a/v1.16k/run_014_train_lid.sh @@ -10,19 +10,17 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 -lid_ipe=1 +num_workers="" + . parse_options.sh || exit 1; . $config_file . 
datapath.sh list_dir=data/train_lid_proc_audio_no_sil -args="" -if [ "$resume" == "true" ];then - args="--resume" +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$interactive" == "true" ];then @@ -33,22 +31,20 @@ lid_nnet_dir=exp/lid_nnets/lresnet34_lid_v1 # Network Training if [ $stage -le 1 ]; then - train_exec=torch-train-resnet-xvec-from-wav.py mkdir -p $lid_nnet_dir/log $cuda_cmd \ --gpu $ngpu $lid_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - $train_exec --cfg conf/lresnet34_lid_v1.yaml \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_train_lid/train.scp \ - --val-list $list_dir/lists_train_lid/val.scp \ - --class-file $list_dir/lists_train_lid/class2int \ - --iters-per-epoch $lid_ipe \ - --num-workers $num_workers \ - --num-gpus $ngpu \ - --exp-path $lid_nnet_dir $args - + train_xvector_from_wav.py resnet \ + --cfg conf/train_lresnet34_lid_v1.yaml \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_train_lid/train.scp \ + --data.train.dataset.class-file $list_dir/lists_train_lid/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_train_lid/val.scp \ + --trainer.exp-path $lid_nnet_dir $extra_args \ + --num-gpus $ngpu fi -exit diff --git a/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh b/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh index 0941951f..73cb9a3d 100755 --- a/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh +++ b/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh @@ -195,7 +195,7 @@ if [ $stage -le 5 ]; then #SRE superset and 16 echo "SRE Superset Dev" steps_be/eval_be_plda_snorm_v2_cts.sh \ - --cmd "$train_cmd --mem 8G" \ + --cmd "$train_cmd --mem 12G" \ --plda_type $plda_type --ncoh $ncoh --num-parts 100 \ data/sre_cts_superset_16k_dev/trials \ data/sre_cts_superset_16k_dev/utt2enroll \ diff --git a/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..bc311234 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -0,0 +1,104 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + 
res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + multilayer: true + multilayer_concat: true + endpoint_channels: 8192 + dropout_rate: 0.0 + hid_act: relu6 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 30000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..031e9ca3 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..416926d0 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml @@ -0,0 +1,82 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + 
in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 35000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..16203033 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 diff --git a/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..2d74799c --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml @@ -0,0 +1,83 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_type: tseres2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 
+ se_r: 256 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..f34b4896 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.8k/default_config.sh b/egs/sre21-av-a/v1.8k/default_config.sh index 91a20745..74b76b0a 120000 --- a/egs/sre21-av-a/v1.8k/default_config.sh +++ b/egs/sre21-av-a/v1.8k/default_config.sh @@ -1 +1 @@ -global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh \ No newline at end of file +global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh \ No newline at end of file diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index 69ad025b..65c2c924 100644 --- a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,53 +9,19 @@ vad_config=conf/vad_8k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 lr=0.02 nnet_type=resnet1d -block_type=seres2bn # squeeze-excitation res2net 
bottleneck -channels=2048 -ep_channels=8192 -width_factor=1 -scale=8 -se_r=16 dropout=0 -attstats_inner=128 embed_dim=256 s=30 margin_warmup=20 margin=0.3 -nnet_opt="--resnet_enc.in-feats 64 \ - --resnet_enc.in-conv-channels $channels \ - --resnet_enc.in-kernel-size 5 \ - --resnet_enc.in-stride 1 \ - --resnet_enc.resb-type $block_type \ - --resnet_enc.resb-repeats 1 1 1 1 \ - --resnet_enc.resb-channels $channels \ - --resnet_enc.resb-kernel-sizes 3 \ - --resnet_enc.resb-dilations 2 3 4 5 \ - --resnet_enc.resb-strides 1 \ - --resnet_enc.res2net-width-factor $width_factor \ - --resnet_enc.res2net-scale $scale \ - --resnet_enc.se-r $se_r \ - --resnet_enc.multilayer \ - --resnet_enc.multilayer-concat \ - --resnet_enc.endpoint-channels $ep_channels \ - --pool_net.pool-type ch-wise-att-mean+stddev \ - --pool_net.inner-feats $attstats_inner \ - --embed-dim $embed_dim" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 30000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - +nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml nnet_name=${feat_type}_ecapatdnn2048x4_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_num_epochs=75 nnet_dir=exp/xvector_nnets/$nnet_name @@ -63,18 +29,14 @@ nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=15 -ft_ipe=1 ft_lr=0.01 ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth @@ -82,8 +44,10 @@ ft_nnet=$ft_nnet_dir/model_ep0007.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh new file mode 100644 index 00000000..824361d0 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -0,0 +1,48 @@ +# Res2Net50 w26s8 x-vector with mixed precision training + +# acoustic features +feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=voxcelebcat_sre_alllangs_mixfs + +eff_batch_size=512 # effective batch size +min_chunk=4 +max_chunk=4 +lr=0.02 + 
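+# Note (illustrative, not part of the recipe): after this refactor the optimizer,
+# scheduler and architecture options live in the YAML file pointed to by
+# $nnet_base_cfg below; this shell config only keeps the bookkeeping variables
+# used to build experiment names and paths. A rough sketch of how
+# run_011_train_xvector.sh is expected to consume these variables
+# (arguments abbreviated):
+#
+#   . $config_file
+#   train_xvector_from_wav.py $nnet_type --cfg $nnet_base_cfg $nnet_args \
+#       --trainer.exp-path $nnet_dir --num-gpus $ngpu
+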
+nnet_type=resnet +dropout=0 +embed_dim=256 + +s=30 +margin_warmup=20 +margin=0.3 +attstats_inner=128 + +nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_res2net50w26s8_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0070.pth +nnet=$nnet_dir/swa_model_ep0076.pth + +# xvector full net finetuning with out-of-domain +ft_eff_batch_size=128 # effective batch size +ft_min_chunk=10 +ft_max_chunk=10 +ft_lr=0.01 +ft_margin=0.5 + +ft_nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml +ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 +ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name +ft_nnet=$ft_nnet_dir/model_ep0007.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +plda_type=splda diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh deleted file mode 100644 index e1a923d7..00000000 --- a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ /dev/null @@ -1,68 +0,0 @@ -# LResNet34 x-vector with mixed precision training - -# acoustic features -feat_config=conf/fbank64_stmn_8k.yaml -feat_type=fbank64_stmn - -#vad -vad_config=conf/vad_8k.yaml - -# x-vector training -nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=16 -eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 -lr=0.02 - -nnet_type=res2net50 -dropout=0 -embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 - -s=30 -margin_warmup=20 -margin=0.3 -attstats_inner=128 - -nnet_opt="--resnet-type $nnet_type --in-feats 64 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 35000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth -nnet=$nnet_dir/swa_model_ep0076.pth - -# xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 -ft_eff_batch_size=128 # effective batch size -ft_min_chunk=10 -ft_max_chunk=10 -ft_ipe=1 -ft_lr=0.01 -ft_nnet_num_epochs=15 -ft_margin=0.5 -ft_margin_warmup=3 - -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" 
-ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 -ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name -ft_nnet=$ft_nnet_dir/model_ep0007.pth - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh new file mode 100644 index 00000000..58010842 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -0,0 +1,58 @@ +# Time SE Res2Net50 w26s4 x-vector with mixed precision training + +# acoustic features +feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=voxcelebcat_sre_alllangs_mixfs + +eff_batch_size=512 # effective batch size +min_chunk=4 +max_chunk=4 +lr=0.02 + +nnet_type=resnet +dropout=0 +embed_dim=256 +se_r=256 + +s=30 +margin_warmup=20 +margin=0.3 +attstats_inner=128 + +nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_tseres2net50w26s4_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0075.pth +nnet=$nnet_dir/swa_model_ep0076.pth + +# xvector full net finetuning with out-of-domain +ft_batch_size_1gpu=8 +ft_eff_batch_size=128 # effective batch size +ft_min_chunk=10 +ft_max_chunk=10 +ft_ipe=1 +ft_lr=0.01 +ft_margin=0.5 + +ft_nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml +ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 +ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name +ft_nnet=$ft_nnet_dir/model_ep0007.pth + + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda + diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh deleted file mode 100644 index 9f5c8e70..00000000 --- a/egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ /dev/null @@ -1,76 +0,0 @@ -# Time SE Res2Net50 w26s4 x-vector with mixed precision training - -# acoustic features -feat_config=conf/fbank80_stmn_8k.yaml -feat_type=fbank80_stmn - -#vad -vad_config=conf/vad_8k.yaml - -# x-vector training -nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=24 -eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 -lr=0.02 - -nnet_type=tseres2net50 -dropout=0 -embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 -se_r=256 - -s=30 -margin_warmup=20 -margin=0.3 -attstats_inner=128 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r 
--pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0075.pth -nnet=$nnet_dir/swa_model_ep0076.pth - -# xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 -ft_eff_batch_size=128 # effective batch size -ft_min_chunk=10 -ft_max_chunk=15 -ft_ipe=1 -ft_lr=0.01 -ft_nnet_num_epochs=21 -ft_nnet_num_epochs=45 -ft_margin=0.5 -ft_margin_warmup=3 - -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 -ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name -ft_nnet=$ft_nnet_dir/model_ep0014.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh b/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh index 9891e812..d7ea8ed0 100755 --- a/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh +++ b/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh @@ -10,22 +10,17 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 +num_workers="" . parse_options.sh || exit 1; . $config_file . 
datapath.sh -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/${nnet_data}_proc_audio_no_sil -args="" -if [ "$resume" == "true" ];then - args="--resume" +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$interactive" == "true" ];then @@ -35,6 +30,49 @@ fi # Network Training if [ $stage -le 1 ]; then + mkdir -p $nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $nnet_type \ + --cfg $nnet_base_cfg $nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir \ + --num-gpus $ngpu \ + +fi + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + mkdir -p $ft_nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $ft_nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_xvector_from_wav.py $nnet_type \ + --cfg $ft_nnet_base_cfg $ft_nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --in-model-file $nnet \ + --trainer.exp-path $ft_nnet_dir \ + --num-gpus $ngpu \ + +fi +exit + +# Network Training +if [ $stage -le 1 ]; then + if [[ ${nnet_type} =~ resnet1d ]]; then train_exec=torch-train-resnet1d-xvec-from-wav.py elif [[ ${nnet_type} =~ resnet ]] || [[ ${nnet_type} =~ resnext ]] || [[ ${nnet_type} =~ res2net ]] || [[ ${nnet_type} =~ res2next ]]; then diff --git a/egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh b/egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh deleted file mode 100755 index 58a3fdc9..00000000 --- a/egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# Copyright -# 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -ngpu=4 -config_file=default_config.sh -resume=false -interactive=false -num_workers=3 - -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -batch_size=$(($ft_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $ft_eff_batch_size $ft_min_chunk $ft_max_chunk | awk '{ print int($2/($1*$4*2/($3+$4))+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${nnet_data}_proc_audio_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi - -if [ "$interactive" == "true" ];then - export cuda_cmd=run.pl -fi - -# Network Training -if [ $stage -le 1 ]; then - mkdir -p $ft_nnet_dir/log - $cuda_cmd --gpu $ngpu $ft_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-finetune-xvec-from-wav.py --feats $feat_config $aug_opt \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --min-chunk-length $ft_min_chunk --max-chunk-length $ft_max_chunk \ - --iters-per-epoch $ft_ipe \ - --batch-size $batch_size \ - --num-workers $num_workers $ft_opt_opt $ft_lrs_opt \ - --grad-acc-steps $grad_acc_steps \ - --epochs $ft_nnet_num_epochs \ - --cos-scale $s --margin $ft_margin --margin-warmup-epochs $ft_margin_warmup \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --in-model-path $nnet \ - --train-mode ft-full \ - --exp-path $ft_nnet_dir $args - -fi - - diff --git a/egs/voxceleb/v1.1/run_002_compute_evad.sh b/egs/voxceleb/v1.1/run_002_compute_evad.sh index 7a2a9be5..4e82a87a 100755 --- a/egs/voxceleb/v1.1/run_002_compute_evad.sh +++ b/egs/voxceleb/v1.1/run_002_compute_evad.sh @@ -41,7 +41,6 @@ if [ $stage -le 1 ]; then fi fi -#Train datasets if [ $stage -le 2 ];then if [ "$do_voxsrc22" == "true" ];then extra_data="voxsrc22_dev" diff --git a/egs/voxceleb/v1.2/run_001_prepare_data.sh b/egs/voxceleb/v1.2/run_001_prepare_data.sh index 831eb1bc..f956bc8c 100755 --- a/egs/voxceleb/v1.2/run_001_prepare_data.sh +++ b/egs/voxceleb/v1.2/run_001_prepare_data.sh @@ -20,17 +20,17 @@ if [ $stage -le 1 ];then prepare_data.py voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ --cat-videos --use-kaldi-ids \ --output-dir data/voxceleb2cat_train - #local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train fi -exit + if [ $stage -le 2 ];then # prepare voxceleb1 for test - # This script is for the old version of the dataset - # local/make_voxceleb1_oeh.pl $voxceleb1_root data - # Use this for the newer version of voxceleb1: - local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data + #hyp_utils/conda_env.sh \ + prepare_data.py voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test + #local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data fi - +exit if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then local/prepare_voxsrc22_dev.py \ --vox1-corpus-dir $voxceleb1_root \ diff --git a/hyp_utils/create_data_link.pl b/hyp_utils/create_data_link.pl new file mode 100755 index 00000000..850f29f0 --- /dev/null +++ b/hyp_utils/create_data_link.pl @@ -0,0 +1,132 @@ +#!/usr/bin/env perl + +# Copyright 2013 Guoguo Chen +# 2014 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0. +# +# This script distributes data onto different file systems by making symbolic +# links. It is supposed to use together with utils/create_split_dir.pl, which +# creates a "storage" directory that links to different file systems. +# +# If a sub-directory egs/storage does not exist, it does nothing. 
If it exists,
+# then it selects pseudo-randomly a number from those available in egs/storage/*
+# creates a link such as
+#
+#   egs/egs.3.4.ark -> storage/4/egs.3.4.ark
+#
+use strict;
+use warnings;
+use File::Basename;
+use File::Spec;
+use Getopt::Long;
+
+sub GetGCD {
+  my ($a, $b) = @_;
+  while ($a != $b) {
+    if ($a > $b) {
+      $a = $a - $b;
+    } else {
+      $b = $b - $a;
+    }
+  }
+  return $a;
+}
+
+my $Usage = <<EOU;
+create_data_link.pl:
+This script distributes data onto different file systems by making symbolic
+links. It is supposed to be used together with utils/create_split_dir.pl,
+which creates a "storage" directory that links to different file systems.
+
+If a sub-directory egs/storage does not exist, it does nothing. If it exists,
+then it selects pseudo-randomly a number from those available in egs/storage/*
+and creates a link such as
+
+  egs/egs.3.4.ark -> storage/4/egs.3.4.ark
+
+Usage: utils/create_data_link.pl <data-file-1> [<data-file-2> ... <data-file-N>]
+ e.g.: utils/create_data_link.pl foo/bar/egs.3.4.ark foo/bar/egs.3.5.ark
+ (note: the dirname, e.g. foo/bar/, must be the same in all cases).
+
+See also utils/remove_data_links.sh
+EOU
+
+GetOptions();
+
+if (@ARGV == 0) {
+  die $Usage;
+}
+
+my $example_fullpath = $ARGV[0];
+
+# Check if the storage has been created. If so, do nothing.
+my $dirname = dirname($example_fullpath);
+if (! -d "$dirname/storage") {
+  exit(0);
+}
+
+# Storage exists, create symbolic links in the next few steps.
+
+# First, get a list of the available storage directories, and check if they are
+# properly created.
+opendir(my $dh, "$dirname/storage/") || die "$0: Fail to open $dirname/storage/\n";
+my @storage_dirs = grep(/^[0-9]*$/, readdir($dh));
+closedir($dh);
+my $num_storage = scalar(@storage_dirs);
+for (my $x = 1; $x <= $num_storage; $x++) {
+  (-d "$dirname/storage/$x") || die "$0: $dirname/storage/$x does not exist\n";
+}
+
+# Second, get the coprime list.
+my @coprimes;
+for (my $n = 1; $n <= $num_storage; $n++) {
+  if (GetGCD($n, $num_storage) == 1) {
+    push(@coprimes, $n);
+  }
+}
+
+my $ret = 0;
+
+foreach my $fullpath (@ARGV) {
+  if ($dirname ne dirname($fullpath)) {
+    die "Mismatch in directory names of arguments: $example_fullpath versus $fullpath";
+  }
+
+  # Finally, work out the directory index where we should put the data to.
+  my $basename = basename($fullpath);
+  my $filename_numbers = $basename;
+  $filename_numbers =~ s/[^0-9]+/ /g;
+  my @filename_numbers = split(" ", $filename_numbers);
+  my $total = 0;
+  my $index = 0;
+  foreach my $x (@filename_numbers) {
+    if ($index >= scalar(@coprimes)) {
+      $index = 0;
+    }
+    $total += $x * $coprimes[$index];
+    $index++;
+  }
+  my $dir_index = $total % $num_storage + 1;
+
+  # Make the symbolic link.
+  if (-e $fullpath) {
+    unlink($fullpath);
+  }
+  if (symlink("storage/$dir_index/$basename", $fullpath) != 1) { # failure
+    $ret = 1; # will exit with error status.
+  }
+}
+
+exit($ret);
+
+## testing:
+# rm -rf foo bar
+# mkdir -p bar/{1,2,3,4}
+# mkdir -p foo/storage
+# for x in 1 2 3 4; do ln -s ../../bar/$x foo/storage/$x; done
+# utils/create_data_link.pl utils/create_data_link.pl foo/1.3.ark foo/2.3.ark
+# ls -l foo
+# total 0
+# lrwxrwxrwx 1 dpovey fax 17 Sep 2 17:41 1.3.ark -> storage/3/1.3.ark
+# lrwxrwxrwx 1 dpovey fax 17 Sep 2 17:41 2.3.ark -> storage/4/2.3.ark
+# drwxr-xr-x 2 dpovey fax 38 Sep 2 17:40 storage
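Note (commentary, not part of the patch): create_data_link.pl above picks the destination
storage directory deterministically from the digit groups in the archive name, weighting
them by the numbers that are coprime with the number of storage directories. A rough
Python sketch of the same index computation, for readers who prefer it (function name and
inputs are illustrative only):

    import re
    from math import gcd

    def storage_index(basename: str, num_storage: int) -> int:
        # numbers in 1..num_storage that are coprime with num_storage
        coprimes = [n for n in range(1, num_storage + 1) if gcd(n, num_storage) == 1]
        # digit groups appearing in the file name, e.g. "egs.3.4.ark" -> [3, 4]
        digits = [int(d) for d in re.split(r"[^0-9]+", basename) if d]
        # weight each digit group by a coprime, cycling through the coprime list
        total = sum(d * coprimes[i % len(coprimes)] for i, d in enumerate(digits))
        return total % num_storage + 1  # 1-based index, i.e. storage/<index>/

    # e.g. storage_index("egs.3.4.ark", 4) == 4, matching the example link
    # egs/egs.3.4.ark -> storage/4/egs.3.4.ark in the script header.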
diff --git a/hyp_utils/create_data_split_dirs.sh b/hyp_utils/create_data_split_dirs.sh
new file mode 100755
index 00000000..877b9e3f
--- /dev/null
+++ b/hyp_utils/create_data_split_dirs.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# Copyright
+#   2023  Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+# Creates links to distribute data into multiple nodes in clsp grid
+
+storage_name=$(date +'%m_%d_%H_%M')
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 <output-dir> <storage-dir> <nodes>"
+  echo "$0 exp/vad_dir $USER/hyp-data/voxceleb/v1/vad/storage b0"
+  exit 1
+fi
+output_dir=$1
+storage_dir=$2
+nodes=$3
+
+link_dir=$output_dir/storage
+
+if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $link_dir ]; then
+  echo "Prepare to distribute data over multiple nodes ($nodes)"
+  dir_name=$storage_dir/$storage_name/storage
+  if [ "$nodes" == "b0" ];then
+    hyp_utils/create_split_dir.pl \
+      /export/b{04,05,06,07}/$dir_name $link_dir
+  elif [ "$nodes" == "b1" ];then
+    hyp_utils/create_split_dir.pl \
+      /export/b{14,15,16,17}/$dir_name $link_dir
+  elif [ "$nodes" == "c0" ];then
+    hyp_utils/create_split_dir.pl \
+      /export/c{06,07,08,09}/$dir_name $link_dir
+  elif [ "$nodes" == "fs01" ];then
+    hyp_utils/create_split_dir.pl \
+      /export/fs01/$dir_name $link_dir
+  else
+    echo "we don't distribute data between multiple machines"
+  fi
+fi
+
+
diff --git a/hyp_utils/create_data_split_links.sh b/hyp_utils/create_data_split_links.sh
new file mode 100755
index 00000000..fb5b8ca0
--- /dev/null
+++ b/hyp_utils/create_data_split_links.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright
+#   2023  Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+# Creates links to distribute data into multiple nodes in clsp grid
+
+storage_name=$(date +'%m_%d_%H_%M')
+
+echo "$0 $@"  # Print the command line for logging
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <output-file-pattern> <num-jobs>"
+  echo "$0 exp/vad_dir/vad.JOB.ark 40"
+  exit 1
+fi
+output_file_pattern=$1
+nj=$2
+
+for n in $(seq $nj); do
+    # the next command does nothing unless output_dir/storage exists, see
+    # utils/create_data_link.pl for more info.
+    output_file=$(echo $output_file_pattern | sed 's@\.JOB\.[^\.]*$@.'$n'.@')
+    hyp_utils/create_data_link.pl $output_file
+done
+
diff --git a/hyp_utils/create_split_dir.pl b/hyp_utils/create_split_dir.pl
new file mode 100755
index 00000000..ab952357
--- /dev/null
+++ b/hyp_utils/create_split_dir.pl
@@ -0,0 +1,92 @@
+#!/usr/bin/env perl
+
+# Copyright 2013  Guoguo Chen
+# Apache 2.0.
+#
+# This script creates storage directories on different file systems, and creates
+# symbolic links to those directories. For example, a command
+#
+# utils/create_split_dir.pl /export/gpu-0{3,4,5}/egs/storage egs/storage
+#
+# will mkdir -p all of those directories, and will create links
+#
+# egs/storage/1 -> /export/gpu-03/egs/storage
+# egs/storage/2 -> /export/gpu-04/egs/storage
+# ...
+#
+use strict;
+use warnings;
+use File::Spec;
+use Getopt::Long;
+
+my $Usage = <<EOU;
+Usage: utils/create_split_dir.pl <actual-storage-dir-1> ... <actual-storage-dir-N> <pseudo-storage-dir>
+ e.g.: utils/create_split_dir.pl /export/gpu-0{3,4,5}/egs/storage egs/storage
+
+Allowed options:
+  --suffix <suffix>   : Common suffix to <actual-storage-dirs> (string, default = "")
+
+See also create_data_link.pl, which is intended to work with the resulting
+directory structure, and remove_data_links.sh
+EOU
+
+my $suffix="";
+GetOptions('suffix=s' => \$suffix);
+
+if (@ARGV < 2) {
+  die $Usage;
+}
+
+my $ans = 1;
+
+my $dir = pop(@ARGV);
+system("mkdir -p $dir 2>/dev/null");
+
+my @all_actual_storage = ();
+foreach my $file (@ARGV) {
+  push @all_actual_storage, File::Spec->rel2abs($file . "/" . $suffix);
+}
+
+my $index = 1;
+foreach my $actual_storage (@all_actual_storage) {
+  my $pseudo_storage = "$dir/$index";
+
+  # If the symbolic link already exists, delete it.
+  if (-l $pseudo_storage) {
+    print STDERR "$0: link $pseudo_storage already exists, not overwriting.\n";
+    $index++;
+    next;
+  }
+
+  # Create the destination directory and make the link.
+  system("mkdir -p $actual_storage 2>/dev/null");
+  if ($? 
!= 0) { + print STDERR "$0: error creating directory $actual_storage\n"; + exit(1); + } + { # create a README file for easier deletion. + open(R, ">$actual_storage/README.txt"); + my $storage_dir = File::Spec->rel2abs($dir); + print R "# This directory is linked from $storage_dir, as part of Kaldi striped data\n"; + print R "# The full list of directories where this data resides is:\n"; + foreach my $d (@all_actual_storage) { + print R "$d\n"; + } + close(R); + } + my $ret = symlink($actual_storage, $pseudo_storage); + + # Process the returned values + $ans = $ans && $ret; + if (! $ret) { + print STDERR "Error linking $actual_storage to $pseudo_storage\n"; + } + + $index++; +} + +exit($ans == 1 ? 0 : 1); diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py index 10ea491c..50fd5088 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py @@ -49,11 +49,11 @@ def __init__( self.sigma = sigma def forward(self, s_t): - # print('sigma0=', self.sigma) + if self.sigma > 0: s_t = s_t + self.sigma * torch.randn_like(s_t) - # print('sigma1=', self.sigma) - f_t = self.feat_extractor(s_t) + + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] @@ -320,7 +320,7 @@ def eval_cosine_scoring( ) s.save_txt(score_file) - logging.info("saving stats to %s" % (stats_file)) + logging.info("saving stats to %s", stats_file) attack_stats.to_csv(stats_file) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py index a6f535b3..5697404d 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py @@ -84,7 +84,7 @@ def forward(self, s_t): s_t = self.wav_scale * s_t # End of pre-processing defense - f_t = self.feat_extractor(s_t) + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] @@ -289,13 +289,11 @@ def eval_cosine_scoring_wavegan( vad = torch.tensor(vad, dtype=torch.bool).to(device) model.vad_t = vad logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key.seg_set[j], - speech_frames, - tot_frames, - speech_frames / tot_frames * 100, - ) + "utt %s detected %d/%d (%.2f %%) speech frames", + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, ) t2 = time.time() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py index 5ba42477..0ca1f740 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py @@ -113,7 +113,7 @@ def forward(self, s_t): s_t = s_t[0, 0] f_t = s_t - f_t = self.feat_extractor(s_t) + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py index c3732bd3..49a762af 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py @@ -125,7 +125,7 @@ def eval_cosine_scoring( audio_reader = AR(test_wav_file, **audio_args) if vad_spec is not None: - logging.info("opening VAD 
stream: %s" % (vad_spec)) + logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((ndx.num_models, ndx.num_tests), dtype="float32") @@ -144,7 +144,7 @@ def eval_cosine_scoring( t2 = time.time() s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) - x_t = feat_extractor(s) + x_t, _ = feat_extractor(s) t4 = time.time() tot_frames = x_t.shape[1] if vad_spec is not None: diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py index c00cf286..b2c111ca 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py @@ -45,7 +45,7 @@ def __init__( def forward(self, s_t): f_t = s_t - f_t = self.feat_extractor(s_t) + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py index 4f2b82ab..8b6c8dae 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py @@ -59,8 +59,7 @@ def __init__( self.threshold = threshold def forward(self, s_t): - f_t = s_t - f_t = self.feat_extractor(s_t) + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py index 2f5cf3da..98ba76b5 100755 --- a/hyperion/bin/eval_xvec_logits_from_wav.py +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -137,7 +137,7 @@ def eval_xvec( with AR(input_spec, **ar_args) as reader: if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) + logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): @@ -160,7 +160,7 @@ def eval_xvec( x[None, :], dtype=torch.get_default_dtype() ).to(device) - x = feat_extractor(x) + x, _ = feat_extractor(x) t5 = time.time() tot_frames = x.shape[1] if vad_spec is not None: @@ -169,13 +169,11 @@ def eval_xvec( x = x[:, vad] logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key, - x.shape[1], - tot_frames, - x.shape[1] / tot_frames * 100, - ) + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, ) if random_utt_length: @@ -200,27 +198,23 @@ def eval_xvec( read_time = t2 - t1 tot_time = read_time + t8 - t3 logging.info( - ( - "utt %s total-time=%.3f read-time=%.3f " - "aug-time=%.3f feat-time=%.3f " - "vad-time=%.3f embed-time=%.3f write-time=%.3f " - "rt-factor=%.2f" - ) - % ( - key, - tot_time, - read_time, - t4 - t3, - t5 - t4, - t6 - t5, - t7 - t6, - t8 - t7, - x0.shape[0] / fs[0] / tot_time, - ) + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f", + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, ) if write_num_frames_spec is not None: - logging.info("writing num-frames to %s" % (write_num_frames_spec)) + logging.info("writing num-frames to %s", write_num_frames_spec) u2nf = Utt2Info.create(keys, info) 
u2nf.save(write_num_frames_spec) diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index 1da1ac05..f49a5fb0 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -163,7 +163,7 @@ def extract_xvectors( x[None, :], dtype=torch.get_default_dtype() ).to(device) - x = feat_extractor(x) + x, _ = feat_extractor(x) t5 = time.time() tot_frames = x.shape[1] if vad_spec is not None: diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index a31bd614..9dc0aa2c 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -155,7 +155,7 @@ def extract_xvectors( x[None, :], dtype=torch.get_default_dtype() ).to(device) - x = feat_extractor(x) + x, _ = feat_extractor(x) t5 = time.time() tot_frames = x.shape[1] if vad_spec is not None: @@ -164,13 +164,11 @@ def extract_xvectors( x = x[:, vad] logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key, - x.shape[1], - tot_frames, - x.shape[1] / tot_frames * 100, - ) + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, ) t6 = time.time() diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py index 227892ea..7d602709 100755 --- a/hyperion/bin/finetune_xvector_from_wav.py +++ b/hyperion/bin/finetune_xvector_from_wav.py @@ -10,8 +10,12 @@ import time from pathlib import Path -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, set_float_cpu @@ -239,72 +243,3 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) - - -# if __name__ == "__main__": - -# parser = ArgumentParser(description="Fine-tune x-vector model from audio files") -# parser.add_argument("--cfg", action=ActionConfigFile) - -# train_parser = ArgumentParser(prog="") -# AD.add_class_args(train_parser, prefix="dataset", skip={}) -# Sampler.add_class_args(train_parser, prefix="sampler") -# train_parser.add_argument( -# "--data_loader.num-workers", -# type=int, -# default=5, -# help="num_workers of data loader", -# ) - -# val_parser = ArgumentParser(prog="") -# AD.add_class_args(val_parser, prefix="dataset", skip={}) -# Sampler.add_class_args(val_parser, prefix="sampler") -# val_parser.add_argument( -# "--data_loader.num-workers", -# type=int, -# default=5, -# help="num_workers of data loader", -# ) -# data_parser = ArgumentParser(prog="") -# data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) -# data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) -# parser.add_argument("--data", action=ActionParser(parser=data_parser)) -# parser.link_arguments( -# "data.train.dataset.class_file", "data.val.dataset.class_file" -# ) -# parser.link_arguments( -# "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" -# ) -# parser.link_arguments( -# "data.train.sampler.batch_size", "data.val.sampler.batch_size" -# ) - -# AF.add_class_args(parser, prefix="feats") -# parser.add_argument("--in-model-path", required=True) - -# XVec.add_finetune_args(parser, prefix="model") -# Trainer.add_class_args( -# parser, 
prefix="trainer", train_modes=XVec.valid_train_modes() -# ) -# ddp.add_ddp_args(parser) - -# parser.add_argument("--seed", type=int, default=1123581321, help="random seed") -# parser.add_argument( -# "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int -# ) -# parser.add_argument("--local_rank", default=0, type=int) - -# args = parser.parse_args() -# gpu_id = args.local_rank -# del args.local_rank - -# if gpu_id == 0: -# try: -# config_file = Path(args.exp_path) / "config.yaml" -# parser.save(args, str(config_file), format="yaml", overwrite=True) -# except: -# pass - -# # torch docs recommend using forkserver -# multiprocessing.set_start_method("forkserver") -# train_xvec(gpu_id, args) diff --git a/hyperion/data_prep/__init__.py b/hyperion/data_prep/__init__.py index 7caae8c4..9ae59246 100644 --- a/hyperion/data_prep/__init__.py +++ b/hyperion/data_prep/__init__.py @@ -3,6 +3,6 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -# from .data_prep import data_prep_registry from .data_prep import DataPrep from .voxceleb2 import VoxCeleb2DataPrep +from .voxceleb1 import VoxCeleb1DataPrep diff --git a/hyperion/data_prep/voxceleb1.py b/hyperion/data_prep/voxceleb1.py new file mode 100644 index 00000000..00b2e380 --- /dev/null +++ b/hyperion/data_prep/voxceleb1.py @@ -0,0 +1,338 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import re +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +import glob + +import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import ClassInfo, Dataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress +from .data_prep import DataPrep + + +class VoxCeleb1DataPrep(DataPrep): + """Class for preparing VoxCeleb1 database into tables, + It prepares the full voxceleb either to train or test with + Original/Entire/Hard. + We don't consider preparing dev for train and test for test Original + + Attributes: + corpus_dir: input data directory + task: train/test + cat_videos: concatenate utterances from the same video. + output_dir: output data directory + use_kaldi_ids: puts speaker-id in front of segment id like kaldi + target_sample_freq: target sampling frequency to convert the audios to. 
+ """ + + def __init__( + self, + corpus_dir: PathLike, + task: str, + cat_videos: bool, + output_dir: PathLike, + use_kaldi_ids: bool, + target_sample_freq: int, + num_threads: int = 10, + ): + use_kaldi_ids = True + super().__init__( + corpus_dir, output_dir, use_kaldi_ids, target_sample_freq, num_threads + ) + + self.task = task + assert ( + cat_videos == False or task == "train" + ), "cat-videos is only available for train task" + + self.cat_videos = cat_videos + + @staticmethod + def dataset_name(): + return "voxceleb1" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + parser.add_argument( + "--task", + default="test", + choices=["test", "train"], + help="""if we prepare the data for [test, train]""", + ) + parser.add_argument( + "--cat-videos", + default=False, + action=ActionYesNo, + help="""concatenate utterances from the same video.""", + ) + + def _get_metadata(self): + file_name = "vox1_meta.csv" + file_path = self.corpus_dir / file_name + if not file_path.exists(): + file_path = self.output_dir / file_name + if not file_path.exists(): + url = "https://www.openslr.org/resources/49/vox1_meta.csv" + file_path, _ = urlretrieve_progress(url, file_path, desc=file_name) + + df_meta = pd.read_csv(file_path, sep="\t") + df_meta.rename(columns=str.strip, inplace=True) + df_meta = df_meta.applymap(lambda x: str.strip(x) if isinstance(x, str) else x) + df_meta.set_index("VoxCeleb1 ID", inplace=True) + return df_meta + + def _get_langs_est(self): + file_name = "lang_vox2_final.csv" + file_path = self.corpus_dir / file_name + if not file_path.exists(): + file_path = self.output_dir / file_name + if not file_path.exists(): + url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv" + file_path, _ = urlretrieve_progress(url, file_path, desc=file_name) + + df_lang = pd.read_csv(file_path, sep=",") + + if self.cat_videos: + + def get_video(x): + x = re.sub("/[^/]*.wav$", "", x) + return re.sub("/", "-", x) + + elif self.use_kaldi_ids: + + def get_video(x): + x = re.sub(".wav$", "", x) + return re.sub("/", "-", x) + + else: + + def get_video(x): + x = re.sub(".wav$", "", x) + x = re.sub("^[^/]*/", "", x) + return re.sub("/", "-", x) + + df_lang["id"] = df_lang["filename"].apply(get_video) + df_lang.drop(["filename"], axis=1, inplace=True) + df_lang.drop_duplicates(inplace=True) + df_lang.set_index("id", inplace=True) + df_lang["lang"] = df_lang["lang"].apply(str.lower) + return df_lang + + @staticmethod + def make_cat_list(lists_cat_dir, rec_id, rec_files, video_idx, i): + list_file = lists_cat_dir / f"{rec_id}.txt" + with open(list_file, "w") as fw: + rec_idx = (video_idx == i).nonzero()[0] + recs_i = [f"file {rec_files[j]}" for j in rec_idx] + recs_i.sort() + recs_i = "\n".join(recs_i) + fw.write(f"{recs_i}\n") + + file_path = ( + f"ffmpeg -v 8 -f concat -safe 0 -i {list_file} -f wav -acodec pcm_s16le -|" + ) + return file_path + + def make_trials(self): + url_base = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta" + trials_file_names = [ + "veri_test2.txt", + "list_test_hard2.txt", + "list_test_all2.txt", + ] + trials_names = ["trials_o", "trials_h", "trials_e"] + + trials = {} + dfs = [] + logging.info("making trials") + for trial_name, file_name in zip(trials_names, trials_file_names): + file_path = self.corpus_dir / file_name + if not file_path.exists(): + file_path = self.output_dir / file_name + if not file_path.exists(): + url = f"{url_base}/{file_name}" + file_path, _ = urlretrieve_progress(url, file_path, 
desc=file_name) + + df_in = pd.read_csv( + file_path, + header=None, + sep=" ", + names=["key", "enroll_file", "test_file"], + ) + key = ["target" if k == 1 else "nontarget" for k in df_in["key"]] + + def get_modelid(s): + s = re.sub(r"\.wav", "", s) + return re.sub(r"/", "-", s) + + if self.use_kaldi_ids: + get_segmentid = get_modelid + else: + + def get_segmentid(s): + s = get_modelid(s) + return re.sub(r"[^-]*-", "", s) + + modelid = [get_modelid(f) for f in df_in["enroll_file"]] + segmentid = [get_segmentid(f) for f in df_in["test_file"]] + df_out = pd.DataFrame( + {"modelid": modelid, "segmentid": segmentid, "targettype": key} + ) + df_out.sort_values(by=["modelid", "segmentid"], inplace=True) + file_path = self.output_dir / f"{trial_name}.csv" + df_out.to_csv(file_path, index=False) + dfs.append(df_out) + trials[trial_name] = file_path + + df_out = pd.concat(dfs, ignore_index=True) + df_out.sort_values(by=["modelid", "segmentid"], inplace=True) + file_path = self.output_dir / "trials.csv" + df_out.to_csv(file_path, index=False) + trials["trials"] = file_path + + logging.info("making enrollment map") + modelid = df_out["modelid"].sort_values().unique() + if self.use_kaldi_ids: + segmentid = modelid + else: + segmentid = [re.sub(r"[^-]*-", "", s) for s in modelid] + + df_out = pd.DataFrame({"modelid": modelid, "segmentid": segmentid}) + file_path = self.output_dir / "enrollment.csv" + df_out.to_csv(file_path, index=False) + enrollments = {"enrollment": file_path} + + return enrollments, trials + + def prepare(self): + + logging.info("getting audio meta-data") + df_meta = self._get_metadata() + logging.info("getting language estimations") + df_lang = self._get_langs_est() + rec_dir = self.corpus_dir + logging.info("searching audio files in %s", str(rec_dir)) + rec_files = list(rec_dir.glob("**/*.wav")) + if not rec_files: + # symlinks? 
try glob
+            rec_files = [
+                Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True)
+            ]
+
+        speakers = [f.parents[1].name for f in rec_files]
+        video_ids = [f.parent.name for f in rec_files]
+        if self.cat_videos:
+            lists_cat_dir = self.output_dir / "lists_cat"
+            lists_cat_dir.mkdir(exist_ok=True, parents=True)
+            uniq_video_ids, uniq_video_idx, video_idx = np.unique(
+                video_ids, return_index=True, return_inverse=True
+            )
+            rec_ids = uniq_video_ids
+            speakers = [speakers[i] for i in uniq_video_idx]
+            rec_ids = [f"{s}-{v}" for s, v in zip(speakers, uniq_video_ids)]
+
+            file_paths = []
+            futures = []
+            logging.info("making video cat lists")
+            logging.info("submitting threads...")
+            with ThreadPoolExecutor(max_workers=self.num_threads) as pool:
+                for i, rec_id in tqdm(enumerate(rec_ids)):
+                    future = pool.submit(
+                        VoxCeleb1DataPrep.make_cat_list,
+                        lists_cat_dir,
+                        rec_id,
+                        rec_files,
+                        video_idx,
+                        i,
+                    )
+                    futures.append(future)
+
+            logging.info("waiting for threads...")
+            file_paths = [f.result() for f in tqdm(futures)]
+            video_ids = uniq_video_ids
+
+        else:
+            file_names = [f.with_suffix("").name for f in rec_files]
+            if self.use_kaldi_ids:
+                rec_ids = [
+                    f"{s}-{v}-{f}" for s, v, f in zip(speakers, video_ids, file_names)
+                ]
+            else:
+                rec_ids = [f"{v}-{f}" for v, f in zip(video_ids, file_names)]
+
+            file_paths = [str(r) for r in rec_files]
+
+        logging.info("making RecordingSet")
+        recs = pd.DataFrame({"id": rec_ids, "storage_path": file_paths})
+        recs = RecordingSet(recs)
+        recs.sort()
+
+        logging.info("getting recording durations")
+        self.get_recording_duration(recs)
+        if self.target_sample_freq:
+            recs["target_sample_freq"] = self.target_sample_freq
+
+        logging.info("making SegmentSet")
+        segments = pd.DataFrame(
+            {
+                "id": rec_ids,
+                "video_ids": video_ids,
+                "speaker": speakers,
+                "gender": df_meta.loc[speakers, "Gender"],
+                "nationality": df_meta.loc[speakers, "Nationality"],
+                "language_est": [
+                    df_lang.loc[r, "lang"] if r in df_lang.index else "N/A"
+                    for r in rec_ids
+                ],
+                "language_est_conf": [
+                    df_lang.loc[r, "confidence"] if r in df_lang.index else "N/A"
+                    for r in rec_ids
+                ],
+                "duration": recs.loc[rec_ids, "duration"].values,
+            }
+        )
+        segments = SegmentSet(segments)
+        segments.sort()
+
+        logging.info("making speaker info file")
+        uniq_speakers = np.unique(speakers)
+        speakers = pd.DataFrame(
+            {
+                "id": uniq_speakers,
+                "vgg_id": df_meta.loc[uniq_speakers, "VGGFace1 ID"],
+                "gender": df_meta.loc[uniq_speakers, "Gender"],
+                "nationality": df_meta.loc[uniq_speakers, "Nationality"],
+            }
+        )
+        speakers = ClassInfo(speakers)
+
+        logging.info("making language info file")
+        languages = np.unique(df_lang["lang"])
+        languages = ClassInfo(pd.DataFrame({"id": languages}))
+
+        if self.task == "test":
+            enrollments, trials = self.make_trials()
+
+        logging.info("making dataset")
+        dataset = Dataset(
+            segments,
+            classes={"speaker": speakers, "languages": languages},
+            recordings={"recordings": recs},
+            enrollments=enrollments,
+            trials=trials,
+            sparse_trials=False,
+        )
+        logging.info("saving dataset at %s", self.output_dir)
+        dataset.save(self.output_dir)
+        logging.info(
+            "dataset contains %d segments, %d speakers", len(segments), len(speakers)
+        )
diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py
index a1a9f0c3..1a32420f 100644
--- a/hyperion/data_prep/voxceleb2.py
+++ b/hyperion/data_prep/voxceleb2.py
@@ -3,6 +3,7 @@
   Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 import logging
+import glob
 import re
 from concurrent.futures 
import ThreadPoolExecutor from pathlib import Path @@ -39,8 +40,7 @@ def __init__( target_sample_freq: int, num_threads: int = 10, ): - if cat_videos: - use_kaldi_ids = True + use_kaldi_ids = True super().__init__( corpus_dir, output_dir, use_kaldi_ids, target_sample_freq, num_threads ) @@ -143,6 +143,12 @@ def prepare(self): rec_dir = self.corpus_dir / self.subset logging.info("searching audio files in %s", str(rec_dir)) rec_files = list(rec_dir.glob("**/*.m4a")) + if not rec_files: + # symlinks? try glob + rec_files = [ + Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) + ] + speakers = [f.parents[1].name for f in rec_files] video_ids = [f.parent.name for f in rec_files] if self.cat_videos: @@ -176,7 +182,7 @@ def prepare(self): video_ids = uniq_video_ids else: - file_names = [f.name for f in rec_files] + file_names = [f.with_suffix("").name for f in rec_files] if self.use_kaldi_ids: rec_ids = [ f"{s}-{v}-{f}" for s, v, f in zip(speakers, video_ids, file_names) diff --git a/hyperion/np/classifiers/__init__.py b/hyperion/np/classifiers/__init__.py index d9d02ed0..60582016 100644 --- a/hyperion/np/classifiers/__init__.py +++ b/hyperion/np/classifiers/__init__.py @@ -10,4 +10,4 @@ from .linear_svmc import LinearSVMC from .logistic_regression import LogisticRegression from .q_scoring_homo_gbe import QScoringHomoGBE -from .svmc import GaussianSVMC +from .svmc import SVMC diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index 5e38494f..8fe67792 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -42,8 +42,9 @@ def _standardize_weights(self, x, x_lengths=None, weights=None): multiplied by the input data. """ if weights is None: + time_dim = self.dim if self.dim >= 0 else x.dim() + self.dim return seq_lengths_to_mask( - x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=self.dim + x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=time_dim ) if weights.dim() == x.dim(): @@ -599,7 +600,7 @@ def _standardize_weights(self, x, x_lengths=None, weights=None): """standardizes the weights to have shape (batch, max_length).""" if weights is None: return seq_lengths_to_mask( - x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=1 + x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=2 ) if weights.dim() == x.dim(): @@ -797,7 +798,7 @@ def forward(self, x, x_lengths=None, weights=None): if attn.dtype == torch.half: min_value = -65504 else: - min_value = -1e200 + min_value = -1e20 mask = weights.eq(0) attn = attn.masked_fill(mask, min_value) diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index 8556104a..d67785d2 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -355,7 +355,7 @@ def forward_hid_feats( max_in_length = x.size(-1) x = self._pre_enc(x) h_enc, x = self.encoder_net.forward_hid_feats( - x, return_enc_layers, return_logits=True + x, return_enc_layers, return_output=True ) output = {"h_enc": h_enc} if not return_logits and return_classif_layers is None: @@ -363,7 +363,7 @@ def forward_hid_feats( x, x_lengths = self._post_enc(x, x_lengths, max_in_length) p = self.pool_net(x, x_lengths=x_lengths) - h_classif, y_pred = self.classif_net.forward_hid_feats( + h_classif = self.classif_net.forward_hid_feats( p, y, return_classif_layers, return_logits=return_logits ) if return_logits: @@ -750,7 +750,7 @@ def add_class_args(parser, prefix=None, skip=set()): ) try: - parser.add_argument("--hid-act", 
default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/audio_feats_mvn.py b/hyperion/torch/narchs/audio_feats_mvn.py index 160ee61b..a9ad224e 100644 --- a/hyperion/torch/narchs/audio_feats_mvn.py +++ b/hyperion/torch/narchs/audio_feats_mvn.py @@ -32,7 +32,12 @@ def __init__( if mvn is not None: mvn = MVN.filter_args(**mvn) self.mvn_cfg = mvn - if mvn["norm_mean"] or mvn["norm_var"]: + if ( + ("norm_mean" in mvn) + and mvn["norm_mean"] + or ("norm_var" in mvn) + and mvn["norm_var"] + ): self.mvn = MVN(**mvn) self.spec_augment = None @@ -79,7 +84,7 @@ def forward(self, x, x_lengths=None): if self.trans: f = f.transpose(1, 2).contiguous() - return f + return f, f_lengths def get_config(self): config = { diff --git a/hyperion/torch/narchs/classif_head.py b/hyperion/torch/narchs/classif_head.py index 9f9b280b..e5d90f4f 100644 --- a/hyperion/torch/narchs/classif_head.py +++ b/hyperion/torch/narchs/classif_head.py @@ -402,7 +402,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/dc1d_decoder.py b/hyperion/torch/narchs/dc1d_decoder.py index f5ab74d5..172a3d70 100644 --- a/hyperion/torch/narchs/dc1d_decoder.py +++ b/hyperion/torch/narchs/dc1d_decoder.py @@ -31,7 +31,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, use_norm=True, @@ -389,7 +389,7 @@ def add_class_args(parser, prefix=None, head_channels=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/dc1d_encoder.py b/hyperion/torch/narchs/dc1d_encoder.py index 0c331a5e..6cf7f4ca 100644 --- a/hyperion/torch/narchs/dc1d_encoder.py +++ b/hyperion/torch/narchs/dc1d_encoder.py @@ -28,7 +28,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, use_norm=True, @@ -362,7 +362,7 @@ def add_class_args(parser, prefix=None, head_channels=False, in_feats=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/dc2d_decoder.py b/hyperion/torch/narchs/dc2d_decoder.py index 4106cbfd..68679e0b 100644 --- a/hyperion/torch/narchs/dc2d_decoder.py +++ b/hyperion/torch/narchs/dc2d_decoder.py @@ -31,7 +31,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, use_norm=True, @@ -410,7 +410,7 @@ def add_class_args(parser, prefix=None, head_channels=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/dc2d_encoder.py b/hyperion/torch/narchs/dc2d_encoder.py index ce7b9677..bc7e4b33 100644 --- a/hyperion/torch/narchs/dc2d_encoder.py +++ b/hyperion/torch/narchs/dc2d_encoder.py @@ -29,7 +29,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, 
use_norm=True, @@ -367,7 +367,7 @@ def add_class_args(parser, prefix=None, head_channels=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/fcnet.py b/hyperion/torch/narchs/fcnet.py index cdbf1940..a47f304e 100644 --- a/hyperion/torch/narchs/fcnet.py +++ b/hyperion/torch/narchs/fcnet.py @@ -125,7 +125,7 @@ def __init__( in_units, hid_units, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, dropout_rate=0, norm_layer=None, diff --git a/hyperion/torch/narchs/resnet.py b/hyperion/torch/narchs/resnet.py index 858cf4ea..5d3b9793 100644 --- a/hyperion/torch/narchs/resnet.py +++ b/hyperion/torch/narchs/resnet.py @@ -10,10 +10,16 @@ import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear -from ..layer_blocks import (Res2NetBasicBlock, Res2NetBNBlock, - ResNetBasicBlock, ResNetBNBlock, - ResNetEndpointBlock, ResNetInputBlock, - SEResNetBasicBlock, SEResNetBNBlock) +from ..layer_blocks import ( + Res2NetBasicBlock, + Res2NetBNBlock, + ResNetBasicBlock, + ResNetBNBlock, + ResNetEndpointBlock, + ResNetInputBlock, + SEResNetBasicBlock, + SEResNetBNBlock, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF from ..utils import scale_seq_lengths, seq_lengths_to_mask @@ -69,7 +75,7 @@ def __init__( conv_channels=64, base_channels=64, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, diff --git a/hyperion/torch/narchs/resnet1d_decoder.py b/hyperion/torch/narchs/resnet1d_decoder.py index 0c577174..9332724f 100644 --- a/hyperion/torch/narchs/resnet1d_decoder.py +++ b/hyperion/torch/narchs/resnet1d_decoder.py @@ -9,9 +9,13 @@ import torch import torch.nn as nn -from ..layer_blocks import (DC1dDecBlock, ResNet1dBasicDecBlock, - ResNet1dBNDecBlock, SEResNet1dBasicDecBlock, - SEResNet1dBNDecBlock) +from ..layer_blocks import ( + DC1dDecBlock, + ResNet1dBasicDecBlock, + ResNet1dBNDecBlock, + SEResNet1dBasicDecBlock, + SEResNet1dBNDecBlock, +) from ..layers import ActivationFactory as AF from ..layers import ICNR1d from ..layers import NormLayer1dFactory as NLF @@ -34,7 +38,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -450,7 +454,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/resnet1d_encoder.py b/hyperion/torch/narchs/resnet1d_encoder.py index 5bdad186..97b244f3 100644 --- a/hyperion/torch/narchs/resnet1d_encoder.py +++ b/hyperion/torch/narchs/resnet1d_encoder.py @@ -12,10 +12,16 @@ import torch import torch.nn as nn -from ..layer_blocks import (DC1dEncBlock, Res2Net1dBasicBlock, - Res2Net1dBNBlock, ResNet1dBasicBlock, - ResNet1dBNBlock, ResNet1dEndpoint, - SEResNet1dBasicBlock, SEResNet1dBNBlock) +from ..layer_blocks import ( + DC1dEncBlock, + Res2Net1dBasicBlock, + Res2Net1dBNBlock, + ResNet1dBasicBlock, + ResNet1dBNBlock, + ResNet1dEndpoint, + SEResNet1dBasicBlock, + SEResNet1dBNBlock, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer1dFactory as NLF from ..utils import seq_lengths_to_mask 
@@ -37,7 +43,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, drop_connect_rate=0, @@ -472,7 +478,7 @@ def forward_hid_feats(self, x, x_lengths=None, layers=None, return_output=False) if self.head_channels > 0: x = self.head_block(x) - return x + return h, x def get_config(self): @@ -675,7 +681,7 @@ def add_class_args(parser, prefix=None, skip=set(["in_feats"])): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/resnet2d_decoder.py b/hyperion/torch/narchs/resnet2d_decoder.py index 426b37f5..0afa1acc 100644 --- a/hyperion/torch/narchs/resnet2d_decoder.py +++ b/hyperion/torch/narchs/resnet2d_decoder.py @@ -10,9 +10,13 @@ import torch import torch.nn as nn -from ..layer_blocks import (DC2dDecBlock, ResNet2dBasicDecBlock, - ResNet2dBNDecBlock, SEResNet2dBasicDecBlock, - SEResNet2dBNDecBlock) +from ..layer_blocks import ( + DC2dDecBlock, + ResNet2dBasicDecBlock, + ResNet2dBNDecBlock, + SEResNet2dBasicDecBlock, + SEResNet2dBNDecBlock, +) from ..layers import ActivationFactory as AF from ..layers import ICNR2d from ..layers import NormLayer2dFactory as NLF @@ -35,7 +39,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -457,7 +461,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/resnet2d_encoder.py b/hyperion/torch/narchs/resnet2d_encoder.py index 84e6599e..a7fd047e 100644 --- a/hyperion/torch/narchs/resnet2d_encoder.py +++ b/hyperion/torch/narchs/resnet2d_encoder.py @@ -11,10 +11,15 @@ import torch import torch.nn as nn -from ..layer_blocks import (DC2dEncBlock, Res2Net2dBasicBlock, - Res2Net2dBNBlock, ResNet2dBasicBlock, - ResNet2dBNBlock, SEResNet2dBasicBlock, - SEResNet2dBNBlock) +from ..layer_blocks import ( + DC2dEncBlock, + Res2Net2dBasicBlock, + Res2Net2dBNBlock, + ResNet2dBasicBlock, + ResNet2dBNBlock, + SEResNet2dBasicBlock, + SEResNet2dBNBlock, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF from ..utils import seq_lengths_to_mask @@ -38,7 +43,7 @@ class ResNet2dEncoder(NetArch): resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -65,7 +70,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -511,7 +516,7 @@ def add_class_args(parser, prefix=None, skip=set()): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/resnet_factory.py b/hyperion/torch/narchs/resnet_factory.py index 2d17a6d7..35ed9af0 100644 --- a/hyperion/torch/narchs/resnet_factory.py +++ b/hyperion/torch/narchs/resnet_factory.py @@ -146,7 +146,7 @@ def create( conv_channels=64, base_channels=64, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, @@ -341,7 +341,7 @@ def add_class_args(parser, prefix=None): ) try: 
- parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/spinenet.py b/hyperion/torch/narchs/spinenet.py index 117c0733..4349dbe1 100644 --- a/hyperion/torch/narchs/spinenet.py +++ b/hyperion/torch/narchs/spinenet.py @@ -11,9 +11,17 @@ import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear -from ..layer_blocks import (BlockSpec, Res2NetBasicBlock, Res2NetBNBlock, - ResNetBasicBlock, ResNetBNBlock, ResNetInputBlock, - SpineConv, SpineEndpoints, SpineResample) +from ..layer_blocks import ( + BlockSpec, + Res2NetBasicBlock, + Res2NetBNBlock, + ResNetBasicBlock, + ResNetBNBlock, + ResNetInputBlock, + SpineConv, + SpineEndpoints, + SpineResample, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF from .net_arch import NetArch @@ -111,7 +119,7 @@ def __init__( do_endpoint_conv=True, concat_ax=3, upsampling_type="nearest", - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, diff --git a/hyperion/torch/narchs/spinenet_factory.py b/hyperion/torch/narchs/spinenet_factory.py index 092cbd0e..871b37e9 100644 --- a/hyperion/torch/narchs/spinenet_factory.py +++ b/hyperion/torch/narchs/spinenet_factory.py @@ -44,7 +44,7 @@ def create( conv_channels=64, base_channels=64, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, @@ -243,7 +243,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/tdnn_factory.py b/hyperion/torch/narchs/tdnn_factory.py index 901cc9d0..77f69b9c 100644 --- a/hyperion/torch/narchs/tdnn_factory.py +++ b/hyperion/torch/narchs/tdnn_factory.py @@ -21,7 +21,7 @@ def create( kernel_size=3, dilation=1, dilation_factor=1, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_units=0, out_act=None, dropout_rate=0, @@ -194,7 +194,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/transformer_encoder_v1.py b/hyperion/torch/narchs/transformer_encoder_v1.py index 4468185e..f8b50491 100644 --- a/hyperion/torch/narchs/transformer_encoder_v1.py +++ b/hyperion/torch/narchs/transformer_encoder_v1.py @@ -64,7 +64,7 @@ def __init__( in_layer_type="conv2d-sub", rel_pos_enc=False, causal_pos_enc=False, - hid_act="relu6", + hid_act="relu", norm_before=True, concat_after=False, padding_idx=-1, @@ -408,7 +408,7 @@ def add_class_args(parser, prefix=None, in_feats=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 9541d7b0..52474baa 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -109,10 +109,10 @@ def train_epoch(self, data_loader): input_data, target = 
tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) with torch.no_grad(): - feats = self.feat_extractor(input_data) + feats, feats_lengths = self.feat_extractor(input_data) with amp.autocast(enabled=self.use_amp): - output = self.model(feats, y=target) + output = self.model(feats, feats_lengths, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: @@ -162,9 +162,9 @@ def validation_epoch(self, data_loader, swa_update_bn=False): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - feats = self.feat_extractor(input_data) + feats, feats_lengths = self.feat_extractor(input_data) with amp.autocast(enabled=self.use_amp): - output = self.model(feats) + output = self.model(feats, feats_lengths) loss = self.loss(output, target) batch_metrics["loss"] = loss.mean().item() diff --git a/hyperion/torch/utils/masking.py b/hyperion/torch/utils/masking.py index fb93b439..934b4b90 100644 --- a/hyperion/torch/utils/masking.py +++ b/hyperion/torch/utils/masking.py @@ -17,9 +17,7 @@ def scale_seq_lengths(lengths, max_out_length, max_in_length=None): if max_in_length == max_out_length: return lengths - return torch.div(lengths * max_out_length, - max_in_length, - rounding_mode="floor") + return torch.div(lengths * max_out_length, max_in_length, rounding_mode="floor") def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1): @@ -29,7 +27,7 @@ def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1): lengths: sequence lengths with shape=(batch,). If None, it returns None max_length: maximum length of the sequence. dtype: dtype for the mask. - time_dim: dimension corresponding to time in the mask. This will + time_dim: dimension > 0 corresponding to time in the mask. This will return a view of the mask which will adapt to the shape of the tensor where we want to apply the mask. This has to be a positive integer. 
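
Editor's note: the trainer changes above now take frame-level feature lengths from the feature extractor, and the masking utilities convert sample-level lengths into frame-level ones. Below is a minimal sketch of that conversion using scale_seq_lengths; the waveform and frame counts are illustrative values, not taken from the patch.

    import torch
    from hyperion.torch.utils import scale_seq_lengths

    # Sample-level lengths of two waveforms in a padded batch (made-up values).
    wav_lengths = torch.tensor([16000, 12000])
    max_in_length = 16000   # samples in the longest waveform of the batch
    max_out_length = 49     # frames the feature extractor produces for that waveform

    # Rescale to frame-level lengths so padded frames can be masked out downstream.
    feat_lengths = scale_seq_lengths(wav_lengths, max_out_length, max_in_length)
    # feat_lengths -> tensor([49, 36])
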
@@ -40,6 +38,7 @@ def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1): if lengths is None: return None + assert time_dim > 0 assert lengths.dim() == 1 if max_length is None: diff --git a/hyperion/utils/__init__.py b/hyperion/utils/__init__.py index db035987..51b476aa 100644 --- a/hyperion/utils/__init__.py +++ b/hyperion/utils/__init__.py @@ -5,6 +5,7 @@ from .class_info import ClassInfo from .dataset import Dataset +from .enrollment_map import EnrollmentMap from .feature_set import FeatureSet from .hyp_dataclass import HypDataClass from .kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix @@ -12,6 +13,7 @@ from .recording_set import RecordingSet from .rttm import RTTM from .scp_list import SCPList + # from .ext_segment_list import ExtSegmentList from .segment_list import SegmentList from .segment_set import SegmentSet diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index efb7c114..e6c9e861 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -4,7 +4,7 @@ """ from pathlib import Path -from typing import Dict, Optional +from typing import Dict, Optional, Union import yaml @@ -13,41 +13,184 @@ from .misc import PathLike from .recording_set import RecordingSet from .segment_set import SegmentSet +from .enrollment_map import EnrollmentMap +from .trial_key import TrialKey +from .trial_ndx import TrialNdx +from .sparse_trial_key import SparseTrialKey class Dataset: """ Class that contains all objects (segments, recordings, features, class_infos) that conform a dataset + + Attributes: + segments: SegmentSet object or path to it. + classes: Dictionary of ClassInfo objects or paths to then + recordings: Dictionary of RecordingSet objects or paths to then + features: Dictionary of FeatureSet objects or paths to then + enrollments: Dictionary of EnrollmentMap objects or paths to then + trials: Dictionary of TrialKey/TrialNdx/SparseTrialKey objects + or paths to then + sparse_trials: load trial keys using the SparseTrialKey class instead + of TrialKey class. 
+ table_sep: Column separator when reading/writting tables + """ def __init__( self, - segments: SegmentSet, - classes: Optional[Dict[str, ClassInfo]] = None, - recordings: Optional[Dict[str, RecordingSet]] = None, - features: Optional[Dict[str, FeatureSet]] = None, + segments: Union[SegmentSet, PathLike], + classes: Optional[Dict[str, Union[ClassInfo, PathLike]]] = None, + recordings: Optional[Dict[str, Union[RecordingSet, PathLike]]] = None, + features: Optional[Dict[str, Union[FeatureSet, PathLike]]] = None, + enrollments: Optional[Dict[str, Union[EnrollmentMap, PathLike]]] = None, + trials: Optional[ + Dict[str, Union[TrialKey, TrialNdx, SparseTrialKey, PathLike]] + ] = None, + sparse_trials: bool = False, + table_sep: Optional[str] = None, ): - self._segments = segments - self._classes = classes - self._recordings = recordings - self._features = features - @property - def segments(self): + if isinstance(segments, SegmentSet): + self._segments = segments + self._segments_path = None + else: + assert isinstance(segments, (str, Path)) + self._segments = None + self._segments_path = Path(segments) + + self._classes, self._classes_paths = self._parse_dict_args(classes, ClassInfo) + + self._recordings, self._recordings_paths = self._parse_dict_args( + recordings, RecordingSet + ) + + self._features, self._features_paths = self._parse_dict_args( + features, FeatureSet + ) + self._enrollments, self._enrollments_paths = self._parse_dict_args( + enrollments, EnrollmentMap, + ) + self._trials, self._trials_paths = self._parse_dict_args( + trials, (TrialKey, TrialNdx, SparseTrialKey), + ) + + self.sparse_trials = sparse_trials + self.table_sep = table_sep + + def _parse_dict_args(self, data, types): + if data is None: + return None, None + + assert isinstance(data, dict) + objects = {k: (v if isinstance(v, types) else None) for k, v in data.items()} + paths = { + k: (v if isinstance(v, (str, Path)) else None) for k, v in data.items() + } + + return objects, paths + + def segments(self, keep_loaded: bool = True): + if self._segments is None: + assert self._segments_path is not None + segments = SegmentSet.load(self.segments_path, sep=self.table_sep) + if keep_loaded: + self._segments = segments + return segments + return self._segments - @property - def recordings(self): - return self._recordings + def recordings_value(self, key: str, keep_loaded: bool = True): + if self._recordings[key] is None: + assert self._recordings_paths[key] is not None + recordings = RecordingSet.load( + self._recordings_paths[key], sep=self.table_sep + ) + if keep_loaded: + self._recordings[key] = recordings + + return self._recordings[key] - @property - def features(self): - return self._features + def features_value(self, key: str, keep_loaded: bool = True): + if self._features[key] is None: + assert self._features_paths[key] is not None + features = FeatureSet.load(self._features_paths[key], sep=self.table_sep) + if keep_loaded: + self._features[key] = features + + return self._features[key] + + def classes_value(self, key: str, keep_loaded: bool = True): + if self._classes[key] is None: + assert self._classes_paths[key] is not None + classes = ClassInfo.load(self._classes_paths[key], self.table_sep) + if keep_loaded: + self._classes[key] = classes + + return self._classes[key] + + def enrollments_value(self, key: str, keep_loaded: bool = True): + if self._enrollments[key] is None: + assert self._enrollments_paths[key] is not None + enrollments = EnrollmentMap.load( + self._enrollments_paths[key], sep=self.table_sep + 
) + if keep_loaded: + self._enrollments[key] = enrollments + + return self._enrollments[key] + + def trials_value(self, key: str, keep_loaded: bool = True): + if self._trials[key] is None: + assert self._trials_paths[key] is not None + try: + if self.sparse_trials: + trials = SparseTrialKey.load(self._trials_paths[key]) + else: + trials = TrialKey.load(self._trials_paths[key]) + except: + trials = TrialNdx.load(self._trials_paths[key]) + + if keep_loaded: + self._trials[key] = trials + + return self._trials[key] + + def recordings(self, keep_loaded: bool = True): + if self._recordings is None: + yield from () + else: + for key in self._recordings.keys(): + yield key, self.recordings_value(key, keep_loaded) + + def features(self, keep_loaded: bool = True): + if self._features is None: + yield from () + else: + for key in self._features.keys(): + yield key, self.features_value(key, keep_loaded) + + def classes(self, keep_loaded: bool = True): + if self._classes is None: + yield from () + else: + for key in self._classes.keys(): + yield key, self.classes_value(key, keep_loaded) + + def enrollments(self, keep_loaded: bool = True): + if self._enrollments is None: + yield from () + else: + for key in self._enrollments.keys(): + yield key, self.enrollments_value(key, keep_loaded) - @property - def classes(self): - return self._classes + def trials(self, keep_loaded: bool = True): + if self._trials is None: + yield from () + else: + for key in self._trials.keys(): + yield key, self.trials_value(key, keep_loaded) @staticmethod def resolve_dataset_path(dataset_path): @@ -69,64 +212,128 @@ def resolve_file_path(dataset_dir, file_path): return dataset_dir / file_path - def save(self, dataset_path: PathLike): + def save( + self, + dataset_path: PathLike, + update_paths: bool = True, + table_sep: Optional[str] = None, + ): """Saves all the dataset objects. Args: - dataset_path: str/Path indicating directory - to save the dataset or .yaml file to save - the dataset info. + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save + the dataset info. 
+ update_paths: whether to update the file_paths in the + data structures in the DateSet object """ + table_sep = self.table_sep if table_sep is None else table_sep + if update_paths: + self.table_sep = table_sep + + table_ext = ".tsv" if table_sep == "\t" else ".csv" dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) dataset = {} - if self.segments is not None: - file_name = "segments.csv" - dataset["segments"] = file_name - file_path = dataset_dir / file_name - self.segments.save(file_path) + file_name = f"segments{table_ext}" + dataset["segments"] = file_name + file_path = dataset_dir / file_name + self.segments().save(file_path, sep=table_sep) + if update_paths: + self._segments_path = file_path - if self.recordings is not None: - file_names = {} - for k, v in self.recordings.items(): - file_name = k + ".csv" - file_names[k] = file_name - file_path = dataset_dir / file_name - v.save(file_path) + file_names = {} + for k, v in self.recordings(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._recordings_paths[k] = file_path + if file_names: dataset["recordings"] = file_names - if self.features is not None: - file_names = {} - for k, v in self.features.items(): - file_name = k + ".csv" - file_names[k] = file_name - file_path = dataset_dir / file_name - v.save(file_path) + file_names = {} + for k, v in self.features(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._features_paths[k] = file_path + if file_names: dataset["features"] = file_names - if self.classes is not None: - file_names = {} - for k, v in self.classes.items(): - file_name = k + ".csv" - file_names[k] = file_name - file_path = dataset_dir / file_name - v.save(file_path) + file_names = {} + for k, v in self.classes(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._classes_paths[k] = file_path + if file_names: dataset["classes"] = file_names + file_names = {} + for k, v in self.enrollments(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._enrollments_paths[k] = file_path + + if file_names: + dataset["enrollments"] = file_names + + file_names = {} + for k, v in self.trials(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path) + if update_paths: + self._trials_paths[k] = file_path + + if file_names: + dataset["trials"] = file_names + with open(dataset_file, "w") as f: yaml.dump(dataset, f) + def update_from_disk(self): + self.segments() + for k, v in self.recordings(): + pass + + for k, v in self.features(): + pass + + for k, v in self.classes(): + pass + + for k, v in self.enrollments(): + pass + + for k, v in self.trials(): + pass + @classmethod - def load(cls, dataset_path: PathLike): + def load( + cls, dataset_path: PathLike, lazy: bool = True, sparse_trials: bool = False + ): """Loads all the dataset objects. Args: dataset_path: str/Path indicating directory to save the dataset or .yaml file to save the dataset info. + lazy: load data structures lazily when they are needed. 
+ sparse_trials: load trial keys using the SparseTrialKey class instead of TrialKey class """ dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) @@ -134,27 +341,79 @@ def load(cls, dataset_path: PathLike): dataset = yaml.safe_load(f) assert "segments" in dataset - segments = SegmentSet.load( - Dataset.resolve_file_path(dataset_dir, dataset["segments"]) - ) + segments = Dataset.resolve_file_path(dataset_dir, dataset["segments"]) classes = None recordings = None features = None + enrollments = None + trials = None if "classes" in dataset: classes = {} for k, v in dataset["classes"]: - classes[k] = ClassInfo.load(Dataset.resolve_file_path(dataset_dir, v)) + classes[k] = Dataset.resolve_file_path(dataset_dir, v) if "recordings" in dataset: recordings = {} for k, v in dataset["recordings"]: - recordings[k] = RecordingSet.load( - Dataset.resolve_file_path(dataset_dir, v) - ) + recordings[k] = Dataset.resolve_file_path(dataset_dir, v) if "features" in dataset: features = {} for k, v in dataset["features"]: - features[k] = FeatureSet.load(Dataset.resolve_file_path(dataset_dir, v)) + features[k] = Dataset.resolve_file_path(dataset_dir, v) + + if "enrollments" in dataset: + enrollments = {} + for k, v in dataset["enrollments"]: + enrollments[k] = Dataset.resolve_file_path(dataset_dir, v) + + if "trials" in dataset: + trials = {} + for k, v in dataset["trials"]: + trials[k] = Dataset.resolve_file_path(dataset_dir, v) + + dataset = cls( + segments, + classes, + recordings, + features, + enrollments, + trials, + sparse_trials=sparse_trials, + ) + if not lazy: + dataset.update_from_disk() + + return dataset + + # dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) + # with open(dataset_file, "w") as f: + # dataset = yaml.safe_load(f) + + # assert "segments" in dataset + # segments = SegmentSet.load( + # Dataset.resolve_file_path(dataset_dir, dataset["segments"]) + # ) + # classes = None + # recordings = None + # features = None + # if "classes" in dataset: + # classes = {} + # for k, v in dataset["classes"]: + # classes[k] = ClassInfo.load(Dataset.resolve_file_path(dataset_dir, v)) + + # if "recordings" in dataset: + # recordings = {} + # for k, v in dataset["recordings"]: + # recordings[k] = RecordingSet.load( + # Dataset.resolve_file_path(dataset_dir, v) + # ) + + # if "features" in dataset: + # features = {} + # for k, v in dataset["features"]: + # features[k] = FeatureSet.load(Dataset.resolve_file_path(dataset_dir, v)) - return cls(segments, classes, recordings, features) + # dataset = cls(segments, classes, recordings, features) + # if not lazy: + # dataset.update_from_disk() diff --git a/hyperion/utils/enrollment_map.py b/hyperion/utils/enrollment_map.py new file mode 100644 index 00000000..024e5b74 --- /dev/null +++ b/hyperion/utils/enrollment_map.py @@ -0,0 +1,86 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import re +from collections import OrderedDict +from copy import deepcopy +from pathlib import Path + +import numpy as np +import pandas as pd + +from .list_utils import split_list, split_list_group_by_key +from .info_table import InfoTable + + +class EnrollmentMap(InfoTable): + """Class to store the mapping between enrollment id + and segmentids + """ + + def __init__(self, df): + if "modelid" in df: + df.rename(columns={"modelid": "id"}, inplace=True) + super().__init__(df) + + def split(self, idx, num_parts): + """Splits the mapping into 
num_parts and return part idx. + + Args: + idx: Part to return from 1 to num_parts. + num_parts: Number of parts to split the list. + group_by: All the lines with the same value in column + groub_by_field go to the same part + + Returns: + Sub InfoTable object + """ + _, idx1 = split_list_group_by_key(self.df["id"], idx, num_parts) + + df = self.df.iloc[idx1] + return EnrollmentMap(df) + + def save(self, file_path, sep=None, nist_compatible=True): + if nist_compatible: + # For compatibility with NIST SRE files the index column "id" + # is saved as modelid + self.df.rename(columns={"id": "modelid"}, inplace=True) + + super().save(file_path, sep) + if nist_compatible: + self.df.rename(columns={"modelid": "id"}, inplace=True) + + @classmethod + def load(cls, file_path, sep=None): + """Loads EnrollmentMap from file. + + Args: + file_path: File to read the list. + sep: Separator between the key and file_path in the text file. + dtype: Dictionary with the dtypes of each column. + name: name for the data to be loaded + Returns: + EnrollmentMap object + """ + file_path = Path(file_path) + ext = file_path.suffix + if ext in ["", ".scp"]: + # if no extension we load as kaldi utt2spk file + df = pd.read_csv( + file_path, + sep=" ", + header=None, + names=["segmentid", "modelid"], + dtype={"segmentid": np.str, "modelid": np.str}, + ) + df = df[["modelid", "segmentid"]] + else: + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + + return cls(df) diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index 5a4f27d2..6bcd4aca 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -119,7 +119,7 @@ def from_dict(cls, df_dict): @classmethod def load(cls, file_path, sep=None, name="class_id"): - """Loads utt2info list from text file. + """Loads table from file. Args: file_path: File to read the list. @@ -127,7 +127,7 @@ def load(cls, file_path, sep=None, name="class_id"): dtype: Dictionary with the dtypes of each column. name: name for the data to be loaded Returns: - Utt2Info object + InfoTable object """ file_path = Path(file_path) ext = file_path.suffix @@ -156,7 +156,7 @@ def sort(self, column="id", ascending=True): self.df.sort_values(by=column, inplace=True, ascending=ascending) def split(self, idx, num_parts, group_by=None): - """Splits SCPList into num_parts and return part idx. + """Splits the table into num_parts and return part idx. Args: idx: Part to return from 1 to num_parts. @@ -177,13 +177,13 @@ def split(self, idx, num_parts, group_by=None): @classmethod def merge(cls, tables): - """Merges several Utt2Info tables. + """Merges several tables. Args: - info_lists: List of Utt2Info + info_lists: List of InfoTables Returns: - Utt2Info object concatenation the info_lists. + InfoTable object concatenation the info_lists. """ df_list = [table.df for table in tables] df = pd.concat(df_list) diff --git a/hyperion/utils/segment_set.py b/hyperion/utils/segment_set.py index d51edc34..1852d25d 100644 --- a/hyperion/utils/segment_set.py +++ b/hyperion/utils/segment_set.py @@ -7,6 +7,10 @@ class SegmentSet(InfoTable): + """Class to store information about a speech segment + Internally, it uses a pandas table. 
+ """ + def __init__(self, df): super().__init__(df) if "start" in df and "recording_id" not in df: diff --git a/hyperion/utils/sparse_trial_key.py b/hyperion/utils/sparse_trial_key.py index 5afc72a0..1bc321a7 100644 --- a/hyperion/utils/sparse_trial_key.py +++ b/hyperion/utils/sparse_trial_key.py @@ -5,8 +5,10 @@ import copy import os.path as path +from pathlib import Path import numpy as np +import pandas as pd import scipy.sparse as sparse from .list_utils import * @@ -79,6 +81,28 @@ def save_txt(self, file_path): for r, c in zip(non.row, non.col): f.write("%s %s nontarget\n" % (self.model_set[r], self.seg_set[c])) + def save_table(self, file_path, sep=None): + """Saves object to txt file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}targettype\n") + self.tar.eliminate_zeros() + self.non.eliminate_zeros() + tar = self.tar.tocoo() + for r, c in zip(tar.row, tar.col): + f.write(f"{self.model_set[r]}{sep}{self.seg_set[c]}{sep}target\n") + non = self.non.tocoo() + for r, c in zip(non.row, non.col): + f.write(f"{self.model_set[r]}{sep}{self.seg_set[c]}{sep}nontarget\n") + @classmethod def load_h5(cls, file_path): raise NotImplementedError() @@ -113,6 +137,40 @@ def load_txt(cls, file_path): non[item[0], item[1]] = True return cls(model_set, seg_set, tar.tocsr(), non.tocsr()) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from txt file + + Args: + file_path: File to read the list. + + Returns: + TrialKey object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + is_tar = (df["targettype"] == "target").values + model_set, _, model_idx = np.unique( + models, return_index=True, return_inverse=True + ) + seg_set, _, seg_idx = np.unique( + segments, return_index=True, return_inverse=True + ) + tar = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool") + non = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool") + for item in zip(model_idx, seg_idx, is_tar): + if item[2]: + tar[item[0], item[1]] = True + else: + non[item[0], item[1]] = True + return cls(model_set, seg_set, tar.tocsr(), non.tocsr()) + @classmethod def merge(cls, key_list): raise NotImplementedError() diff --git a/hyperion/utils/trial_key.py b/hyperion/utils/trial_key.py index 9552d7c0..4a99461b 100644 --- a/hyperion/utils/trial_key.py +++ b/hyperion/utils/trial_key.py @@ -5,9 +5,11 @@ import copy import os.path as path +from pathlib import Path import h5py import numpy as np +import pandas as pd from .list_utils import * from .trial_ndx import TrialNdx @@ -82,18 +84,20 @@ def sort(self): if self.trial_cond is not None: self.trial_cond = self.trial_cond[:, ix] - def save(self, file_path): + def save(self, file_path, sep=None): """Saves object to txt/h5 file. Args: file_path: File to write the list. """ - - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + ext = file_path.suffix + if ext in (".h5", ".hdf5"): self.save_h5(file_path) - else: + elif ext in ("", ".txt"): self.save_txt(file_path) + else: + self.save_table(file_path, sep) def save_h5(self, file_path): """Saves object to h5 file. 
@@ -132,20 +136,40 @@ def save_txt(self, file_path): file_path: File to write the list. """ with open(file_path, "w") as f: - idx = (self.tar.T == True).nonzero() + idx = (self.tar.T).nonzero() for item in zip(idx[0], idx[1]): f.write( "%s %s target\n" % (self.model_set[item[1]], self.seg_set[item[0]]) ) - idx = (self.non.T == True).nonzero() + idx = (self.non.T).nonzero() for item in zip(idx[0], idx[1]): f.write( "%s %s nontarget\n" % (self.model_set[item[1]], self.seg_set[item[0]]) ) + def save_table(self, file_path, sep=None): + """Saves object to txt file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}targettype\n") + I, J = np.logical_or(self.tar, self.non).nonzero() + for i, j in zip(I, J): + target_type = "target" if self.tar[i, j] else "nontarget" + f.write( + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{target_type}\n" + ) + @classmethod - def load(cls, file_path): + def load(cls, file_path, sep=None): """Loads object from txt/h5 file Args: @@ -154,11 +178,13 @@ def load(cls, file_path): Returns: TrialKey object. """ - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + _, file_ext = path.splitext(file_path) + if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) - else: + elif file_ext in ("", ".txt"): return cls.load_txt(file_path) + else: + return cls.load_table(file_path, sep) @classmethod def load_h5(cls, file_path): @@ -240,6 +266,40 @@ def load_txt(cls, file_path): non[item[0], item[1]] = True return cls(model_set, seg_set, tar, non) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from txt file + + Args: + file_path: File to read the list. + + Returns: + TrialKey object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + is_tar = (df["targettype"] == "target").values + model_set, _, model_idx = np.unique( + models, return_index=True, return_inverse=True + ) + seg_set, _, seg_idx = np.unique( + segments, return_index=True, return_inverse=True + ) + tar = np.zeros((len(model_set), len(seg_set)), dtype="bool") + non = np.zeros((len(model_set), len(seg_set)), dtype="bool") + for i, j, target_type in zip(model_idx, seg_idx, is_tar): + if target_type: + tar[i, j] = True + else: + non[i, j] = True + return cls(model_set, seg_set, tar, non) + @classmethod def merge(cls, key_list): """Merges several key objects. 
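
Editor's note: the table-based trial format added above is shared by TrialKey and SparseTrialKey: a csv/tsv file with a modelid, segmentid, targettype header. Below is a minimal round-trip sketch under that assumption; the speaker, segment, and file names are hypothetical and only for illustration, not part of the patch.

    import numpy as np
    from hyperion.utils.trial_key import TrialKey

    # Toy key with one target and one nontarget trial (hypothetical ids).
    model_set = np.array(["spk1", "spk2"])
    seg_set = np.array(["utt1", "utt2", "utt3"])
    tar = np.zeros((2, 3), dtype=bool)
    non = np.zeros((2, 3), dtype=bool)
    tar[0, 0] = True   # spk1 vs utt1 -> target
    non[1, 2] = True   # spk2 vs utt3 -> nontarget

    key = TrialKey(model_set, seg_set, tar, non)
    key.save("trials.tsv")              # .tsv extension selects the tab-separated table format
    key2 = TrialKey.load("trials.tsv")  # routed to load_table() by the extension

Files with no extension or a .txt extension keep the old space-separated format, and .h5/.hdf5 files still go through the hdf5 path, as in the dispatch logic shown above.
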
From df8a24fc651240ab2fe193f900f624282e8fa9e0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 20 May 2023 22:05:30 +0000 Subject: [PATCH 29/89] add initialize model for joint-training and film training --- ...lize_model.py => initailize_film_model.py} | 1 + .../v1/local/initailize_lid_model.py | 49 +++++++++++++++++++ 2 files changed, 50 insertions(+) rename egs/commonvoice/v1/local/{initailize_model.py => initailize_film_model.py} (99%) create mode 100644 egs/commonvoice/v1/local/initailize_lid_model.py diff --git a/egs/commonvoice/v1/local/initailize_model.py b/egs/commonvoice/v1/local/initailize_film_model.py similarity index 99% rename from egs/commonvoice/v1/local/initailize_model.py rename to egs/commonvoice/v1/local/initailize_film_model.py index 7ae9db8e..2b15c236 100644 --- a/egs/commonvoice/v1/local/initailize_model.py +++ b/egs/commonvoice/v1/local/initailize_film_model.py @@ -1,4 +1,5 @@ import torch +import sys # arguments example # pretrained_model = 'exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0010.pth' diff --git a/egs/commonvoice/v1/local/initailize_lid_model.py b/egs/commonvoice/v1/local/initailize_lid_model.py new file mode 100644 index 00000000..9a2c1a06 --- /dev/null +++ b/egs/commonvoice/v1/local/initailize_lid_model.py @@ -0,0 +1,49 @@ +import torch +import sys +# arguments example +# ASR_model = 'exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0010.pth' +# LID_model = "exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned_film.v1.0_13_langs_weighted_8000_bpe.s1_initial/model_ep0000.pth" +# output_model = "model_initialized.pth" + +# python local/initailize_lid_model.py /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0010.pth /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/resnet1d_nnets/wav2vec2xlsr300m_resnet1d_v4.2_13_langs.s1/model_ep0003.pth /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/resnet1d_nnets/wav2vec2xlsr300m_resnet1d_v4.2_13_langs.s3/model_ep0001.pth + +ASR_model = torch.load(sys.argv[1]) +LID_model = torch.load(sys.argv[2]) + +output_model = sys.argv[3] + + +def copy_model_parameters(ASR_model, LID_model): + ASR_state_dict = ASR_model["model_state_dict"] + LID_state_dict = LID_model["model_state_dict"] + + update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in LID_state_dict and param.shape == LID_state_dict[name].shape and "hf_feats" in name} + # remove feature fuser + + new_LID_state_dict = LID_state_dict.copy() + new_LID_state_dict.update(update_state_dict) + + LID_model["model_state_dict"] = new_LID_state_dict + + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in LID_state_dict.items(): + if torch.all(torch.eq(param, new_LID_state_dict[name])): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + + for name, param in ASR_state_dict.items(): + if name not in changed_parameters: + unloaded_parameters.append(name) + + print(f"Unchanged parameters: {unchanged_parameters}") + print(f"Unloaded parameters: {unloaded_parameters}") + print(f"Changed parameters: {changed_parameters}") + LID_model["epoch"] =1 + torch.save(LID_model, output_model) + + + +copy_model_parameters(ASR_model, LID_model) \ No newline at end of file From 159ff073e14fb0e13101afc8c71de6bb4bffda2e Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 21 May 2023 
05:46:18 +0000 Subject: [PATCH 30/89] Add transducer and languageid joint training --- ..._k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml | 84 +++++++ ...nfig_pruned_transducer_lid_v1.0_13langs.sh | 48 ++++ .../finetune_wav2vec2transducer_languageid.py | 123 ++++++++-- .../hf_wav2rnn_transducer_languageid.py | 213 ++++++++++-------- .../hf_wav2vec2rnn_transducer_languageid.py | 53 ++++- .../trainers/transducer_languageid_trainer.py | 3 +- 6 files changed, 395 insertions(+), 129 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml new file mode 100644 index 00000000..972f7c1c --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml @@ -0,0 +1,84 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.0002 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.01 + data_loader: + num_workers: 8 +model: + transducer: + decoder: + prune_range: 15 + override_dropouts: false + languageid: + # resnet_enc: + # num_classes: 13 + cos_scale: 32.0 + + loss_weight_transducer: 0.005 + loss_weight_lid: 1.0 + lid_length: 3.0 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh new file mode 100644 index 00000000..b4437442 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh @@ -0,0 +1,48 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer_resnet1d + +# nnet_s1_transducer_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml +# nnet_s1_transducer_args="" + +nnet_transducer_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2 +nnet_transducer_dir=exp/transducer_nnets/$nnet_transducer_name +nnet_transducer=$nnet_transducer_dir/model_ep0008.pth + +nnet_lid_name=${hf_model_name}_resnet1d_v4.0_13_langs.s3 +nnet_lid_dir=exp/resnet1d_nnets/$nnet_lid_name +nnet_lid=$nnet_lid_dir/model_ep0003.pth + +nnet_name=${hf_model_name}_rnnt_k2_pruned_resnet1d.v1.0_13_langs_8000_bpe + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/bin/finetune_wav2vec2transducer_languageid.py b/hyperion/bin/finetune_wav2vec2transducer_languageid.py index 0628f3da..68d8dacf 100755 --- a/hyperion/bin/finetune_wav2vec2transducer_languageid.py +++ b/hyperion/bin/finetune_wav2vec2transducer_languageid.py @@ -37,13 +37,13 @@ def transducer_language_collate(batch): audio = [] audio_length = [] - text = [] + target = [] language = [] for record in batch: wav = 
torch.as_tensor(record["x"]) audio.append(wav) audio_length.append(wav.shape[0]) - text.append(record["text"]) + target.append(record["text"]) language.append(record["language"]) audio = pad_sequence(audio).transpose(0, 1) audio_length = torch.as_tensor(audio_length) @@ -52,21 +52,24 @@ def transducer_language_collate(batch): sort_idx = torch.argsort(audio_length, descending=True) audio = audio[sort_idx] audio_length = audio_length[sort_idx] - text = [text[k] for k in sort_idx] - text = k2.RaggedTensor(text) + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + language = [language[k] for k in sort_idx] language = torch.as_tensor(language) + # FiLM: add language ID to the input batch = { "x": audio, "x_lengths": audio_length, - "text": text, - "languageid": language, + "text": target, + "language": language, } return batch + def init_data(partition, rank, num_gpus, **kwargs): data_kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**data_kwargs["dataset"]) @@ -97,25 +100,95 @@ def init_data(partition, rank, num_gpus, **kwargs): return data_loader -def init_model(num_classes, in_model_transducer, in_model_lid, rank, model_class, **kwargs): +def check_update_parameters(joint_state_dict, new_joint_state_dict, rank): + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in joint_state_dict.items(): + new_param = new_joint_state_dict[name].to(param.device) + if torch.all(torch.eq(param, new_param)): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + # logging + if rank == 0: + logging.info("Unchanged parameters: {}".format(unchanged_parameters)) + logging.info("Changed parameters: {}".format(changed_parameters)) + + +def remove_module_from_state_dict(state_dict): + new_state_dict = {} + for name, param in state_dict.items(): + if name.startswith("module."): + new_state_dict[name[len("module."):]] = param + else: + new_state_dict[name] = param + return new_state_dict + + +def copy_model_parameters(joint_model, wav2transducer_state_dict, wav2lid_state_dict, rank): + joint_state_dict = joint_model.state_dict() + wav2transducer_state_dict = remove_module_from_state_dict(wav2transducer_state_dict) + wav2lid_state_dict = remove_module_from_state_dict(wav2lid_state_dict) + + + hf_feats_update_state_dict = {name: param for name, param in wav2transducer_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "hf_feats" in name} + transducer_update_state_dict = {name: param for name, param in wav2transducer_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "transducer" in name} + languageid_update_state_dict = {name: param for name, param in wav2lid_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "languageid" in name} + + new_joint_state_dict = joint_state_dict.copy() + new_joint_state_dict.update(hf_feats_update_state_dict) + new_joint_state_dict.update(transducer_update_state_dict) + new_joint_state_dict.update(languageid_update_state_dict) + + new_joint_state_dict["transducer_fuser"] = wav2transducer_state_dict["feat_fuser"] + new_joint_state_dict["languageid_fuser"] = wav2lid_state_dict["feat_fuser"] + + + check_update_parameters(joint_state_dict, new_joint_state_dict, rank) + joint_model.load_state_dict(new_joint_state_dict) + +def init_model(in_model_transducer, in_model_lid, rank, model_class, **kwargs): + # load pretrained models + 
model_wav2transducer = torch.load(in_model_transducer) + model_wav2lid = torch.load(in_model_lid) + if rank == 0: + logging.info("init joint model") + logging.info("hf_feats network ft args={}".format(model_wav2transducer["model_cfg"]["hf_feats"])) + logging.info("transducer network ft args={}".format(model_wav2transducer["model_cfg"]["transducer"])) + logging.info("languageid network ft args={}".format(model_wav2lid["model_cfg"]["languageid"])) + logging.info("feat_fusion_start={}".format(model_wav2transducer["model_cfg"]["feat_fusion_start"])) + logging.info("feat_fusion_method_transducer={}".format(model_wav2transducer["model_cfg"]["feat_fusion_method"])) + logging.info("feat_fusion_method_languageid={}".format(model_wav2lid["model_cfg"]["feat_fusion_method"])) + + # init joint model + model = model_class(hf_feats=model_wav2transducer["model_cfg"]["hf_feats"], + transducer=model_wav2transducer["model_cfg"]["transducer"], + languageid=model_wav2lid["model_cfg"]["languageid"], + feat_fusion_start=model_wav2transducer["model_cfg"]["feat_fusion_start"], + feat_fusion_method_transducer=model_wav2transducer["model_cfg"]["feat_fusion_method"], + feat_fusion_method_languageid=model_wav2lid["model_cfg"]["feat_fusion_method"], + loss_weight_transducer=kwargs["model"]["loss_weight_transducer"], + loss_weight_lid=kwargs["model"]["loss_weight_lid"], + lid_length=kwargs["model"]["lid_length"], + ) + + copy_model_parameters(model, model_wav2transducer["model_state_dict"], model_wav2lid["model_state_dict"], rank) + + + # add finetune args model_args = model_class.filter_finetune_args(**kwargs["model"]) + # model_args = model_class.filter_args(**kwargs["model"]) if rank == 0: logging.info("model network ft args={}".format(model_args)) - model_wav2transducer = TML.load(in_model_transducer) - model_wav2lid = TML.load(in_model_lid) - model_args["languageid"]["num_classes"] = num_classes - logging.info(model_args) - model = model_class(model_wav2transducer.hf_feats, model_wav2transducer.transducer, model_wav2lid.languageid) + model_args["languageid"]["num_classes"] = model_wav2lid["model_cfg"]["languageid"]["num_classes"] model.change_config(**model_args) if rank == 0: logging.info("model={}".format(model)) return model - - - def train_model(gpu_id, args): config_logger(args.verbose) @@ -126,24 +199,24 @@ def train_model(gpu_id, args): torch.manual_seed(args.seed) set_float_cpu("float32") - ddp_args = ddp.filter_ddp_args(**kwargs) - device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs["rank"] = rank + # ddp_args = ddp.filter_ddp_args(**kwargs) + # device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + # kwargs["rank"] = rank - # # for Debug - # rank = 0 - # kwargs["rank"] = 0 - # device = "cpu" - # world_size=1 + # for Debug + rank = 0 + kwargs["rank"] = 0 + device = torch.device("cuda:0") + world_size=1 train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + model = init_model(**kwargs) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {} + metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index b9f39de8..90211ec9 100644 --- 
a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -15,15 +15,18 @@ from ...torch_model import TorchModel from ...utils import remove_silence from ..transducer import RNNTransducer, RNNTransducerOutput +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID @dataclass class RNNTransducerLanguageIDOutput(HypDataClass): - loss: torch.Tensor - loss_transducer: torch.Tensor - loss_lid: torch.Tensor - loss_transducer_simple: Optional[torch.Tensor] = None - loss_transducer_pruned: Optional[torch.Tensor] = None - h_feats: Optional[List[torch.Tensor]] = None + loss: torch.Tensor # Total loss + loss_transducer: torch.Tensor # Loss from the transducer + loss_lid: torch.Tensor # Loss from the language ID + loss_transducer_simple: Optional[torch.Tensor] = None # Simple loss from the transducer, if available + loss_transducer_pruned: Optional[torch.Tensor] = None # Pruned loss from the transducer, if available + h_feats: Optional[List[torch.Tensor]] = None # Hidden features, if available + logits: Optional[torch.Tensor] = None # Logits from languageid, if available + class HFWav2RNNTransducerLanguageID(TorchModel): """Abstract Base class for combined transducer language identification models that use a Hugging Face Model as feature extractor. @@ -40,61 +43,76 @@ class HFWav2RNNTransducerLanguageID(TorchModel): def __init__(self, hf_feats: TorchModel, - transducer: TorchModel, - languageid: TorchModel, - transducer_fuser: TorchModel, - languageid_fuser: TorchModel, + transducer: Union[Dict, TorchModel], + languageid: Union[Dict, TorchModel], feat_fusion_start: int = 0, - feat_fusion_method: str = "weighted-avg", + feat_fusion_method_transducer: str = "weighted-avg", + feat_fusion_method_languageid: str = "weighted-avg", loss_weight_transducer: float = 0.005, - loss_weight_lid: float = 1.0,): + loss_weight_lid: float = 1.0, + lid_length: float = 3.0, + ): super().__init__() self.hf_feats = hf_feats - # if isinstance(transducer, dict): - # transducer["decoder"]["in_feats"] = hf_feats.hidden_size - # #transducer["joiner"]["in_feats"] = hf_feats.hidden_size - # if "class_name" in transducer: - # del transducer["class_name"] - - # transducer["encoder"] = None - # transducer = RNNTransducer(**transducer) - # else: - # assert isinstance(transducer, RNNTransducer) - # if transducer.encoder is None: - # assert transducer.decoder.in_feats == hf_feats.hidden_size - # #assert transducer.joiner.in_feats == hf_feats.hidden_size + if isinstance(transducer, dict): + transducer["decoder"]["in_feats"] = hf_feats.hidden_size + #transducer["joiner"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + + transducer["encoder"] = None + transducer = RNNTransducer(**transducer) + else: + assert isinstance(transducer, RNNTransducer) + if transducer.encoder is None: + assert transducer.decoder.in_feats == hf_feats.hidden_size + #assert transducer.joiner.in_feats == hf_feats.hidden_size + + if isinstance(languageid, dict): + languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size + if "class_name" in languageid: + del languageid["class_name"] + languageid = ResNet1dLanguageID(**languageid) + else: + assert isinstance(languageid, ResNet1dLanguageID) + assert languageid.encoder_net.in_feats == hf_feats.hidden_size + self.transducer = transducer self.languageid = languageid - self.transducer_fuser = transducer_fuser - self.languageid_fuser = 
languageid_fuser self.feat_fusion_start = feat_fusion_start - self.feat_fusion_method = feat_fusion_method + self.feat_fusion_method_transducer = feat_fusion_method_transducer + self.feat_fusion_method_languageid = feat_fusion_method_languageid self.loss_weight_transducer = loss_weight_transducer self.loss_weight_lid = loss_weight_lid + self.lid_length = lid_length self._hf_context = contextlib.nullcontext() - - # def _make_fuser(self, transducer_fuser, languageid_fuser): - # if self.feat_fusion_method == "last": - # self.feat_fuser = None - # return - - # num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start - # layer_dim = self.hf_feats.hidden_size - # if self.feat_fusion_method == "weighted-avg": - # self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) - # elif self.feat_fusion_method == "linear": - # self.feat_fuser = nn.Linear(num_layers, 1, bias=False) - # self.feat_fuser.weight.data = torch.ones(1, - # num_layers) / num_layers - # elif self.feat_fusion_method == "cat": - # self.feat_fuser = nn.Linear(num_layers * layer_dim, - # layer_dim, - # bias=False) - - def _fuse_hid_feats(self, hid_feats): + self.transducer_fuser = self._make_fuser(self.feat_fusion_method_transducer) + self.languageid_fuser = self._make_fuser(self.feat_fusion_method_languageid) + + def _make_fuser(self, method): + if method == "last": + feat_fuser = None + return feat_fuser + num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + layer_dim = self.hf_feats.hidden_size + if method == "weighted-avg": + feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif method == "linear": + feat_fuser = nn.Linear(num_layers, 1, bias=False) + feat_fuser.weight.data = torch.ones(1, + num_layers) / num_layers + elif method == "cat": + feat_fuser = nn.Linear(num_layers * layer_dim, + layer_dim, + bias=False) + + return feat_fuser + + + def _fuse_hid_feats(self, hid_feats, feat_fusion_method, feat_fuser): """Fuses the hidden features from the Wav2Vec model. 
Args: @@ -108,25 +126,20 @@ def _fuse_hid_feats(self, hid_feats): return hid_feats[0] hid_feats = hid_feats[self.feat_fusion_start:] - if self.feat_fusion_method == "weighted-avg": + if feat_fusion_method == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) - norm_weights_transducer = nn.functional.softmax(self.transducer_fuser, dim=-1) - norm_weights_languageid = nn.functional.softmax(self.languageid_fuser, dim=-1) - feats_transducer = torch.sum(hid_feats * norm_weights_transducer, dim=-1) - feats_languageid = torch.sum(hid_feats * norm_weights_languageid, dim=-1) - elif self.feat_fusion_method == "linear": + norm_weights = nn.functional.softmax(feat_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif feat_fusion_method == "linear": hid_feats = torch.stack(hid_feats, dim=-1) - feats_transducer = self.transducer_fuser(hid_feats).squeeze(dim=-1) - feats_languageid = self.languageid_fuser(hid_feats).squeeze(dim=-1) - elif self.feat_fusion_method == "cat": + feats = feat_fuser(hid_feats).squeeze(dim=-1) + elif feat_fusion_method == "cat": hid_feats = torch.cat(hid_feats, dim=-1) - feats_transducer = self.transducer_fuser(hid_feats) - feats_languageid = self.languageid_fuser(hid_feats) - elif self.feat_fusion_method == "last": - feats_transducer = hid_feats[-1] - feats_languageid = hid_feats[-1] + feats = feat_fuser(hid_feats) + elif feat_fusion_method == "last": + feats = hid_feats[-1] - return feats_transducer, feats_languageid + return feats def forward_feats(self, x, @@ -135,7 +148,7 @@ def forward_feats(self, chunk_length=0, detach_chunks=False): return_hid_states = (False if return_feat_layers is None - and self.feat_fusion_method == "last" else True) + and self.feat_fusion_method_transducer == "last" else True) with self._hf_context: hf_output = self.hf_feats( x, @@ -147,7 +160,8 @@ def forward_feats(self, feat_lengths = hf_output["hidden_states_lengths"] if return_hid_states: hid_feats = hf_output["hidden_states"] - feats_transducer, feats_languageid = self._fuse_hid_feats(hid_feats) + feats_transducer = self._fuse_hid_feats(hid_feats, self.feat_fusion_method_transducer, self.transducer_fuser) + feats_languageid = self._fuse_hid_feats(hid_feats, self.feat_fusion_method_languageid, self.languageid_fuser) else: hid_feats = None feats_transducer = hf_output["last_hidden_state"] @@ -166,6 +180,25 @@ def forward_feats(self, hid_feats = None return feats_transducer, feats_languageid, hid_feats, feat_lengths + + def languageid_chunk(self, feats, lengths): + sr = self.hf_feats.get_config()["sample_frequency"] + strides = self.hf_feats.get_config()["conv_stride"] + + total_stride = torch.prod(torch.tensor(strides, dtype=torch.float32)) + + chunk_length = int(self.lid_length * sr / total_stride) + + # Check if all samples are longer than chunk_length + if any(len < chunk_length for len in lengths): + return feats + + start_indices = [torch.randint(0, len - chunk_length + 1, (1,)).item() for len in lengths] + + chunks = torch.stack([feats[i, :, start:start + chunk_length] for i, start in enumerate(start_indices)]) + + return chunks + def forward( self, @@ -199,6 +232,8 @@ def forward( feats_transducer, feats_languageid, hid_feats, feat_lengths = self.forward_feats( x, x_lengths, return_feat_layers) + feats_languageid = self.languageid_chunk(feats_languageid, feat_lengths) + feats_transducer = feats_transducer.permute(0, 2, 1) # (N, C, T) ->(N, T, C) logits = self.languageid( @@ -221,7 +256,13 @@ def forward( if return_feat_layers: trans_output.h_feats = hid_feats 
- output = RNNTransducerLanguageIDOutput(self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid, trans_output.loss, loss_lid,trans_output.loss_simple, trans_output.loss_pruned,trans_output.h_feats) + output = RNNTransducerLanguageIDOutput(loss=self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid, + loss_transducer=trans_output.loss, + loss_lid=loss_lid, + loss_transducer_simple=trans_output.loss_simple, + loss_transducer_pruned=trans_output.loss_pruned, + h_feats=trans_output.h_feats, + logits=logits if return_logits else None) return output def infer(self, @@ -255,16 +296,16 @@ def infer(self, max_sym_per_utt=max_sym_per_utt) return y - def freeze_feat_fuser(self): - if self.feat_fuser is None: - return + # def freeze_feat_fuser(self): + # if self.feat_fuser is None: + # return - if self.feat_fusion_method == "weighted-avg": - self.feat_fuser.requires_grad = False - return + # if self.feat_fusion_method_transducer == "weighted-avg": + # self.feat_fuser.requires_grad = False + # return - for param in self.feat_fuser.parameters(): - param.requires_grad = False + # for param in self.feat_fuser.parameters(): + # param.requires_grad = False def freeze_hf_feats(self): self.hf_feats.freeze() @@ -341,6 +382,7 @@ def filter_args(**kwargs): "feat_fusion_method", "loss_weight_transducer", "loss_weight_lid", + "languageid", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args @@ -357,9 +399,11 @@ def get_config(self): "transducer": tran_cfg, "languageid": lid_cfg, "feat_fusion_start": self.feat_fusion_start, - "feat_fusion_method": self.feat_fusion_method, + "feat_fusion_method_transducer": self.feat_fusion_method_transducer, + "feat_fusion_method_lid": self.feat_fusion_method_lid, "loss_weight_transducer": self.loss_weight_transducer, "loss_weight_lid": self.loss_weight_lid, + "lid_length": self.lid_length, } base_config = super().get_config() @@ -395,25 +439,6 @@ def add_class_args(parser, prefix=None, skip=set()): "in [weighted-avg, linear, cat, last]"), ) - parser.add_argument( - "--loss-weight-transducer", - default=0.005, - type=float, - help=""" - The weight of the transducer loss - """, - ) - - parser.add_argument( - "--loss-weight-lid", - default=1.0, - type=float, - help=""" - The weight of the lid loss - """, - ) - - if prefix is not None: outer_parser.add_argument( diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py index 4fa19144..10bdc53b 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py @@ -39,15 +39,19 @@ def __init__( transducer: Union[Dict, RNNTransducer], languageid: Union[Dict, ResNet1dLanguageID], feat_fusion_start: int = 0, - feat_fusion_method: str = "weighted-avg", + feat_fusion_method_transducer: str = "weighted-avg", + feat_fusion_method_languageid: str = "weighted-avg", + loss_weight_transducer: float = 0.005, + loss_weight_lid: float = 1.0, + lid_length: float = 3.0, ): - # if isinstance(hf_feats, dict): - # if "class_name" in hf_feats: - # del hf_feats["class_name"] - # hf_feats = HFWav2Vec2(**hf_feats) - # else: - # assert isinstance(hf_feats, HFWav2Vec2) + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert 
isinstance(hf_feats, HFWav2Vec2) # if isinstance(languageid, dict): # languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size @@ -64,7 +68,7 @@ def __init__( super().__init__(hf_feats, transducer, languageid, feat_fusion_start, - feat_fusion_method) + feat_fusion_method_transducer, feat_fusion_method_languageid, loss_weight_transducer, loss_weight_lid, lid_length) @staticmethod def filter_args(**kwargs): @@ -96,6 +100,12 @@ def add_class_args(parser, prefix=None): @staticmethod def filter_finetune_args(**kwargs): base_args = {} + + valid_args = ( + "loss_weight_transducer", + "loss_weight_lid", + "lid_length", + ) child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) base_args["hf_feats"] = child_args child_args = RNNTransducer.filter_finetune_args(**kwargs["transducer"]) @@ -110,6 +120,33 @@ def add_finetune_args(parser, prefix=None): outer_parser = parser parser = ArgumentParser(prog="") + parser.add_argument( + "--loss-weight-transducer", + default=0.005, + type=float, + help=""" + The weight of the transducer loss + """, + ) + + parser.add_argument( + "--loss-weight-lid", + default=1.0, + type=float, + help=""" + The weight of the lid loss + """, + ) + + parser.add_argument( + "--lid-length", + default=3.0, + type=float, + help=""" + The length of the chunks for language id + """, + ) + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") RNNTransducer.add_finetune_args(parser, prefix="transducer") ResNet1dLanguageID.add_finetune_args(parser, prefix="languageid") diff --git a/hyperion/torch/trainers/transducer_languageid_trainer.py b/hyperion/torch/trainers/transducer_languageid_trainer.py index 2e9df702..8a06ebda 100644 --- a/hyperion/torch/trainers/transducer_languageid_trainer.py +++ b/hyperion/torch/trainers/transducer_languageid_trainer.py @@ -135,9 +135,8 @@ def train_epoch(self, data_loader): for k, v in output.items(): if "loss" in k and v is not None: batch_metrics[k] = output[k].item() - for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output["logits"], languageid) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics From 29fdfb7e45bd089de6a5fbfaf50cf11efebcbd03 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 21 May 2023 05:55:00 +0000 Subject: [PATCH 31/89] add asr_lid run script --- egs/commonvoice/v1/run_020_train_asr_lid.sh | 140 ++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100755 egs/commonvoice/v1/run_020_train_asr_lid.sh diff --git a/egs/commonvoice/v1/run_020_train_asr_lid.sh b/egs/commonvoice/v1/run_020_train_asr_lid.sh new file mode 100755 index 00000000..67ee65d4 --- /dev/null +++ b/egs/commonvoice/v1/run_020_train_asr_lid.sh @@ -0,0 +1,140 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +# export CUDA_VISIBLE_DEVICES=0 + +#ml purge +#module load namd/2.14-cuda-smp +#module load cuda/11.6.0 +#ml +#nvidia-smi +#export CUDA_VISIBLE_DEVICES=0,1,2,3 +#export CONV_RSH=ssh +#export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH + + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_dir=data/${nnet_data}/ +val_dir=data/${dev_data}/ + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" + extra_args="--data.val.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# # Network Training +# if [ $stage -le 1 ]; then + +# mkdir -p $nnet_s1_dir/log +# $cuda_cmd \ +# --gpu $ngpu $nnet_s1_dir/log/train.log \ +# hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ +# train_wav2vec2rnn_transducer.py $nnet_type \ +# --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ +# --data.train.dataset.audio-file $train_dir/wav.scp \ +# --data.train.dataset.segments-file $train_dir/utt2seg.csv \ +# --data.train.dataset.class-names "language" \ +# --data.train.dataset.class-files $train_dir/langs \ +# --data.train.dataset.bpe-model $bpe_model \ +# --data.train.dataset.text-file $train_dir/text \ +# --data.val.dataset.audio-file $val_dir/wav.scp \ +# --data.val.dataset.segments-file $val_dir/utt2seg.csv \ +# --data.val.dataset.class-names "language" \ +# --data.val.dataset.class-files $train_dir/langs \ +# --data.val.dataset.text-file $val_dir/text \ +# --trainer.exp-path $nnet_s1_dir $args \ +# --data.train.dataset.time-durs-file $train_dir/utt2dur \ +# --data.val.dataset.time-durs-file $val_dir/utt2dur \ +# --num-gpus $ngpu + +# fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer_languageid.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s2_dir $args \ + --in-model-transducer $nnet_transducer \ + --in-model-lid $nnet_lid \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.bpe-model $bpe_model \ + 
--data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s3_dir $args \ + --in-model-file $nnet_s2 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu +fi + From 3d33522319c94b7b5ab3f7a1631a8e64260a7e57 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 21 May 2023 13:49:09 +0000 Subject: [PATCH 32/89] update film model --- ...2base_rnnt_film_k2_pruned_stage1_v2.0.yaml | 91 +++++++++++ ...g_pruned_filmed_transducer_v2.0_13langs.sh | 50 ++++++ hyperion/torch/layer_blocks/__init__.py | 2 +- hyperion/torch/layer_blocks/film_blocks.py | 69 +++++++- .../layer_blocks/transducer_film_predictor.py | 152 +++++++++++++++++- .../narchs/rnn_film_transducer_decoder.py | 7 +- 6 files changed, 358 insertions(+), 13 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v2.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v2.0.yaml new file mode 100644 index 00000000..ba71c8ff --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v2.0.yaml @@ -0,0 +1,91 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40 + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm_residual + joiner: + hid_feats: 512 + feat_fusion_method: film-weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh new file mode 100644 index 00000000..e056cf03 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh @@ -0,0 +1,50 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + +nnet_transducer_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe +nnet_s2_transducer_name=$nnet_transducer_name.s2 +nnet_s2_transducer_dir=exp/transducer_nnets/$nnet_s2_transducer_name +nnet_rnn_transducer=$nnet_s2_transducer_dir/model_ep0010.pth + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v2.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0007.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/torch/layer_blocks/__init__.py b/hyperion/torch/layer_blocks/__init__.py index 62c096b2..61d97285 100644 --- a/hyperion/torch/layer_blocks/__init__.py +++ b/hyperion/torch/layer_blocks/__init__.py @@ -9,7 +9,7 @@ from 
.dc2d_blocks import DC2dDecBlock, DC2dEncBlock from .etdnn_blocks import ETDNNBlock from .fc_blocks import FCBlock -from .film_blocks import FiLM, LSTMWithFiLM +from .film_blocks import FiLM, RNNWithFiLM, RNNWithFiLMResidual from .mbconv_blocks import MBConvBlock, MBConvInOutBlock from .res2net1d_blocks import Res2Net1dBasicBlock, Res2Net1dBNBlock from .res2net2d_blocks import Res2Net2dBasicBlock, Res2Net2dBNBlock diff --git a/hyperion/torch/layer_blocks/film_blocks.py b/hyperion/torch/layer_blocks/film_blocks.py index 5caeab76..9503fcfe 100644 --- a/hyperion/torch/layer_blocks/film_blocks.py +++ b/hyperion/torch/layer_blocks/film_blocks.py @@ -10,6 +10,7 @@ def __init__(self, input_size, condition_size): self.linear_shift = nn.Linear(condition_size, input_size) def forward(self, x, lang_condition): + # import pdb; pdb.set_trace() if x.ndim == 3: gamma = self.linear_scale(lang_condition).unsqueeze(1).expand_as(x) beta = self.linear_shift(lang_condition).unsqueeze(1).expand_as(x) @@ -22,28 +23,80 @@ def forward(self, x, lang_condition): -class LSTMWithFiLM(nn.Module): - def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True): - super(LSTMWithFiLM, self).__init__() +class RNNWithFiLM(nn.Module): + def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm"): + super(RNNWithFiLM, self).__init__() self.input_size = input_size self.hidden_size = hidden_size self.num_layers = num_layers self.dropout = dropout - self.batch_first = batch_first + self.batch_first = batch_first + self.rnn_type = rnn_type + if self.rnn_type == "lstm": + self.lstms = nn.ModuleList([nn.LSTM(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) + elif self.rnn_type == "gru": + self.grus = nn.ModuleList([nn.GRU(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) + self.films = nn.ModuleList([FiLM(hidden_size, condition_size) for _ in range(num_layers)]) + self.dropout_layer = nn.Dropout(dropout) + + def forward(self, x, states, lang_condition): + outputs = [] + new_h, new_c = [], [] + if self.rnn_type == "lstm": + rnns = self.lstms + elif self.rnn_type == "gru": + rnns = self.grus + + for i, (rnn, film) in enumerate(zip(rnns, self.films)): + if states: + x, (h_i, c_i) = rnn(x, (states[0][i].unsqueeze(0), states[1][i].unsqueeze(0))) + else: + x, (h_i, c_i) = rnn(x) + x = film(x, lang_condition) + new_h.append(h_i) + new_c.append(c_i) + if i != self.num_layers - 1: + x = self.dropout_layer(x) + outputs.append(x) + new_h = torch.cat(new_h, dim=0) + new_c = torch.cat(new_c, dim=0) + return x, (new_h, new_c) - self.lstms = nn.ModuleList([nn.LSTM(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) + +class RNNWithFiLMResidual(nn.Module): + def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm_residual"): + super(RNNWithFiLMResidual, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dropout = dropout + self.batch_first = batch_first + self.rnn_type = rnn_type + if self.rnn_type == "lstm_residual": + self.lstms = nn.ModuleList([nn.LSTM(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) + elif self.rnn_type == "gru_residual": + self.grus = nn.ModuleList([nn.GRU(input_size if i==0 else 
hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) self.films = nn.ModuleList([FiLM(hidden_size, condition_size) for _ in range(num_layers)]) self.dropout_layer = nn.Dropout(dropout) def forward(self, x, states, lang_condition): outputs = [] new_h, new_c = [], [] - for i, (lstm, film) in enumerate(zip(self.lstms, self.films)): + + if self.rnn_type == "lstm_residual": + rnns = self.lstms + elif self.rnn_type == "gru_residual": + rnns = self.grus + + for i, (rnn, film) in enumerate(zip(rnns, self.films)): if states: - x, (h_i, c_i) = lstm(x, (states[0][i].unsqueeze(0), states[1][i].unsqueeze(0))) + x, (h_i, c_i) = rnn(x, (states[0][i].unsqueeze(0), states[1][i].unsqueeze(0))) else: - x, (h_i, c_i) = lstm(x) + x, (h_i, c_i) = rnn(x) x = film(x, lang_condition) + if i != 0: + x = x + residual + residual = x new_h.append(h_i) new_c.append(c_i) if i != self.num_layers - 1: diff --git a/hyperion/torch/layer_blocks/transducer_film_predictor.py b/hyperion/torch/layer_blocks/transducer_film_predictor.py index dbb93218..cb628a2c 100644 --- a/hyperion/torch/layer_blocks/transducer_film_predictor.py +++ b/hyperion/torch/layer_blocks/transducer_film_predictor.py @@ -12,7 +12,7 @@ from ...utils.misc import filter_func_args from ..layers import ActivationFactory as AF -from .film_blocks import FiLM, LSTMWithFiLM +from .film_blocks import FiLM, RNNWithFiLM, RNNWithFiLMResidual class TransducerRNNFiLMPredictor(nn.Module): """ RNN-T prediction network with LSTM or GRU @@ -46,14 +46,25 @@ def __init__(self, padding_idx=blank_id, ) self.embed_dropout = nn.Dropout(embed_dropout_rate) - if rnn_type == "lstm": - self.rnn = LSTMWithFiLM( + if rnn_type in ["lstm","gru"]: + self.rnn = RNNWithFiLM( input_size=embed_dim, hidden_size=hid_feats, num_layers=num_layers, dropout=rnn_dropout_rate, condition_size=condition_size, batch_first=True, + rnn_type=rnn_type + ) + elif rnn_type in ["lstm_residual","gru_residual"]: + self.rnn = RNNWithFiLMResidual( + input_size=embed_dim, + hidden_size=hid_feats, + num_layers=num_layers, + dropout=rnn_dropout_rate, + condition_size=condition_size, + batch_first=True, + rnn_type=rnn_type ) else: raise Exception(f"Unknown RNN type {rnn_type}") @@ -126,3 +137,138 @@ def change_config( self.rnn.p = self.rnn_dropout_rate self.embed_dropout_rate = embed_dropout_rate self.embed_dropout = nn.Dropout(self.embed_dropout_rate) + +class TransducerConvPredictor(nn.Module): + """ RNN-T prediction network based on Convolutions + Implmentation based on: + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7/decoder.py + + Attributes: + vocab_size: Number of tokens of the modeling unit including blank. + embed_dim: Dimension of the input embedding. + blank_id: The ID of the blank symbol. + out_feats: Output dimension of the predictor. + embed_dropout_rate: Dropout rate for the embedding layer. 
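The FiLM blocks in film_blocks.py above condition each recurrent layer on a language embedding by predicting a per-feature scale and shift from the condition vector. A minimal sketch of the operation, with assumed shapes (illustrative, not the patch's code):

import torch
import torch.nn as nn

# Feature-wise linear modulation: y = gamma(cond) * x + beta(cond)
class FiLMSketch(nn.Module):
    def __init__(self, feat_dim, cond_dim):
        super().__init__()
        self.scale = nn.Linear(cond_dim, feat_dim)
        self.shift = nn.Linear(cond_dim, feat_dim)

    def forward(self, x, cond):
        # x: (batch, time, feat_dim); cond: (batch, cond_dim), e.g. a language embedding
        gamma = self.scale(cond).unsqueeze(1)   # (batch, 1, feat_dim), broadcast over time
        beta = self.shift(cond).unsqueeze(1)
        return gamma * x + beta

x = torch.randn(2, 10, 512)
cond = torch.randn(2, 256)
y = FiLMSketch(512, 256)(x, cond)               # same shape as x

In RNNWithFiLM and RNNWithFiLMResidual above, one such block follows every LSTM/GRU layer, so the language condition can modulate each layer's activations; the residual variant additionally adds the previous layer's output back in.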
+ """ + + def __init__( + self, + vocab_size: int, + embed_dim: int, + condition_size: int, + out_feats: Optional[int] = None, + context_size: int = 2, + embed_dropout_rate: float = 0.0, + hid_act: str = "relu", + blank_id: int = 0, + ): + super().__init__() + self.embedding = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=embed_dim, + padding_idx=blank_id, + ) + self.embed_dropout = nn.Dropout(embed_dropout_rate) + assert context_size >= 1, context_size + if context_size > 1: + self.conv = nn.Conv1d( + in_channels=embed_dim, + out_channels=embed_dim, + kernel_size=context_size, + padding=0, + groups=out_feats // 4, + bias=False, + ) + + self.blank_id = blank_id + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.embed_dropout_rate = embed_dropout_rate + self.context_size = context_size + self.hid_act = AF.create(hid_act) + + if out_feats is None: + out_feats = embed_dim + + self.out_feats = out_feats + if out_feats != embed_feats: + self.output_proj = nn.Linear(embed_dim, out_feats) + else: + self.output_proj = None + + def get_config(self): + hid_act = AF.get_config(self.hid_act) + config = { + "pred_type": "conv", + "vocab_size": self.vocab_size, + "embed_dim": self.embed_dim, + "out_feats": self.out_feats, + "context_size": self.context_size, + "embed_dropout_rate": self.embed_dropout_rate, + "blank_id": self.blank_id, + "hid_act": hid_act, + } + return config + + def forward( + self, + y: torch.Tensor, + states: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, None]: + """ + Args: + y: + A 2-D tensor of shape (N, U). + # need_pad: + # True to left pad the input. Should be True during training. + # False to not pad the input. Should be False during inference. + Returns: + Return a tensor of shape (N, U, decoder_dim). 
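TransducerConvPredictor above follows the icefall-style stateless predictor cited in its docstring: the label history is embedded and passed through a causal, grouped 1-d convolution, so only the last context_size labels influence the prediction. A minimal sketch of that idea under assumed dimensions (illustrative, not the patch's code):

import torch
import torch.nn as nn
import torch.nn.functional as F

class ConvPredictorSketch(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, context_size=2, blank_id=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=blank_id)
        self.context_size = context_size
        # grouped conv over the last `context_size` embedded labels
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=context_size,
                              groups=embed_dim // 4, bias=False)

    def forward(self, y):
        # y: (batch, U) int64 label ids
        embed = self.embedding(y).transpose(1, 2)              # (batch, embed_dim, U)
        embed = F.pad(embed, pad=(self.context_size - 1, 0))   # left pad -> causal
        out = self.conv(embed).transpose(1, 2)                 # (batch, U, embed_dim)
        return torch.relu(out)

pred = ConvPredictorSketch(vocab_size=500)
out = pred(torch.randint(0, 500, (2, 7)))                      # (2, 7, 256)

During training the whole left-padded label sequence is convolved at once; at inference only the last context_size labels are needed, which is what the need_pad branch in the commented-out icefall reference at the bottom of the file handles.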
+ """ + y = y.to(torch.int64) + embed = self.embedding(y) + if self.context > 1: + embed = embed.transpose(1, 2) + if states is None: + embed = F.pad(embedding_out, pad=(self.context_size - 1, 0)) + else: + raise NotImplementedError() + embed = self.conv(embed).transpose(1, 2) + + out = self.hid_act(embed) + if self.output_proj: + out = self.output_proj(out) + + return out, None + + # # this stuff about clamp() is a temporary fix for a mismatch + # # at utterance start, we use negative ids in beam_search.py + # if torch.jit.is_tracing(): + # # This is for exporting to PNNX via ONNX + # embedding_out = self.embedding(y) + # else: + # embedding_out = self.embedding(y.clamp(min=0)) * (y >= 0).unsqueeze(-1) + # if self.context_size > 1: + # embedding_out = embedding_out.permute(0, 2, 1) + # if need_pad is True: + # embedding_out = F.pad(embedding_out, pad=(self.context_size - 1, 0)) + # else: + # # During inference time, there is no need to do extra padding + # # as we only need one output + # assert embedding_out.size(-1) == self.context_size + # embedding_out = self.conv(embedding_out) + # embedding_out = embedding_out.permute(0, 2, 1) + # embedding_out = F.relu(embedding_out) + # return embedding_out + + def change_config( + self, + override_dropouts=False, + embed_dropout_rate: float = 0.0, + ): + logging.info("changing predictor config") + + if override_dropouts: + logging.info("overriding predictor dropouts") + self.embed_dropout_rate = embed_dropout_rate + self.embed_dropout = nn.Dropout(self.embed_dropout_rate) diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 91a30caf..2797d5a3 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -750,7 +750,6 @@ def add_class_args(parser, type=int, required=True, help=("output prediction dimension")) - RNNFiLMTransducerDecoder.add_pred_args(parser) RNNFiLMTransducerDecoder.add_joiner_args(parser) parser.add_argument( @@ -782,6 +781,12 @@ def add_class_args(parser, type=Optional[int], help="""how many symbols to keep for each frame in k2 rnn-t pruned loss.""") + + parser.add_argument("--condition-size", + type=int, + required=True, + help=("condition vector dimension")) + parser.add_argument( "--lm-scale", default=0.25, From a5971ab3655f073ae48ef90272ebfe7cdf9f8b24 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sun, 21 May 2023 18:21:48 -0400 Subject: [PATCH 33/89] update transducer_languageid joint model --- .../hf_wav2rnn_transducer_languageid.py | 62 +++++++++++-------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index 90211ec9..b4f3b7dd 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -112,7 +112,7 @@ def _make_fuser(self, method): return feat_fuser - def _fuse_hid_feats(self, hid_feats, feat_fusion_method, feat_fuser): + def _fuse_hid_feats(self, hid_feats): """Fuses the hidden features from the Wav2Vec model. 
Args: @@ -126,20 +126,24 @@ def _fuse_hid_feats(self, hid_feats, feat_fusion_method, feat_fuser): return hid_feats[0] hid_feats = hid_feats[self.feat_fusion_start:] - if feat_fusion_method == "weighted-avg": + if self.feat_fusion_method_transducer == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) - norm_weights = nn.functional.softmax(feat_fuser, dim=-1) - feats = torch.sum(hid_feats * norm_weights, dim=-1) - elif feat_fusion_method == "linear": + norm_transducer_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) + norm_lid_weights = nn.functional.softmax(self.languageid_fuser, dim=-1) + feats_transducer = torch.sum(hid_feats * norm_transducer_weights, dim=-1) + feats_languageid = torch.sum(hid_feats * norm_lid_weights, dim=-1) + elif self.feat_fusion_method_transducer == "linear": hid_feats = torch.stack(hid_feats, dim=-1) - feats = feat_fuser(hid_feats).squeeze(dim=-1) - elif feat_fusion_method == "cat": + feats_transducer = self.transducer_fuser(hid_feats).squeeze(dim=-1) + feats_languageid = self.languageid_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method_transducer == "cat": hid_feats = torch.cat(hid_feats, dim=-1) - feats = feat_fuser(hid_feats) - elif feat_fusion_method == "last": + feats_transducer = self.transducer_fuser(hid_feats) + feats_languageid = self.languageid_fuser(hid_feats) + elif self.feat_fusion_method_transducer == "last": feats = hid_feats[-1] - return feats + return feats_transducer, feats_languageid def forward_feats(self, x, @@ -160,8 +164,8 @@ def forward_feats(self, feat_lengths = hf_output["hidden_states_lengths"] if return_hid_states: hid_feats = hf_output["hidden_states"] - feats_transducer = self._fuse_hid_feats(hid_feats, self.feat_fusion_method_transducer, self.transducer_fuser) - feats_languageid = self._fuse_hid_feats(hid_feats, self.feat_fusion_method_languageid, self.languageid_fuser) + feats_transducer, feats_languageid = self._fuse_hid_feats(hid_feats) + # feats_languageid = self._fuse_hid_feats(hid_feats, self.feat_fusion_method_languageid, self.languageid_fuser) else: hid_feats = None feats_transducer = hf_output["last_hidden_state"] @@ -181,23 +185,23 @@ def forward_feats(self, return feats_transducer, feats_languageid, hid_feats, feat_lengths - def languageid_chunk(self, feats, lengths): - sr = self.hf_feats.get_config()["sample_frequency"] - strides = self.hf_feats.get_config()["conv_stride"] + # def languageid_chunk(self, feats, lengths): + # sr = self.hf_feats.get_config()["sample_frequency"] + # strides = self.hf_feats.get_config()["conv_stride"] - total_stride = torch.prod(torch.tensor(strides, dtype=torch.float32)) + # total_stride = torch.prod(torch.tensor(strides, dtype=torch.float32)) - chunk_length = int(self.lid_length * sr / total_stride) + # chunk_length = int(self.lid_length * sr / total_stride) - # Check if all samples are longer than chunk_length - if any(len < chunk_length for len in lengths): - return feats + # # Check if all samples are longer than chunk_length + # if any(len < chunk_length for len in lengths): + # return feats - start_indices = [torch.randint(0, len - chunk_length + 1, (1,)).item() for len in lengths] + # start_indices = [torch.randint(0, len - chunk_length + 1, (1,)).item() for len in lengths] - chunks = torch.stack([feats[i, :, start:start + chunk_length] for i, start in enumerate(start_indices)]) + # chunks = torch.stack([feats[i, :, start:start + chunk_length] for i, start in enumerate(start_indices)]) - return chunks + # return chunks def forward( @@ -231,8 +235,14 
@@ def forward( """ feats_transducer, feats_languageid, hid_feats, feat_lengths = self.forward_feats( x, x_lengths, return_feat_layers) + + lid_len = int(self.lid_length * 50) + lid_start = torch.randint(0, torch.min(feat_lengths).item() - lid_len + 1, (1,)).item() + + feats_languageid = feats_languageid[:, :, lid_start: lid_start + lid_len] + - feats_languageid = self.languageid_chunk(feats_languageid, feat_lengths) + # feats_languageid = self.languageid_chunk(feats_languageid, feat_lengths) feats_transducer = feats_transducer.permute(0, 2, 1) # (N, C, T) ->(N, T, C) @@ -261,8 +271,8 @@ def forward( loss_lid=loss_lid, loss_transducer_simple=trans_output.loss_simple, loss_transducer_pruned=trans_output.loss_pruned, - h_feats=trans_output.h_feats, - logits=logits if return_logits else None) + h_feats=trans_output.h_feats) + #logits=[logit.item() for logit in logits] if return_logits else None) return output def infer(self, From 92f33d3c5d9cbfe02afa67c589236f78f622e420 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 22 May 2023 01:20:09 +0000 Subject: [PATCH 34/89] update ASR and LID joint training code --- ...ransducer_ecapadnn1024x3_stage1_v1.0.yaml} | 60 +++- ...nfig_pruned_transducer_lid_v1.0_13langs.sh | 26 +- .../v1/local/initailize_joint_model.py | 56 ++++ egs/commonvoice/v1/run_020_train_asr_lid.sh | 53 ++-- .../identificate_wav2vec2resnet1d.sh | 2 +- ...train_wav2vec2rnn_transducer_languageid.py | 270 ++++++++++++++++++ .../hf_wav2rnn_transducer_languageid.py | 59 +++- .../hf_wav2vec2rnn_transducer_languageid.py | 8 +- .../trainers/transducer_languageid_trainer.py | 2 +- 9 files changed, 477 insertions(+), 59 deletions(-) rename egs/commonvoice/v1/conf/{train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml => train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml} (59%) create mode 100644 egs/commonvoice/v1/local/initailize_joint_model.py create mode 100755 hyperion/bin/train_wav2vec2rnn_transducer_languageid.py diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml similarity index 59% rename from egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml rename to egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml index 972f7c1c..dfc64d75 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml @@ -19,7 +19,7 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.0002 + num_chunks_per_seg_epoch: 0.1 data_loader: num_workers: 8 @@ -43,22 +43,72 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.01 + num_chunks_per_seg_epoch: 1.0 data_loader: num_workers: 8 model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m transducer: decoder: prune_range: 15 - override_dropouts: false + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + languageid: - # resnet_enc: - # num_classes: 13 + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + 
resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish loss_weight_transducer: 0.005 loss_weight_lid: 1.0 lid_length: 3.0 + # feat_fusion_method: weighted-avg + feat_fusion_start: 2 trainer: optim: diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh index b4437442..aaafecc1 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh @@ -12,10 +12,10 @@ dev_data=13_langs_dev_proc_audio test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" -language=13_langs +language=13_langs_weighted # bpe_model=data/13_langs_lang_bpe_4000/bpe.model -bpe_model=data/13_langs_lang_bpe_8000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model # bpe_model=data/13_langs_lang_bpe_16000/bpe.model # x-vector cfg @@ -24,17 +24,23 @@ nnet_type=hf_wav2vec2rnn_transducer_resnet1d # nnet_s1_transducer_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v3.1.yaml # nnet_s1_transducer_args="" -nnet_transducer_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2 -nnet_transducer_dir=exp/transducer_nnets/$nnet_transducer_name -nnet_transducer=$nnet_transducer_dir/model_ep0008.pth +# nnet_transducer_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2 +# nnet_transducer_dir=exp/transducer_nnets/$nnet_transducer_name +# nnet_transducer=$nnet_transducer_dir/model_ep0008.pth -nnet_lid_name=${hf_model_name}_resnet1d_v4.0_13_langs.s3 -nnet_lid_dir=exp/resnet1d_nnets/$nnet_lid_name -nnet_lid=$nnet_lid_dir/model_ep0003.pth +# nnet_lid_name=${hf_model_name}_resnet1d_v4.0_13_langs.s3 +# nnet_lid_dir=exp/resnet1d_nnets/$nnet_lid_name +# nnet_lid=$nnet_lid_dir/model_ep0003.pth -nnet_name=${hf_model_name}_rnnt_k2_pruned_resnet1d.v1.0_13_langs_8000_bpe +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml +nnet_s1_args="" -nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_ecapadnn1024x3_stage2_v1.0.yaml +nnet_name=${hf_model_name}_rnnt_k2_pruned_transducer_ecapadnn1024x3.v1.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0007.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage2_v1.0.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name diff --git a/egs/commonvoice/v1/local/initailize_joint_model.py b/egs/commonvoice/v1/local/initailize_joint_model.py new file mode 100644 index 00000000..fd98d3f2 --- /dev/null +++ b/egs/commonvoice/v1/local/initailize_joint_model.py @@ -0,0 +1,56 @@ +import torch +import sys +# 
arguments example +# + +ASR_model = torch.load(sys.argv[1]) +LID_model = torch.load(sys.argv[2]) +joint_model = torch.load(sys.argv[3]) + +output_model = sys.argv[4] + + +def check_update_parameters(joint_state_dict, new_joint_state_dict): + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in joint_state_dict.items(): + new_param = new_joint_state_dict[name].to(param.device) + if torch.all(torch.eq(param, new_param)): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + print("Unchanged parameters: {}".format(unchanged_parameters)) + print("Changed parameters: {}".format(changed_parameters)) + + + +def copy_model_parameters(ASR_model, LID_model, joint_model, output_model): + ASR_state_dict = ASR_model["model_state_dict"] + LID_state_dict = LID_model["model_state_dict"] + joint_state_dict = joint_model["model_state_dict"] + + hf_feats_update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "hf_feats" in name} + transducer_update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "transducer" in name} + languageid_update_state_dict = {name: param for name, param in LID_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "languageid" in name} + + + new_joint_state_dict = joint_state_dict.copy() + new_joint_state_dict.update(hf_feats_update_state_dict) + new_joint_state_dict.update(transducer_update_state_dict) + new_joint_state_dict.update(languageid_update_state_dict) + # import pdb;pdb.set_trace() + + new_joint_state_dict["module.transducer_fuser"] = ASR_state_dict["module.feat_fuser"] + new_joint_state_dict["module.languageid_fuser"] = LID_state_dict["module.feat_fuser"] + + + joint_model["model_state_dict"] = new_joint_state_dict + joint_model["epoch"] =1 + + check_update_parameters(joint_state_dict, new_joint_state_dict) + torch.save(joint_model, output_model) + + + +copy_model_parameters(ASR_model, LID_model, joint_model, output_model) \ No newline at end of file diff --git a/egs/commonvoice/v1/run_020_train_asr_lid.sh b/egs/commonvoice/v1/run_020_train_asr_lid.sh index 67ee65d4..4b312e76 100755 --- a/egs/commonvoice/v1/run_020_train_asr_lid.sh +++ b/egs/commonvoice/v1/run_020_train_asr_lid.sh @@ -52,32 +52,33 @@ if [ "$use_wandb" == "true" ];then fi -# # Network Training -# if [ $stage -le 1 ]; then - -# mkdir -p $nnet_s1_dir/log -# $cuda_cmd \ -# --gpu $ngpu $nnet_s1_dir/log/train.log \ -# hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ -# train_wav2vec2rnn_transducer.py $nnet_type \ -# --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ -# --data.train.dataset.audio-file $train_dir/wav.scp \ -# --data.train.dataset.segments-file $train_dir/utt2seg.csv \ -# --data.train.dataset.class-names "language" \ -# --data.train.dataset.class-files $train_dir/langs \ -# --data.train.dataset.bpe-model $bpe_model \ -# --data.train.dataset.text-file $train_dir/text \ -# --data.val.dataset.audio-file $val_dir/wav.scp \ -# --data.val.dataset.segments-file $val_dir/utt2seg.csv \ -# --data.val.dataset.class-names "language" \ -# --data.val.dataset.class-files $train_dir/langs \ -# --data.val.dataset.text-file $val_dir/text \ -# --trainer.exp-path $nnet_s1_dir $args \ -# --data.train.dataset.time-durs-file $train_dir/utt2dur \ -# 
--data.val.dataset.time-durs-file $val_dir/utt2dur \ -# --num-gpus $ngpu - -# fi +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ + train_wav2vec2rnn_transducer_languageid.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1238 \ + --num-gpus $ngpu + +fi if [ $stage -le 2 ]; then diff --git a/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh b/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh index 8b31ac2f..5a2bbc27 100755 --- a/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh +++ b/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh @@ -78,7 +78,7 @@ if [ $stage -le 1 ];then cat $output_dir/languageid.* > $output_dir/langs - # python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text + python steps_lid/lid_score.py $output_dir/langs >> $output_dir/scores # python steps_transducer/word2char.py $data_dir/text $data_dir/text_char # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text diff --git a/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py b/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py new file mode 100755 index 00000000..85689ac3 --- /dev/null +++ b/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2RNNTransducerResnet1D) +from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + + +model_dict = { + "hf_wav2vec2rnn_transducer_resnet1d": HFWav2Vec2RNNTransducerResnet1D, +} + + +def transducer_language_collate(batch): + audio = [] + audio_length = [] + target = [] + language = [] + for record in 
batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + # FiLM: add language ID to the input + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + "language": language, + } + return batch + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=sampler, + **largs, + collate_fn=transducer_language_collate) + return data_loader + +def init_model(blank_id, vocab_size, num_classes, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + # TODO: check model_args + model_args["transducer"]["decoder"]["blank_id"] = blank_id + model_args["transducer"]["decoder"]["vocab_size"] = vocab_size + model_args["languageid"]["num_classes"] = num_classes + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + #torch.backends.cudnn.deterministic = True + #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.enabled = False + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = torch.device("cuda:{}".format(gpu_id)) + # world_size=1 + + # import pdb; pdb.set_trace() + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + # model = init_model_from_transducer(**kwargs) + model = init_model(train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + list(train_loader.dataset.num_classes.values())[0], + **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + 
ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + # import pdb; pdb.set_trace() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + parser.link_arguments("data.train.dataset.bpe_model", + "data.val.dataset.bpe_model") + + # parser.add_argument("--in-model-file", required=True) + model_class.add_class_args(parser, prefix="model") + + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Train Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index b4f3b7dd..8c7d54d7 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -47,7 +47,7 @@ def __init__(self, languageid: Union[Dict, TorchModel], feat_fusion_start: int = 0, feat_fusion_method_transducer: str = "weighted-avg", - feat_fusion_method_languageid: str = "weighted-avg", + feat_fusion_method_lid: str = "weighted-avg", loss_weight_transducer: float = 0.005, 
loss_weight_lid: float = 1.0, lid_length: float = 3.0, @@ -84,13 +84,13 @@ def __init__(self, self.feat_fusion_start = feat_fusion_start self.feat_fusion_method_transducer = feat_fusion_method_transducer - self.feat_fusion_method_languageid = feat_fusion_method_languageid + self.feat_fusion_method_lid = feat_fusion_method_lid self.loss_weight_transducer = loss_weight_transducer self.loss_weight_lid = loss_weight_lid self.lid_length = lid_length self._hf_context = contextlib.nullcontext() self.transducer_fuser = self._make_fuser(self.feat_fusion_method_transducer) - self.languageid_fuser = self._make_fuser(self.feat_fusion_method_languageid) + self.languageid_fuser = self._make_fuser(self.feat_fusion_method_lid) def _make_fuser(self, method): if method == "last": @@ -165,7 +165,7 @@ def forward_feats(self, if return_hid_states: hid_feats = hf_output["hidden_states"] feats_transducer, feats_languageid = self._fuse_hid_feats(hid_feats) - # feats_languageid = self._fuse_hid_feats(hid_feats, self.feat_fusion_method_languageid, self.languageid_fuser) + # feats_languageid = self._fuse_hid_feats(hid_feats, self.feat_fusion_method_lid, self.languageid_fuser) else: hid_feats = None feats_transducer = hf_output["last_hidden_state"] @@ -235,11 +235,12 @@ def forward( """ feats_transducer, feats_languageid, hid_feats, feat_lengths = self.forward_feats( x, x_lengths, return_feat_layers) - - lid_len = int(self.lid_length * 50) - lid_start = torch.randint(0, torch.min(feat_lengths).item() - lid_len + 1, (1,)).item() - feats_languageid = feats_languageid[:, :, lid_start: lid_start + lid_len] + lid_len = int(self.lid_length * 50) + min_len = torch.min(feat_lengths).item() + if min_len > lid_len: + lid_start = torch.randint(0, min_len - lid_len + 1, (1,)).item() + feats_languageid = feats_languageid[:, :, lid_start: lid_start + lid_len] # feats_languageid = self.languageid_chunk(feats_languageid, feat_lengths) @@ -271,8 +272,8 @@ def forward( loss_lid=loss_lid, loss_transducer_simple=trans_output.loss_simple, loss_transducer_pruned=trans_output.loss_pruned, - h_feats=trans_output.h_feats) - #logits=[logit.item() for logit in logits] if return_logits else None) + h_feats=trans_output.h_feats, + logits=logits if return_logits else None) return output def infer(self, @@ -389,7 +390,8 @@ def filter_args(**kwargs): "hf_feats", "transducer", "feat_fusion_start", - "feat_fusion_method", + "feat_fusion_method_transducer", + "feat_fusion_method_lid", "loss_weight_transducer", "loss_weight_lid", "languageid", @@ -442,13 +444,46 @@ def add_class_args(parser, prefix=None, skip=set()): the wav2vec num_layers""", ) parser.add_argument( - "--feat-fusion-method", + "--feat-fusion-method-transducer", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]"), + ) + parser.add_argument( + "--feat-fusion-method-lid", default="weighted-avg", choices=["weighted-avg", "linear", "cat", "last"], help=("method to fuse the hidden layers from the wav2vec model " "in [weighted-avg, linear, cat, last]"), ) + parser.add_argument( + "--loss-weight-transducer", + default=0.005, + type=float, + help=""" + The weight of the transducer loss + """, + ) + + parser.add_argument( + "--loss-weight-lid", + default=1.0, + type=float, + help=""" + The weight of the lid loss + """, + ) + + parser.add_argument( + "--lid-length", + default=3.0, + type=float, + help=""" + The length of the chunks for language id + """, + ) if 
prefix is not None: outer_parser.add_argument( diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py index 10bdc53b..c8cd974b 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py @@ -40,7 +40,7 @@ def __init__( languageid: Union[Dict, ResNet1dLanguageID], feat_fusion_start: int = 0, feat_fusion_method_transducer: str = "weighted-avg", - feat_fusion_method_languageid: str = "weighted-avg", + feat_fusion_method_lid: str = "weighted-avg", loss_weight_transducer: float = 0.005, loss_weight_lid: float = 1.0, lid_length: float = 3.0, @@ -68,7 +68,7 @@ def __init__( super().__init__(hf_feats, transducer, languageid, feat_fusion_start, - feat_fusion_method_transducer, feat_fusion_method_languageid, loss_weight_transducer, loss_weight_lid, lid_length) + feat_fusion_method_transducer, feat_fusion_method_lid, loss_weight_transducer, loss_weight_lid, lid_length) @staticmethod def filter_args(**kwargs): @@ -76,8 +76,8 @@ def filter_args(**kwargs): child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) base_args["hf_feats"] = child_args child_args = RNNTransducer.filter_args(**kwargs["transducer"]) - child_args = ResNet1dLanguageID.filter_args(**kwargs["languageid"]) base_args["transducer"] = child_args + child_args = ResNet1dLanguageID.filter_args(**kwargs["languageid"]) base_args["languageid"] = child_args return base_args @@ -91,7 +91,7 @@ def add_class_args(parser, prefix=None): RNNTransducer.add_class_args(parser, prefix="transducer") # HFWav2RNNTransducer.add_class_args(parser) ResNet1dLanguageID.add_class_args(parser, prefix="languageid") - # HFWav2LanguageID.add_class_args(parser) + HFWav2RNNTransducerLanguageID.add_class_args(parser) if prefix is not None: outer_parser.add_argument("--" + prefix, diff --git a/hyperion/torch/trainers/transducer_languageid_trainer.py b/hyperion/torch/trainers/transducer_languageid_trainer.py index 8a06ebda..d38ab9a9 100644 --- a/hyperion/torch/trainers/transducer_languageid_trainer.py +++ b/hyperion/torch/trainers/transducer_languageid_trainer.py @@ -192,7 +192,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_metrics[k] = output[k].item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output["logits"], languageid) metric_acc.update(batch_metrics, batch_size) From 190ea29d37d2abecdc6f353e93e6f046dddd29dc Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sun, 21 May 2023 22:00:20 -0400 Subject: [PATCH 35/89] update configuration --- ...2base_rnnt_film_k2_pruned_stage2_v2.0.yaml | 76 +++++++++++++++++++ ...g_pruned_filmed_transducer_v2.0_13langs.sh | 2 +- 2 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml new file mode 100644 index 00000000..5a1555dd --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 
'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40. + max_audio_length: 20. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 40. + max_audio_length: 20. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + transducer: + decoder: + prune_range: 15 + override_dropouts: false +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh index e056cf03..0f3845d7 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh @@ -34,7 +34,7 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v2.0_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0007.pth +nnet_s1=$nnet_s1_dir/model_ep0009.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml nnet_s2_args="" From 16f8b499dce0d6a99ef9f7974b0a38ad2108c3ac Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sun, 21 May 2023 22:55:57 -0400 Subject: [PATCH 36/89] update config with mean pruned rnn loss --- ...train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml index 56e08794..faa265a3 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.0.yaml @@ -9,14 +9,14 @@ data: wav_scale: 1 sampler: sampler_type: 'class_weighted_random_seg_chunk_sampler' - min_batch_size: 64 + min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 # weighted weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.1 + num_chunks_per_seg_epoch: 0.3 data_loader: num_workers: 8 val: @@ -29,14 +29,14 @@ data: wav_scale: 1 sampler: sampler_type: 'class_weighted_random_seg_chunk_sampler' - min_batch_size: 64 + min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 # weighted weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.1 + num_chunks_per_seg_epoch: 1.0 data_loader: 
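The `class_weighted_random_bucketing_seg_sampler` settings above (`weight_mode: "data-prior"`, `class_name: "language"`, `weight_exponent: 0.3`) flatten the language imbalance by sampling each language in proportion to its data prior raised to 0.3. The sketch below is an illustration of that weighting rule, not the hyperion sampler itself, and the per-language hours are hypothetical:

```
# Illustration of data-prior class weights with exponent 0.3: low-resource
# languages are over-sampled relative to their raw share of the data.
import numpy as np

def language_sampling_weights(hours_per_lang: dict, weight_exponent: float = 0.3):
    langs = list(hours_per_lang)
    priors = np.array([hours_per_lang[l] for l in langs], dtype=float)
    priors /= priors.sum()                  # data prior per language
    weights = priors ** weight_exponent     # flatten the distribution
    weights /= weights.sum()
    return dict(zip(langs, weights))

# Hypothetical amounts of training data (hours) per language:
print(language_sampling_weights({"en": 2000.0, "it": 300.0, "ga-IE": 5.0}))
```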
num_workers: 8 model: wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml From 1ebb2195d18b047fe1e045e9f18d993b380d6701 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sun, 21 May 2023 22:57:26 -0400 Subject: [PATCH 37/89] update config to use mean transducer loss --- ...e_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml index dfc64d75..275987d7 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml @@ -53,6 +53,7 @@ model: decoder: prune_range: 15 rnnt_loss: k2_pruned + reduction: mean simple_loss_scale: 0.2 predictor: embed_dim: 1024 @@ -104,7 +105,7 @@ model: dropout_rate: 0.3 hid_act: swish - loss_weight_transducer: 0.005 + loss_weight_transducer: 0.05 loss_weight_lid: 1.0 lid_length: 3.0 # feat_fusion_method: weighted-avg From c474869a61f335d5657b30a22b4f41ec7e36abe7 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 22 May 2023 03:00:00 +0000 Subject: [PATCH 38/89] update film parameter name --- ..._wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml | 11 ++++++----- .../wav2transducer/hf_wav2rnn_film_transducer.py | 4 ++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml index 5a1555dd..a9a755ee 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml @@ -10,8 +10,8 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 40. - max_audio_length: 20. + max_batch_length: 15. + max_audio_length: 15. min_batch_size: 1 drop_last: false # for class_weighted_random_bucketing_seg_sampler @@ -19,7 +19,7 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.3 + num_chunks_per_seg_epoch: 0.1 data_loader: num_workers: 1 @@ -34,8 +34,8 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 40. - max_audio_length: 20. + max_batch_length: 15. + max_audio_length: 15. 
min_batch_size: 1 drop_last: true # for class_weighted_random_bucketing_seg_sampler @@ -50,6 +50,7 @@ model: transducer: decoder: prune_range: 15 + reduction: mean override_dropouts: false trainer: optim: diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py index dc28abb7..3f44c7c5 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -67,7 +67,7 @@ def _make_fuser(self): if self.feat_fusion_method == "film-weighted-avg": self.films = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size) for _ in range(num_layers)]) self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) - elif self.feat_fusion_method == "weighted-avg-film": + elif self.feat_fusion_method == "film-fused-feature": self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) self.film = FiLM(layer_dim, self.transducer.decoder.condition_size) elif self.feat_fusion_method == "weighted-avg": @@ -102,7 +102,7 @@ def _fuse_hid_feats(self, hid_feats, lang): film_hid_feats = torch.stack(film_hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) feats = torch.sum(film_hid_feats * norm_weights, dim=-1) - elif self.feat_fusion_method == "weighted-avg-film": + elif self.feat_fusion_method == "film-fused-feature": hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) feats = torch.sum(hid_feats * norm_weights, dim=-1) From 1f7e70bcbf2f9dd8d83dcc59021897d398d789c1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 22 May 2023 04:39:09 +0000 Subject: [PATCH 39/89] update more options for film model --- hyperion/torch/narchs/rnn_film_transducer_decoder.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 2797d5a3..cc1dd2e3 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -20,7 +20,8 @@ from ...utils.misc import filter_func_args from ...utils.text import add_sos -from ..layer_blocks import TransducerFiLMJoiner as Joiner +from ..layer_blocks import TransducerFiLMJoiner as FiLMJoiner +from ..layer_blocks import TransducerJoiner as Joiner from ..layer_blocks import TransducerRNNFiLMPredictor as RNNPredictor from .net_arch import NetArch @@ -131,6 +132,11 @@ def _make_joiner(self): # Add FiLM args to the joiner args if joiner_type == "basic": + pred_feats = self.predictor_args["out_feats"] + hid_feats = self.joiner_args["hid_feats"] + self.joiner = FiLMJoiner(self.in_feats, pred_feats, hid_feats, + self.vocab_size) + elif joiner_type == "original_joiner": pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, From c6f5dee27519787cc856060372607bc0e4d47280 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 22 May 2023 04:43:31 +0000 Subject: [PATCH 40/89] add reduction option in fine-tune argument --- hyperion/torch/narchs/rnn_film_transducer_decoder.py | 9 +++++++++ hyperion/torch/narchs/rnn_transducer_decoder.py | 10 ++++++++++ 2 files changed, 19 insertions(+) diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 2797d5a3..976b9872 100644 --- 
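The fusion methods renamed here (`film-weighted-avg`, `film-fused-feature`) condition the wav2vec2 features on a language embedding through FiLM layers: `film-fused-feature` applies a single FiLM block to the weighted average of the hidden layers, while `film-weighted-avg` applies one block per hidden layer before averaging. A generic FiLM block, shown only as a reference sketch and not hyperion's own `FiLM` class, looks like this:

```
# Generic FiLM block: a condition vector is projected to a per-feature scale
# (gamma) and shift (beta) that modulate the acoustic features.
import torch
import torch.nn as nn

class SimpleFiLM(nn.Module):
    def __init__(self, feat_dim: int, condition_size: int):
        super().__init__()
        self.to_gamma = nn.Linear(condition_size, feat_dim)
        self.to_beta = nn.Linear(condition_size, feat_dim)

    def forward(self, x: torch.Tensor, condition: torch.Tensor) -> torch.Tensor:
        # x: (batch, time, feat_dim); condition: (batch, condition_size)
        gamma = self.to_gamma(condition).unsqueeze(1)
        beta = self.to_beta(condition).unsqueeze(1)
        return gamma * x + beta
```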
a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -640,12 +640,15 @@ def change_config( embed_dropout_rate: float = 0.0, rnn_dropout_rate: float = 0.0, prune_range: Optional[int] = None, + reduction: Optional[str] = None, ): logging.info("changing decoder config") self.predictor.change_config(override_dropouts, embed_dropout_rate, rnn_dropout_rate) if prune_range is not None: self.prune_range = prune_range + if reduction is not None: + self.reduction = reduction @staticmethod def filter_args(**kwargs): @@ -843,6 +846,12 @@ def add_finetune_args(parser, prefix=None, skip=set()): help="""how many symbols to keep for each frame in k2 rnn-t pruned loss.""") + parser.add_argument( + "--reduction", + default="sum", + choices=["sum", "mean"], + help="""type of reduction for rnn-t loss between sum or mean""") + if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py index efc11113..44cf5350 100644 --- a/hyperion/torch/narchs/rnn_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_transducer_decoder.py @@ -618,12 +618,15 @@ def change_config( embed_dropout_rate: float = 0.0, rnn_dropout_rate: float = 0.0, prune_range: Optional[int] = None, + reduction: Optional[str] = None, ): logging.info("changing decoder config") self.predictor.change_config(override_dropouts, embed_dropout_rate, rnn_dropout_rate) if prune_range is not None: self.prune_range = prune_range + if reduction is not None: + self.reduction = reduction @staticmethod def filter_args(**kwargs): @@ -809,6 +812,13 @@ def add_finetune_args(parser, prefix=None, skip=set()): type=float, help=("dropout prob for decoder RNN ")) + + parser.add_argument( + "--reduction", + default="sum", + choices=["sum", "mean"], + help="""type of reduction for rnn-t loss between sum or mean""") + parser.add_argument( "--prune-range", default=5, From 8e82143904ccef496d71567b37bf609255f0c053 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 22 May 2023 19:52:19 +0000 Subject: [PATCH 41/89] update configuration --- ...se_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml index 275987d7..43e6ba3a 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage1_v1.0.yaml @@ -105,7 +105,7 @@ model: dropout_rate: 0.3 hid_act: swish - loss_weight_transducer: 0.05 + loss_weight_transducer: 0.1 loss_weight_lid: 1.0 lid_length: 3.0 # feat_fusion_method: weighted-avg From c6ec4e27bdf269159d70aa68b952168db7390408 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 23 May 2023 00:53:50 +0000 Subject: [PATCH 42/89] fix film bug --- hyperion/torch/narchs/rnn_film_transducer_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 3790065c..e070f70b 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -135,12 +135,12 @@ def 
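The new `--reduction` fine-tune option switches the RNN-T loss between sum and mean reduction. The practical effect, illustrated below with hypothetical per-utterance losses (the actual loss computation is delegated to k2 and is not shown here), is that a sum-reduced loss grows with the batch, while a mean-reduced loss stays roughly batch-size independent, which is presumably why these patches can raise `loss_weight_transducer` from 0.005 to 0.1 once `reduction: mean` is configured:

```
# Toy illustration of sum vs. mean reduction over hypothetical per-utterance
# RNN-T losses; only the scale of the aggregated value changes.
import torch

per_utt_loss = torch.tensor([120.0, 95.0, 140.0])
print("sum reduction :", per_utt_loss.sum().item())   # grows with batch size
print("mean reduction:", per_utt_loss.mean().item())  # stable w.r.t. batch size
```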
_make_joiner(self): pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] self.joiner = FiLMJoiner(self.in_feats, pred_feats, hid_feats, - self.vocab_size) + self.vocab_size, self.condition_size) elif joiner_type == "original_joiner": pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, - self.vocab_size, self.condition_size) + self.vocab_size) else: raise ValueError(f"Unknown joiner type {joiner_type}") From 27878914b1bc20b2dbeb5c1139b6d23f2857cd07 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 25 May 2023 09:19:23 -0400 Subject: [PATCH 43/89] sre21 8k adapted to persephone branck --- egs/sre21-av-a/v1.16k/README.md | 22 +- .../v1.16k/local/score_sre21_official.sh | 2 +- egs/sre21-av-a/v1.8k/README.md | 53 ++- egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh | 2 +- egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh | 4 +- egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh | 2 +- egs/voxceleb/v1.1/local | 1 - .../{v1 => v1.1}/local/attack_analysis.py | 0 .../{v1 => v1.1}/local/attack_analysis.sh | 0 .../local/calibrate_voxceleb1_o_clean.sh | 0 egs/voxceleb/{v1 => v1.1}/local/make_musan.py | 0 egs/voxceleb/{v1 => v1.1}/local/make_musan.sh | 0 .../{v1 => v1.1}/local/make_rirs_data.sh | 0 .../{v1 => v1.1}/local/make_some_figs.py | 0 .../make_train_lists_sup_embed_with_augm.sh | 0 .../{v1 => v1.1}/local/make_trials_subset.py | 0 .../{v1 => v1.1}/local/make_vox2_trials.py | 0 .../{v1 => v1.1}/local/make_voxceleb1_o.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1_oeh.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1_old.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1_orig.pl | 0 .../local/make_voxceleb1_orig_v2.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1_v2.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1_v2_o.pl | 0 .../local/make_voxceleb1_v2_oeh.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1cat.pl | 0 .../local/make_voxceleb1cat_v2.pl | 0 .../{v1 => v1.1}/local/make_voxceleb2.pl | 0 .../{v1 => v1.1}/local/make_voxceleb2cat.pl | 0 .../local/prepare_voxsrc22_dev.py | 0 .../local/prepare_voxsrc22_test.py | 0 egs/voxceleb/{v1 => v1.1}/local/score_dcf.py | 0 .../{v1 => v1.1}/local/score_voxceleb1.sh | 0 .../local/score_voxceleb1_o_clean.sh | 0 .../local/score_voxceleb1_single_cond.sh | 0 .../{v1 => v1.1}/local/score_voxsrc22_dev.sh | 0 egs/voxceleb/v1.1/run_002_compute_evad.sh | 1 - egs/voxceleb/v1.2/hyp_utils | 1 + ...aseplus_ecapatdnn512x3_phase1_default.yaml | 6 - ...aseplus_ecapatdnn512x3_phase2_default.yaml | 12 - ...aseplus_ecapatdnn512x3_phase3_default.yaml | 11 - ...lmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml | 24 -- ...nn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh | 55 ---- egs/voxceleb/v2/local | 2 +- egs/voxceleb/v2/run_001_prepare_data.sh | 20 +- egs/voxceleb/v2/run_002_compute_evad.sh | 63 ++-- .../v2/run_003_prepare_noises_rirs.sh | 67 ++++ hyp_utils/conda_env.sh | 2 +- hyp_utils/create_data_split_dirs.sh | 3 +- hyperion/bin/hyperion_dataset.py | 93 ++++++ hyperion/bin/hyperion_tables.py | 129 ++++++++ hyperion/bin/train_xvector_from_wav.py | 10 +- hyperion/data_prep/data_prep.py | 1 - hyperion/data_prep/voxceleb1.py | 7 +- hyperion/data_prep/voxceleb2.py | 11 +- hyperion/data_prep/voxsrc22.py | 212 ++++++++++++ hyperion/torch/trainers/torch_trainer.py | 178 +++++----- hyperion/torch/trainers/xvector_trainer.py | 8 +- .../trainers/xvector_trainer_from_wav.py | 12 +- hyperion/utils/class_info.py | 27 +- hyperion/utils/dataset.py | 306 ++++++++++++++---- hyperion/utils/enrollment_map.py | 17 
+- hyperion/utils/info_table.py | 7 +- 63 files changed, 1024 insertions(+), 347 deletions(-) delete mode 120000 egs/voxceleb/v1.1/local rename egs/voxceleb/{v1 => v1.1}/local/attack_analysis.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/attack_analysis.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/calibrate_voxceleb1_o_clean.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_musan.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_musan.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_rirs_data.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_some_figs.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_train_lists_sup_embed_with_augm.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_trials_subset.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_vox2_trials.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_o.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_oeh.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_old.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_orig.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_orig_v2.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_v2.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_v2_o.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_v2_oeh.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1cat.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1cat_v2.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb2.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb2cat.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/prepare_voxsrc22_dev.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/prepare_voxsrc22_test.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/score_dcf.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/score_voxceleb1.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/score_voxceleb1_o_clean.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/score_voxceleb1_single_cond.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/score_voxsrc22_dev.sh (100%) create mode 120000 egs/voxceleb/v1.2/hyp_utils delete mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml delete mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml delete mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml delete mode 100644 egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh create mode 100755 egs/voxceleb/v2/run_003_prepare_noises_rirs.sh create mode 100644 hyperion/bin/hyperion_dataset.py create mode 100755 hyperion/bin/hyperion_tables.py create mode 100644 hyperion/data_prep/voxsrc22.py diff --git a/egs/sre21-av-a/v1.16k/README.md b/egs/sre21-av-a/v1.16k/README.md index 0f5d09ad..d90dc0a4 100644 --- a/egs/sre21-av-a/v1.16k/README.md +++ b/egs/sre21-av-a/v1.16k/README.md @@ -7,6 +7,20 @@ The systems runs at 16 kHz, telephone data is upsampled to 16k using SoX This recipe is based on these works ``` +@inproceedings{Villalba2022, +author = {Jes\'us Villalba and Bengt J Borgstrom and Saurabh Kataria and Magdalena Rybicka and Carlos D Castillo and Jaejin Cho and L. Paola García-Perera and Pedro A. 
Torres-Carrasquillo and Najim Dehak}, +city = {ISCA}, +doi = {10.21437/Odyssey.2022-30}, +issue = {July}, +journal = {The Speaker and Language Recognition Workshop (Odyssey 2022)}, +month = {6}, +pages = {213-220}, +publisher = {ISCA}, +title = {Advances in Cross-Lingual and Cross-Source Audio-Visual Speaker Recognition: The JHU-MIT System for NIST SRE21}, +url = {https://www.isca-speech.org/archive/odyssey_2022/villalba22b_odyssey.html}, +year = {2022}, +} + @inproceedings{Villalba2020, address = {Tokyo, Japan}, author = {Villalba, Jes{\'{u}}s and Garcia-Romero, Daniel and Chen, Nanxin and Sell, Gregory and Borgstrom, Jonas and McCree, Alan and {Garcia Perera}, Leibny Paola and Kataria, Saurabh and Nidadavolu, Phani Sankar and Torres-Carrasquiilo, Pedro and Dehak, Najim}, @@ -139,14 +153,6 @@ The back-end used for these results is: | config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.19 | 0.64 | 0.089 | | config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.15 | 0.61 | 0.102 | -## SRE-CTS Superset dev set - -| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | -| ------ | ---------- | ------------- | ------ | ------------- | ------------- | -| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.37 | 0.076 | 0.106 | -| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.19 | 0.64 | 0.089 | -| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.15 | 0.61 | 0.102 | - ## SRE21 Audio Dev (official scoring tool) | Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | diff --git a/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh b/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh index a5bc03eb..e56906f6 100755 --- a/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh +++ b/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh @@ -18,7 +18,7 @@ echo "Score SRE21 ${track} ${subset} for $score_dir" soft_dir=./sre21/scoring_software -if [ ! -f $s_dir/sre_scorer.py ];then +if [ ! -f $soft_dir/sre_scorer.py ];then echo "downloading scoring tool" local/download_sre21_scoring_tool.sh fi diff --git a/egs/sre21-av-a/v1.8k/README.md b/egs/sre21-av-a/v1.8k/README.md index a105128c..b55f9bf0 100644 --- a/egs/sre21-av-a/v1.8k/README.md +++ b/egs/sre21-av-a/v1.8k/README.md @@ -10,6 +10,20 @@ copy the utt2est_lang files from the 16k data dirs to the VoxCeleb and SRE21 dat This recipe is based on these works ``` +@inproceedings{Villalba2022, +author = {Jes\'us Villalba and Bengt J Borgstrom and Saurabh Kataria and Magdalena Rybicka and Carlos D Castillo and Jaejin Cho and L. Paola García-Perera and Pedro A. Torres-Carrasquillo and Najim Dehak}, +city = {ISCA}, +doi = {10.21437/Odyssey.2022-30}, +issue = {July}, +journal = {The Speaker and Language Recognition Workshop (Odyssey 2022)}, +month = {6}, +pages = {213-220}, +publisher = {ISCA}, +title = {Advances in Cross-Lingual and Cross-Source Audio-Visual Speaker Recognition: The JHU-MIT System for NIST SRE21}, +url = {https://www.isca-speech.org/archive/odyssey_2022/villalba22b_odyssey.html}, +year = {2022}, +} + @inproceedings{Villalba2020, address = {Tokyo, Japan}, author = {Villalba, Jes{\'{u}}s and Garcia-Romero, Daniel and Chen, Nanxin and Sell, Gregory and Borgstrom, Jonas and McCree, Alan and {Garcia Perera}, Leibny Paola and Kataria, Saurabh and Nidadavolu, Phani Sankar and Torres-Carrasquiilo, Pedro and Dehak, Najim}, @@ -91,8 +105,6 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs - `run_011_train_xvector.sh` - Trains the x-vector network on 4sec chunks - - - `run_012_finetune_xvector.sh` - Fine-tune x-vector network on 10-15 secs utts - `run_030_extract_xvectors.sh` @@ -111,4 +123,39 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs ## Results -TODO +The back-end used for these results is: +- back-end V2 (run_041_eval_be_v2.sh) +- Without S-Norm +- Scores are calibrated as indicated in the paper. + +## SRE16 Eval40% YUE + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.922 | 0.154 | 0.200 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.168 | 0.127 | 0.134 | + + +## SRE-CTS Superset dev set + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.39 | 0.072 | 0.095 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.175 | 0.057 | 0.069 | + + +## SRE21 Audio Dev (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 6.65 | 0.418 | 0.436 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 3.73 | 0.319 | 0.325 | + + +## SRE21 Audio Eval (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 5.44 | 0.388 | 0.390 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.21 | 0.356 | 0.377 | + diff --git a/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh b/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh index a55761ae..92cbd887 100755 --- a/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh +++ b/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh @@ -153,7 +153,7 @@ fi if [ $stage -le 4 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 diff --git a/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh b/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh index f8eae0a1..6890eba9 100755 --- a/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh +++ b/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh @@ -187,7 +187,7 @@ fi if [ $stage -le 4 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 @@ -311,7 +311,7 @@ fi if [ $stage -le 7 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 diff --git a/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh b/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh index 263d7bbe..35afbb27 100755 --- a/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh +++ b/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh @@ -185,7 +185,7 @@ fi if [ $stage -le 4 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 diff --git a/egs/voxceleb/v1.1/local b/egs/voxceleb/v1.1/local deleted file mode 120000 index 740b697d..00000000 --- a/egs/voxceleb/v1.1/local +++ /dev/null @@ -1 +0,0 @@ -../v1/local/ \ No newline at end of file diff --git a/egs/voxceleb/v1/local/attack_analysis.py 
b/egs/voxceleb/v1.1/local/attack_analysis.py similarity index 100% rename from egs/voxceleb/v1/local/attack_analysis.py rename to egs/voxceleb/v1.1/local/attack_analysis.py diff --git a/egs/voxceleb/v1/local/attack_analysis.sh b/egs/voxceleb/v1.1/local/attack_analysis.sh similarity index 100% rename from egs/voxceleb/v1/local/attack_analysis.sh rename to egs/voxceleb/v1.1/local/attack_analysis.sh diff --git a/egs/voxceleb/v1/local/calibrate_voxceleb1_o_clean.sh b/egs/voxceleb/v1.1/local/calibrate_voxceleb1_o_clean.sh similarity index 100% rename from egs/voxceleb/v1/local/calibrate_voxceleb1_o_clean.sh rename to egs/voxceleb/v1.1/local/calibrate_voxceleb1_o_clean.sh diff --git a/egs/voxceleb/v1/local/make_musan.py b/egs/voxceleb/v1.1/local/make_musan.py similarity index 100% rename from egs/voxceleb/v1/local/make_musan.py rename to egs/voxceleb/v1.1/local/make_musan.py diff --git a/egs/voxceleb/v1/local/make_musan.sh b/egs/voxceleb/v1.1/local/make_musan.sh similarity index 100% rename from egs/voxceleb/v1/local/make_musan.sh rename to egs/voxceleb/v1.1/local/make_musan.sh diff --git a/egs/voxceleb/v1/local/make_rirs_data.sh b/egs/voxceleb/v1.1/local/make_rirs_data.sh similarity index 100% rename from egs/voxceleb/v1/local/make_rirs_data.sh rename to egs/voxceleb/v1.1/local/make_rirs_data.sh diff --git a/egs/voxceleb/v1/local/make_some_figs.py b/egs/voxceleb/v1.1/local/make_some_figs.py similarity index 100% rename from egs/voxceleb/v1/local/make_some_figs.py rename to egs/voxceleb/v1.1/local/make_some_figs.py diff --git a/egs/voxceleb/v1/local/make_train_lists_sup_embed_with_augm.sh b/egs/voxceleb/v1.1/local/make_train_lists_sup_embed_with_augm.sh similarity index 100% rename from egs/voxceleb/v1/local/make_train_lists_sup_embed_with_augm.sh rename to egs/voxceleb/v1.1/local/make_train_lists_sup_embed_with_augm.sh diff --git a/egs/voxceleb/v1/local/make_trials_subset.py b/egs/voxceleb/v1.1/local/make_trials_subset.py similarity index 100% rename from egs/voxceleb/v1/local/make_trials_subset.py rename to egs/voxceleb/v1.1/local/make_trials_subset.py diff --git a/egs/voxceleb/v1/local/make_vox2_trials.py b/egs/voxceleb/v1.1/local/make_vox2_trials.py similarity index 100% rename from egs/voxceleb/v1/local/make_vox2_trials.py rename to egs/voxceleb/v1.1/local/make_vox2_trials.py diff --git a/egs/voxceleb/v1/local/make_voxceleb1_o.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_o.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_o.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_o.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_oeh.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_oeh.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_oeh.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_oeh.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_old.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_old.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_old.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_old.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_orig.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_orig.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_orig.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_orig.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_orig_v2.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_orig_v2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_orig_v2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_orig_v2.pl diff --git 
a/egs/voxceleb/v1/local/make_voxceleb1_v2.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_v2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_v2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_v2.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2_o.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_v2_o.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_v2_o.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_v2_o.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2_oeh.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_v2_oeh.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_v2_oeh.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_v2_oeh.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1cat.pl b/egs/voxceleb/v1.1/local/make_voxceleb1cat.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1cat.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1cat.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1cat_v2.pl b/egs/voxceleb/v1.1/local/make_voxceleb1cat_v2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1cat_v2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1cat_v2.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb2.pl b/egs/voxceleb/v1.1/local/make_voxceleb2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb2.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb2cat.pl b/egs/voxceleb/v1.1/local/make_voxceleb2cat.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb2cat.pl rename to egs/voxceleb/v1.1/local/make_voxceleb2cat.pl diff --git a/egs/voxceleb/v1/local/prepare_voxsrc22_dev.py b/egs/voxceleb/v1.1/local/prepare_voxsrc22_dev.py similarity index 100% rename from egs/voxceleb/v1/local/prepare_voxsrc22_dev.py rename to egs/voxceleb/v1.1/local/prepare_voxsrc22_dev.py diff --git a/egs/voxceleb/v1/local/prepare_voxsrc22_test.py b/egs/voxceleb/v1.1/local/prepare_voxsrc22_test.py similarity index 100% rename from egs/voxceleb/v1/local/prepare_voxsrc22_test.py rename to egs/voxceleb/v1.1/local/prepare_voxsrc22_test.py diff --git a/egs/voxceleb/v1/local/score_dcf.py b/egs/voxceleb/v1.1/local/score_dcf.py similarity index 100% rename from egs/voxceleb/v1/local/score_dcf.py rename to egs/voxceleb/v1.1/local/score_dcf.py diff --git a/egs/voxceleb/v1/local/score_voxceleb1.sh b/egs/voxceleb/v1.1/local/score_voxceleb1.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxceleb1.sh rename to egs/voxceleb/v1.1/local/score_voxceleb1.sh diff --git a/egs/voxceleb/v1/local/score_voxceleb1_o_clean.sh b/egs/voxceleb/v1.1/local/score_voxceleb1_o_clean.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxceleb1_o_clean.sh rename to egs/voxceleb/v1.1/local/score_voxceleb1_o_clean.sh diff --git a/egs/voxceleb/v1/local/score_voxceleb1_single_cond.sh b/egs/voxceleb/v1.1/local/score_voxceleb1_single_cond.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxceleb1_single_cond.sh rename to egs/voxceleb/v1.1/local/score_voxceleb1_single_cond.sh diff --git a/egs/voxceleb/v1/local/score_voxsrc22_dev.sh b/egs/voxceleb/v1.1/local/score_voxsrc22_dev.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxsrc22_dev.sh rename to egs/voxceleb/v1.1/local/score_voxsrc22_dev.sh diff --git a/egs/voxceleb/v1.1/run_002_compute_evad.sh b/egs/voxceleb/v1.1/run_002_compute_evad.sh index 4e82a87a..27260be3 100755 --- a/egs/voxceleb/v1.1/run_002_compute_evad.sh +++ 
b/egs/voxceleb/v1.1/run_002_compute_evad.sh @@ -24,7 +24,6 @@ if [ $stage -le 1 ]; then dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage if [ "$nodes" == "b0" ];then utils/create_split_dir.pl \ - utils/create_split_dir.pl \ /export/b{04,05,06,07}/$dir_name $vaddir/storage elif [ "$nodes" == "b1" ];then utils/create_split_dir.pl \ diff --git a/egs/voxceleb/v1.2/hyp_utils b/egs/voxceleb/v1.2/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/voxceleb/v1.2/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml deleted file mode 100644 index 8574a1cf..00000000 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml +++ /dev/null @@ -1,6 +0,0 @@ -data: - train: train_data_default.yaml - val: val_data_default.yaml -model: wavlmbaseplus_ecapatdnn512x3.yaml -trainer: trainer_phase1_sgd_default.yaml - \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml deleted file mode 100644 index 87b01a1f..00000000 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml +++ /dev/null @@ -1,12 +0,0 @@ -data: - train: train_data_default.yaml - val: val_data_default.yaml -model: - xvector: - cos_scale: 32.0 - margin: 0.2 - margin_warmup_epochs: 0 - intertop_k: 5 - intertop_margin: 0.1 -trainer: trainer_phase2_sgd_default.yaml - \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml deleted file mode 100644 index d13931e0..00000000 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml +++ /dev/null @@ -1,11 +0,0 @@ -data: - train: train_data_default.yaml - val: val_data_default.yaml -model: - xvector: - cos_scale: 32.0 - margin: 0.4 - margin_warmup_epochs: 0 - intertop_margin: 0. 
-trainer: trainer_phase3_sgd_default.yaml - \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml index 34c6e8dc..d4db70a7 100644 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml @@ -41,29 +41,6 @@ data: num_hard_prototypes: 0 data_loader: num_workers: 8 - -train: - dataset: - max_chunk_length: 3.0 - min_chunk_length: 3.0 - aug_cfg: conf/reverb_noise_aug.yaml - wav_scale: 1 - sampler: - batch_size: 32 - iters_per_epoch: 6 - data_loader: - num_workers: 8 - val: - dataset: - max_chunk_length: 4.0 - min_chunk_length: 4.0 - aug_cfg: conf/reverb_noise_aug.yaml - wav_scale: 1 - sampler: - batch_size: 32 - iters_per_epoch: 6 - data_loader: - num_workers: 8 model: wavlmbaseplus_ecapatdnn512x3.yaml trainer: optim: @@ -84,5 +61,4 @@ trainer: epochs: 60 eff_batch_size: 1024 train_mode: hf-feats-frozen-nograd - \ No newline at end of file diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh deleted file mode 100644 index 942fb336..00000000 --- a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh +++ /dev/null @@ -1,55 +0,0 @@ -# Wav2vec2 base trained on 960h LibriSpeech + ECAPA-TDNN 512x2 - -# hugging face model -hf_model_name=wav2vec2base - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxceleb2cat_train - -# x-vector cfg - -nnet_type=hf_wav2vec2resnet1d - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -dropout=0 -embed_dim=256 -lr=0.05 -s=30 -margin_warmup=20 -margin=0.3 -nnet_num_epochs=70 - - -lr=0.001 -#lr=0.005 -xvec_train_base_cfg=conf/train_wav2vec2base_ecapatdnn512x2_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr --trainer.lrsched.warmup-steps 20000 --trainer.lrsched.hold-steps 20000 --trainer.lrsched.min-lr 1e-6 --trainer.epochs 75 --model conf/wav2vec2base_specaug5_ecapatdnn512x2.yaml --data.train.dataset.max-chunk-length 2 --data.train.dataset.min-chunk-length 2" - -nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v12 #v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0060.pth -nnet=$nnet_dir/swa_model_ep0076.pth -nnet=$nnet_dir/model_ep0060.pth -nnet=$nnet_dir/model_ep0030.pth -nnet=$nnet_dir/model_ep0040.pth -nnet=$nnet_dir/model_ep0020.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=6 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 - diff --git a/egs/voxceleb/v2/local b/egs/voxceleb/v2/local index 740b697d..2ac14857 120000 --- a/egs/voxceleb/v2/local +++ b/egs/voxceleb/v2/local @@ -1 +1 @@ -../v1/local/ \ No newline at end of file +../v1.1/local \ No newline at end of file diff --git a/egs/voxceleb/v2/run_001_prepare_data.sh b/egs/voxceleb/v2/run_001_prepare_data.sh index 7bf15448..44385610 100755 --- a/egs/voxceleb/v2/run_001_prepare_data.sh +++ b/egs/voxceleb/v2/run_001_prepare_data.sh @@ -12,7 +12,7 @@ config_file=default_config.sh . parse_options.sh || exit 1; . datapath.sh - +. 
$config_file if [ $stage -le 1 ];then # Prepare the VoxCeleb2 dataset for training. @@ -26,3 +26,21 @@ if [ $stage -le 2 ];then # Use this for the newer version of voxceleb1: local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data fi + +if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then + local/prepare_voxsrc22_dev.py \ + --vox1-corpus-dir $voxceleb1_root \ + --voxsrc22-corpus-dir $voxsrc22_root \ + --output-dir data/voxsrc22_dev +fi + +# if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then +# local/prepare_voxsrc22_test.py \ +# --corpus-dir $voxsrc22_root \ +# --output-dir data/voxsrc22_test +# fi + +if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then + # # split vox2 into 2 parts, for cohort and qmf training + local/make_vox2_trials.py --data-dir data/voxceleb2cat_train +fi diff --git a/egs/voxceleb/v2/run_002_compute_evad.sh b/egs/voxceleb/v2/run_002_compute_evad.sh index eeae00ac..1248ad39 100755 --- a/egs/voxceleb/v2/run_002_compute_evad.sh +++ b/egs/voxceleb/v2/run_002_compute_evad.sh @@ -19,39 +19,40 @@ config_file=default_config.sh if [ $stage -le 1 ]; then - # Prepare to distribute data over multiple machines - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then - dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage - if [ "$nodes" == "b0" ];then - utils/create_split_dir.pl \ - utils/create_split_dir.pl \ - /export/b{04,05,06,07}/$dir_name $vaddir/storage - elif [ "$nodes" == "b1" ];then - utils/create_split_dir.pl \ - /export/b{14,15,16,17}/$dir_name $vaddir/storage - elif [ "$nodes" == "c0" ];then - utils/create_split_dir.pl \ - /export/c{06,07,08,09}/$dir_name $vaddir/storage - elif [ "$nodes" == "fs01" ];then - utils/create_split_dir.pl \ - /export/fs01/$dir_name $vaddir/storage - else - echo "we don't distribute data between multiple machines" - fi + # Prepare to distribute data over multiple machines + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then + dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage + if [ "$nodes" == "b0" ];then + utils/create_split_dir.pl \ + /export/b{04,05,06,07}/$dir_name $vaddir/storage + elif [ "$nodes" == "b1" ];then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$dir_name $vaddir/storage + elif [ "$nodes" == "c0" ];then + utils/create_split_dir.pl \ + /export/c{06,07,08,09}/$dir_name $vaddir/storage + elif [ "$nodes" == "fs01" ];then + utils/create_split_dir.pl \ + /export/fs01/$dir_name $vaddir/storage + else + echo "we don't distribute data between multiple machines" fi + fi fi -#Train datasets -if [ $stage -le 2 ];then - for name in voxceleb2cat_train voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 40 ? $num_spk:40)) - hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ - --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ - data/${name} exp/make_vad/$name $vaddir - utils/fix_data_dir.sh data/${name} - done +if [ $stage -le 2 ];then + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 40 ? 
$num_spk:40)) + hyp_utils/feats/make_evad.sh \ + --write-utt2num-frames true \ + --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ + data/${name} exp/make_vad/$name $vaddir + utils/fix_data_dir.sh data/${name} + done fi - diff --git a/egs/voxceleb/v2/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v2/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..a448af9a --- /dev/null +++ b/egs/voxceleb/v2/run_003_prepare_noises_rirs.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +# We prepare the noise files and RIR for online speech augmentation + +if [ $stage -le 1 ]; then + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root 16 data + + for name in musan_noise musan_music + do + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_proc_audio exp/${name}_proc_audio + utils/fix_data_dir.sh data/${name}_proc_audio + done + +fi + +if [ $stage -le 2 ]; then + + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_babble exp/${name}_babble + # utils/fix_data_dir.sh data/${name}_babble + done +fi + +if [ $stage -le 3 ]; then + if [ ! -d "RIRS_NOISES" ]; then + if [ -d ../../sre19-cmn2/v1/RIRS_NOISES ];then + ln -s ../../sre19-cmn2/v1/RIRS_NOISES + else + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + fi + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 16 data/rirs_mediumroom + local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 16 data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + #pack all rirs in h5 files + steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs + done + +fi + + diff --git a/hyp_utils/conda_env.sh b/hyp_utils/conda_env.sh index ceee4e93..8d5c67c1 100755 --- a/hyp_utils/conda_env.sh +++ b/hyp_utils/conda_env.sh @@ -79,7 +79,7 @@ if [ $num_gpus -gt 0 ];then #export TORCH_DISTRIBUTED_DEBUG=DETAIL #variable to find unused parameters if [ $num_gpus -gt 1 ];then - [[ $(type -P "$torchrun") ]] && command="torchrun" \ + [[ $(type -P "torchrun") ]] && command="torchrun" \ || command="python -m torch.distributed.run" command="$command --nproc_per_node=$num_gpus --standalone --nnodes=1" fi diff --git a/hyp_utils/create_data_split_dirs.sh b/hyp_utils/create_data_split_dirs.sh index 877b9e3f..06c30779 100755 --- a/hyp_utils/create_data_split_dirs.sh +++ b/hyp_utils/create_data_split_dirs.sh @@ -25,8 +25,7 @@ if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $linkdir ]; then echo "Prepare to distribute data over multiple $nodes nodes" dir_name=$storage_dir/$storage_name/storage if [ "$nodes" == "b0" ];then - utils/create_split_dir.pl \ - hyp_utils/create_split_dir.pl \ + hyp_utils/create_split_dir.pl \ /export/b{04,05,06,07}/$dir_name $link_dir elif [ "$nodes" == "b1" ];then hyp_utils/create_split_dir.pl \ diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py new file mode 100644 index 00000000..9e7bac5c --- /dev/null +++ b/hyperion/bin/hyperion_dataset.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Optional, Union, List +from pathlib import Path + +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.utils import ( + PathLike, + Dataset, + InfoTable, + RecordingSet, + FeatureSet, + ClassInfo, + EnrollmentMap, + SegmentSet, +) + +subcommands = ["add_features"] +# table_dict = { +# "segments": SegmentSet, +# "recordings": RecordingSet, +# "features": FeatureSet, +# "classes": ClassInfo, +# "enrollments": EnrollmentMap, +# "generic": InfoTable, +# } + + +def add_common_args(parser): + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + ) + + +def make_add_features_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--features-name", required=True, help="""name of the feature""" + ) + parser.add_argument("--features-file", required=True, help="""feature set file""") + + add_common_args(parser) + return parser + + +def add_features( + dataset: PathLike, + features_name: str, + features_file: PathLike, +): + dataset = Dataset.load(dataset, lazy=True) + dataset.add_features(features_name, features_file) + dataset.save(dataset) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Tool to manipulates the Hyperion dataset") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for subcommand in subcommands: + parser_func = f"make_{subcommand}_parser" + subparser = globals()[parser_func]() + subcommands.add_subcommand(k, subparser) + + args = parser.parse_args() + subcommand = args.subcommand + kwargs = namespace_to_dict(args)[args.subcommand] + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + + globals()[subcommand](**kwargs) diff --git a/hyperion/bin/hyperion_tables.py b/hyperion/bin/hyperion_tables.py new file mode 100755 index 00000000..a79a1dca --- /dev/null +++ b/hyperion/bin/hyperion_tables.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Optional, Union, List +from pathlib import Path + +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.utils import ( + PathLike, + InfoTable, + RecordingSet, + FeatureSet, + ClassInfo, + EnrollmentMap, + SegmentSet, +) + +subcommands = ["cat"] +table_dict = { + "segments": SegmentSet, + "recordings": RecordingSet, + "features": 
FeatureSet, + "classes": ClassInfo, + "enrollments": EnrollmentMap, + "generic": InfoTable, +} + + +def add_common_args(parser): + parser.add_argument( + "--table-type", + default="generic", + choices=list(table_dict.keys()), + help=f"Type of table in {list(table_dict.keys())}", + ) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + ) + + +def make_cat_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--input-files", default=None, nargs="+", help="optional list of input files" + ) + parser.add_argument( + "--output-file", + required=True, + help="""output file, if input-files is None, input files names are derived from it""", + ) + parser.add_argument( + "--num-tables", + default=0, + type=int, + help="""number of jobs we used to create the individual tables""", + ) + parser.add_argument( + "--base-idx", + default=1, + type=int, + help="""index of the first job, typically 0 or 1""", + ) + + add_common_args(parser) + return parser + + +def cat( + table_type: str, + input_files: Union[List[PathLike], None], + output_file: PathLike, + num_table: int, + base_idx: int = 1, +): + + assert input_files is not None or num_jobs != 0 + output_file = Path(output_file) + if input_files is None: + ext = output_file.suffix + input_file_base = output_file.with_suffix("") + input_files = [] + for i in range(num_tables): + idx = base_idx + i + input_file_i = input_file_base.with_suffix(f".{idx}{ext}") + input_files.append(input_file_i) + + table_class = table_dict[table_type] + tables = [] + for file_path in input_files: + tables.append(table_class.load(file_path)) + + output_table = table_class.cat(tables) + output_table.save(output_file) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for subcommand in subcommands: + parser_func = f"make_{subcommand}_parser" + subparser = globals()[parser_func]() + subcommands.add_subcommand(k, subparser) + + args = parser.parse_args() + subcommand = args.subcommand + kwargs = namespace_to_dict(args)[args.subcommand] + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + + globals()[subcommand](**kwargs) diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py index 5c999dd1..a210d429 100755 --- a/hyperion/bin/train_xvector_from_wav.py +++ b/hyperion/bin/train_xvector_from_wav.py @@ -6,12 +6,14 @@ import logging import multiprocessing import os -import sys -import time from pathlib import Path -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, set_float_cpu diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py index 19420761..d9828674 100644 --- a/hyperion/data_prep/data_prep.py +++ b/hyperion/data_prep/data_prep.py @@ -69,7 +69,6 @@ def get_recording_duration(self, recording_set): import itertools from ..utils import SCPList - # scp = SCPList(recording_set["id"].values, recording_set["storage_path"].values) futures = [] logging.info("submitting threats...") with ThreadPoolExecutor(max_workers=self.num_threads) as pool: diff --git a/hyperion/data_prep/voxceleb1.py b/hyperion/data_prep/voxceleb1.py 
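The new `hyperion_tables.py cat` subcommand merges the per-job tables written by an array job back into a single file. A rough programmatic equivalent, using the `load`/`cat`/`save` methods the tool itself calls and hypothetical per-job files `segments.1.csv` … `segments.4.csv` alongside the final output:

```
# Sketch of merging per-job segment tables into one table; file names are
# hypothetical and follow the output-file.N.csv convention used by the tool.
from pathlib import Path
from hyperion.utils import SegmentSet

num_tables = 4  # hypothetical number of array-job splits
parts = [SegmentSet.load(Path(f"data/train/segments.{i}.csv"))
         for i in range(1, num_tables + 1)]
SegmentSet.cat(parts).save("data/train/segments.csv")
```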
index 00b2e380..c23b64ff 100644 --- a/hyperion/data_prep/voxceleb1.py +++ b/hyperion/data_prep/voxceleb1.py @@ -214,7 +214,12 @@ def get_segmentid(s): return enrollments, trials def prepare(self): - + logging.info( + "Peparing VoxCeleb1 for %s corpus_dir:%s -> data_dir:%s", + self.task, + self.corpus_dir, + self.output_dir, + ) logging.info("getting audio meta-data") df_meta = self._get_metadata() logging.info("getting language estimations") diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py index 1a32420f..bef34ec9 100644 --- a/hyperion/data_prep/voxceleb2.py +++ b/hyperion/data_prep/voxceleb2.py @@ -136,6 +136,12 @@ def make_cat_list(lists_cat_dir, rec_id, rec_files, video_idx, i): return file_path def prepare(self): + logging.info( + "Peparing VoxCeleb2 %s corpus_dir:%s -> data_dir:%s", + self.subset, + self.corpus_dir, + self.output_dir, + ) logging.info("getting audio meta-data") df_meta = self._get_metadata() logging.info("getting language estimations") @@ -224,11 +230,6 @@ def prepare(self): "duration": recs.loc[rec_ids, "duration"].values, } ) - # print( - # recs.loc[rec_ids, "duration"], - # len(segments), - # len(recs.loc[rec_ids, "duration"]), - # ) segments = SegmentSet(segments) segments.sort() diff --git a/hyperion/data_prep/voxsrc22.py b/hyperion/data_prep/voxsrc22.py new file mode 100644 index 00000000..1999262a --- /dev/null +++ b/hyperion/data_prep/voxsrc22.py @@ -0,0 +1,212 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import glob +import re +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import ClassInfo, Dataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress +from .data_prep import DataPrep + + +class VoxSRC22DataPrep(DataPrep): + """Class to prepare VoxSRC22 dev/test data + Attributes: + corpus_dir: input data directory + vox1_corpus_dir: input data directory for VoxCeleb1 + subset: subset of the data dev or test + output_dir: output data directory + target_sample_freq: target sampling frequency to convert the audios to. 
+ """ + + def __init__( + self, + corpus_dir: PathLike, + vox1_corpus_dir: PathLike, + subset: str, + output_dir: PathLike, + use_kaldi_ids: bool, + target_sample_freq: int, + num_threads: int = 10, + ): + use_kaldi_ids = False + super().__init__( + corpus_dir, output_dir, use_kaldi_ids, target_sample_freq, num_threads + ) + + assert ( + vox1_corpus_dir is not None or subset == "test" + ), "dev set needs the VoxCeleb1 corpus dir" + self.subset = subset + self.vox1_corpus_dir = ( + None if vox1_corpus_dir is None else Path(vox1_corpus_dir) + ) + + @staticmethod + def dataset_name(): + return "voxceleb2" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + parser.add_argument( + "--subset", + default="dev", + choices=["dev", "test"], + help="""vox2 subset in [dev, test]""", + ) + parser.add_argument( + "--vox1-corpus-dir", + default=None, + help="""corpus directory of voxceleb 1.""", + ) + + def prepare_track12_dev(self): + logging.info( + "Preparing VoxSRC22 %s corpus:%s + %s -> %s", + self.subset, + self.corpus_dir, + self.vox1_corpus_dir, + self.output_dir, + ) + logging.info("making trials") + trials_file = self.corpus_dir / "voxsrc2022_dev.txt" + df_in = pd.read_csv( + trials_file, + header=None, + sep=" ", + names=["key", "enroll_file", "test_file"], + ) + key = ["target" if k == 1 else "nontarget" for k in df_in["key"]] + + modelid = df_in["enroll_file"] + segmentid = df_in["test_file"] + df_trials = pd.DataFrame( + {"modelid": modelid, "segmentid": segmentid, "targettype": key} + ) + df_trials.sort_values(by=["modelid", "segmentid"], inplace=True) + file_path = self.output_dir / "trials.csv" + df_trials.to_csv(file_path, index=False) + trials = {"trials": file_path} + modelid = df_trials["modelid"].sort_values().unique() + uniq_segmentid = df_trials["segmentid"].sort_values().unique() + uniq_segmentid = np.unique(np.concatenate((uniq_segmentid, modelid), axis=0)) + + logging.info("making enrollment map") + df_enroll = pd.DataFrame({"modelid": modelid, "segmentid": modelid}) + file_path = self.output_dir / "enrollment.csv" + df_enroll.to_csv(file_path, index=False) + enrollments = {"enrollment": file_path} + + logging.info("making RecordingSet") + vox1_segmentid = [] + vox22_segmentid = [] + for s in uniq_segmentid: + if "VoxSRC2022_dev" in s: + vox22_segmentid.append(s) + else: + vox1_segmentid.append(s) + + vox1_rec_files = [ + glob.glob(f"{self.vox1_corpus_dir}/**/{s}") for s in vox1_segmentid + ] + vox22_rec_files = [ + glob.glob(f"{self.corpus_dir}/**/{s}") for s in vox22_segmentid + ] + rec_ids = vox22_segmentid + vox1_segmentid + rec_files = vox22_rec_files + vox1_rec_files + + recs = pd.DataFrame({"id": rec_ids, "storage_path": rec_files}) + recs = RecordingSet(recs) + recs.sort() + + logging.info("getting recording durations") + self.get_recording_duration(recs) + if self.target_sample_freq: + recs["target_sample_freq"] = self.target_sample_freq + + logging.info("making SegmentsSet") + segments = pd.DataFrame({"id": rec_ids,}) + segments = SegmentSet(segments) + segments.sort() + + logging.info("making dataset") + dataset = Dataset( + segments, + recordings={"recordings": recs}, + enrollments=enrollments, + trials=trials, + sparse_trials=False, + ) + logging.info("saving dataset at %s", self.output_dir) + dataset.save(self.output_dir) + logging.info( + "datasets containts %d segments", len(segments), + ) + + # wav_file = voxsrc22_corpus_dir / file_id + # wav_file = vox1_corpus_dir / "wav" / file_id + # logging.info("searching audio files in %s", 
self.vox1_corpus_dir) + # vox1_rec_files = list(self.vox1_corpus_dir.glob("**/*.wav")) + # if not vox1_rec_files: + # # symlinks? try glob + # vox1_rec_files = [ + # Path(f) for f in glob.iglob(f"{self.vox1_corpus_dir}/**/*.wav", recursive=True) + # ] + + # vox1_rec_ids = [ f.parent.parent.name / f.parent.name / f.name for f in vox1_rec_files] + # rec_files = + + # rec_files = list(self.corpus_dir.glob("**/*.wav")) + # if not rec_files: + # # symlinks? try glob + # rec_files = [ + # Path(f) for f in glob.iglob(f"{self.corpus_dir}/**/*.wav", recursive=True) + # ] + + # u2s_file = output_dir / "utt2spk" + # logging.info("creating utt2spk file %s", u2s_file) + # file_ids = np.unique(np.concatenate((df_trials["enroll"], df_trials["test"]))) + # with open(u2s_file, "w") as f: + # for file_id in file_ids: + # f.write("%s %s\n" % (file_id, file_id)) + + # s2u_file = output_dir / "spk2utt" + # logging.info("creating spk2utt file %s", s2u_file) + # with open(s2u_file, "w") as f: + # for file_id in file_ids: + # f.write("%s %s\n" % (file_id, file_id)) + + # wav_file = output_dir / "wav.scp" + # logging.info("creating wav.scp file %s", wav_file) + # with open(wav_file, "w") as f: + # for file_id in file_ids: + # if "VoxSRC2022_dev" in file_id: + # wav_file = voxsrc22_corpus_dir / file_id + # else: + # wav_file = vox1_corpus_dir / "wav" / file_id + + # f.write("%s %s\n" % (file_id, wav_file)) + + def prepare_track12_test(self): + logging.info( + "Preparing VoxSRC22 %s corpus:%s -> %s", + self.subset, + self.corpus_dir, + self.output_dir, + ) + + def prepare(self): + if self.subset == "dev": + self.prepare_track12_dev() + else: + self.prepare_track12_test() diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index a6f20a8e..c8565d1d 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -21,13 +21,17 @@ from torch.optim.swa_utils import SWALR, AveragedModel from ...utils.misc import filter_func_args -from ..loggers import (CSVLogger, LoggerList, ProgLogger, TensorBoardLogger, - WAndBLogger) +from ..loggers import CSVLogger, LoggerList, ProgLogger, TensorBoardLogger, WAndBLogger from ..lr_schedulers import LRScheduler as LRS from ..lr_schedulers import LRSchedulerFactory as LRSF from ..optim import OptimizerFactory as OF -from ..utils import (FairFullyShardedDDP, FairShardedDDP, MetricAcc, TorchDDP, - tensors_subset) +from ..utils import ( + FairFullyShardedDDP, + FairShardedDDP, + MetricAcc, + TorchDDP, + tensors_subset, +) class DDPType(str, Enum): @@ -72,6 +76,7 @@ class TorchTrainer(object): input_key: dict. key for nnet input. target_key: dict. key for nnet targets. 
""" + def __init__( self, model, @@ -113,8 +118,9 @@ def __init__( self.exp_path = Path(exp_path) if loggers is None: - self.loggers = self._default_loggers(log_interval, use_tensorboard, - use_wandb, wandb) + self.loggers = self._default_loggers( + log_interval, use_tensorboard, use_wandb, wandb + ) elif isinstance(loggers, list): self.loggers = LoggerList(loggers) else: @@ -149,29 +155,23 @@ def __init__( self.rank = dist.get_rank() self.world_size = dist.get_world_size() if ddp_type == DDPType.DDP or ddp_type == DDPType.OSS_DDP: - self.model = nn.SyncBatchNorm.convert_sync_batchnorm( - self.model) + self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) if self.rank == 0: logging.info( "training in multiple gpus with distributed-data-parallel" ) oss = False if ddp_type == DDPType.DDP else True - self.optimizer = self._make_optimizer(optim, - self.model, - oss=oss) + self.optimizer = self._make_optimizer(optim, self.model, oss=oss) self.model = TorchDDP( self.model, device_ids=[device], output_device=device, ) elif ddp_type == DDPType.OSS_SHARDED_DDP: - self.model = nn.SyncBatchNorm.convert_sync_batchnorm( - self.model) + self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) if self.rank == 0: logging.info( "training in multiple gpus with fair sharded-distributed-data-parallel" ) - self.optimizer = self._make_optimizer(optim, - self.model, - oss=True) + self.optimizer = self._make_optimizer(optim, self.model, oss=True) self.model = FairShardedDDP(self.model, self.optimizer) else: if self.rank == 0: @@ -184,9 +184,7 @@ def __init__( mixed_precision=self.use_amp, move_params_to_cpu=cpu_offload, ) - self.optimizer = self._make_optimizer(optim, - self.model, - oss=False) + self.optimizer = self._make_optimizer(optim, self.model, oss=False) else: self.optimizer = self._make_optimizer(optim, self.model) @@ -216,9 +214,9 @@ def __init__( if self.rank == 0: logging.info("init SWA model") self.swa_model = AveragedModel(self.model) - self.swa_scheduler = SWALR(self.optimizer, - swa_lr=self.swa_lr, - anneal_epochs=self.swa_anneal_epochs) + self.swa_scheduler = SWALR( + self.optimizer, swa_lr=self.swa_lr, anneal_epochs=self.swa_anneal_epochs + ) def set_epoch(self, data_loader): try: @@ -252,8 +250,7 @@ def fit(self, train_data, val_data=None): if self.lr_scheduler is not None: # this is needed by cosine scheduler epoch_updates = int(len(train_data) / self.grad_acc_steps) - self.lr_scheduler.on_epoch_begin(epoch, - epoch_updates=epoch_updates) + self.lr_scheduler.on_epoch_begin(epoch, epoch_updates=epoch_updates) logs = self.train_epoch(train_data) if val_data is not None: @@ -275,8 +272,7 @@ def fit(self, train_data, val_data=None): self.save_checkpoint(logs) if self.in_swa: - self.loggers.on_epoch_begin(self.cur_epoch, - batches=len(train_data)) + self.loggers.on_epoch_begin(self.cur_epoch, batches=len(train_data)) self.model = self.swa_model.module logs = self.bn_update_epoch(train_data) @@ -351,16 +347,16 @@ def validation_epoch(self, data_loader, swa_update_bn=False): with torch.no_grad(): if swa_update_bn: log_tag = "train_" - self.train() + self.model.train() else: log_tag = "val_" self.model.eval() for batch, data in enumerate(data_loader): - input_data, target = tensors_subset(data, batch_keys, self.device) - batch_size = input_data.size(0) + x, target = tensors_subset(data, batch_keys, self.device) + batch_size = x.size(0) with amp.autocast(enabled=self.use_amp): - output = self.model(input_data) + output = self.model(x) loss = self.loss(output, target) 
batch_metrics["loss"] = loss.mean().item() @@ -381,9 +377,9 @@ def bn_update_epoch(self, data_loader): def _clip_grad_norm(self, model, optim, grad_clip, grad_clip_norm): if self.ddp: if self.ddp_type == DDPType.DDP: - nn.utils.clip_grad_norm_(model.parameters(), - grad_clip, - norm_type=grad_clip_norm) + nn.utils.clip_grad_norm_( + model.parameters(), grad_clip, norm_type=grad_clip_norm + ) return if self.ddp_type == DDPType.FULLY_SHARDED_DDP: # we have to use the member function in FullyShardedDDP class @@ -395,24 +391,26 @@ def _clip_grad_norm(self, model, optim, grad_clip, grad_clip_norm): optim.clip_grad_norm(grad_clip, norm_type=grad_clip_norm) # if no DDP clip normally - nn.utils.clip_grad_norm_(model.parameters(), - grad_clip, - norm_type=grad_clip_norm) + nn.utils.clip_grad_norm_( + model.parameters(), grad_clip, norm_type=grad_clip_norm + ) def update_model(self): """Updates the model and does gradding clipping.""" if self.use_amp: if self.grad_clip > 0: self.grad_scaler.unscale_(self.optimizer) - self._clip_grad_norm(self.model, self.optimizer, - self.grad_clip, self.grad_clip_norm) + self._clip_grad_norm( + self.model, self.optimizer, self.grad_clip, self.grad_clip_norm + ) self.grad_scaler.step(self.optimizer) self.grad_scaler.update() else: if self.grad_clip > 0: - self._clip_grad_norm(self.model, self.optimizer, - self.grad_clip, self.grad_clip_norm) + self._clip_grad_norm( + self.model, self.optimizer, self.grad_clip, self.grad_clip_norm + ) self.optimizer.step() @@ -441,20 +439,21 @@ def _make_lr_sched(self, lr_sched, optim): lr_sched = LRSF.create(optim, **args) return lr_sched - def _default_loggers(self, log_interval, use_tensorboard, use_wandb, - wandb): + def _default_loggers(self, log_interval, use_tensorboard, use_wandb, wandb): """Creates the default data loaders""" prog_log = ProgLogger(interval=log_interval) csv_log = CSVLogger(self.exp_path / "train.log", append=True) loggers = [prog_log, csv_log] if use_tensorboard: loggers.append( - TensorBoardLogger(self.exp_path / "tb", interval=log_interval)) + TensorBoardLogger(self.exp_path / "tb", interval=log_interval) + ) if use_wandb: loggers.append( - WAndBLogger(**wandb, - path=self.exp_path / "wandb", - interval=log_interval)) + WAndBLogger( + **wandb, path=self.exp_path / "wandb", interval=log_interval + ) + ) return LoggerList(loggers) def _get_lr(self): @@ -478,7 +477,8 @@ def _compute_grad_acc_steps(self, data_loader): return self.grad_acc_steps = int( - math.ceil(self.eff_batch_size / batch_size / self.world_size)) + math.ceil(self.eff_batch_size / batch_size / self.world_size) + ) logging.info( "Setting grad_acc_steps=%d for " "eff_batch_size=%d, avg_batch_size=%d, world_size=%d", @@ -502,30 +502,24 @@ def checkpoint(self, logs=None): logs: logs containing the current value of the metrics. 
""" checkpoint = { - "epoch": - self.cur_epoch, - "rng_state": - torch.get_rng_state(), - "model_cfg": - self.model.get_config(), - "model_state_dict": - self.model.state_dict(), - "optimizer_state_dict": - self.optimizer.state_dict(), - "loss_state_dict": - self.loss.state_dict() if self.loss is not None else None, + "epoch": self.cur_epoch, + "rng_state": torch.get_rng_state(), + "model_cfg": self.model.get_config(), + "model_state_dict": self.model.state_dict(), + "optimizer_state_dict": self.optimizer.state_dict(), + "loss_state_dict": self.loss.state_dict() + if self.loss is not None + else None, } if self.lr_scheduler is not None: - checkpoint[ - "lr_scheduler_state_dict"] = self.lr_scheduler.state_dict() + checkpoint["lr_scheduler_state_dict"] = self.lr_scheduler.state_dict() if logs is not None: checkpoint["logs"] = logs if self.in_swa: checkpoint["swa_model_state_dict"] = self.swa_model.state_dict() - checkpoint[ - "swa_scheduler_state_dict"] = self.swa_scheduler.state_dict() + checkpoint["swa_scheduler_state_dict"] = self.swa_scheduler.state_dict() return checkpoint @@ -535,8 +529,9 @@ def save_checkpoint(self, logs=None): Args: logs: logs containing the current value of the metrics. """ - if self.ddp and (self.ddp_type == DDPType.OSS_DDP - or self.ddp_type == DDPType.OSS_SHARDED_DDP): + if self.ddp and ( + self.ddp_type == DDPType.OSS_DDP or self.ddp_type == DDPType.OSS_SHARDED_DDP + ): # Not sure what this does, just copying from the example in # https://github.com/facebookresearch/fairscale/blob/master/benchmarks/oss.py # Check the checkpointing in the case of the OSS optimizer @@ -591,17 +586,16 @@ def load_checkpoint(self, file_path): if self.loss is not None: self.loss.load_state_dict(checkpoint["loss_state_dict"]) if self.lr_scheduler is not None: - self.lr_scheduler.load_state_dict( - checkpoint["lr_scheduler_state_dict"]) + self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"]) # if self.use_amp: # amp.load_state_dict(checkpoint['amp']) if self.do_swa: if "swa_model_state_dict" in checkpoint: - self.swa_model.load_state_dict( - checkpoint["swa_model_state_dict"]) + self.swa_model.load_state_dict(checkpoint["swa_model_state_dict"]) self.swa_scheduler.load_state_dict( - checkpoint["swa_scheduler_state_dict"]) + checkpoint["swa_scheduler_state_dict"] + ) else: self.swa_scheduler = SWALR( self.optimizer, @@ -681,13 +675,9 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): "--eff-batch-size", type=int, default=None, - help= - "effective total batch size, if given, it overrides grad_acc_steps", + help="effective total batch size, if given, it overrides grad_acc_steps", ) - parser.add_argument("--epochs", - type=int, - default=200, - help="number of epochs") + parser.add_argument("--epochs", type=int, default=200, help="number of epochs") if train_modes is not None: parser.add_argument( "--train-mode", @@ -707,19 +697,12 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): default=False, help="use tensorboard logger", ) - parser.add_argument("--use-wandb", - action="store_true", - default=False, - help="use wandb logger") - parser.add_argument("--wandb.project", - default=None, - help="wandb project name") - parser.add_argument("--wandb.group", - default=None, - help="wandb group name") - parser.add_argument("--wandb.name", - default=None, - help="wandb display name") + parser.add_argument( + "--use-wandb", action="store_true", default=False, help="use wandb logger" + ) + parser.add_argument("--wandb.project", 
default=None, help="wandb project name") + parser.add_argument("--wandb.group", default=None, help="wandb group name") + parser.add_argument("--wandb.name", default=None, help="wandb display name") # parser.add_argument( # '--wandb.path', default=None, # help='wandb directory') @@ -748,10 +731,9 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): default=False, help="CPU offload of gradients when using fully_sharded_ddp", ) - parser.add_argument("--grad-clip", - type=float, - default=0, - help="gradient clipping norm value") + parser.add_argument( + "--grad-clip", type=float, default=0, help="gradient clipping norm value" + ) parser.add_argument( "--grad-clip-norm", default=2, @@ -764,10 +746,9 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): default=0, help="start epoch for SWA, if 0 it does not use SWA", ) - parser.add_argument("--swa-lr", - type=float, - default=1e-3, - help="learning rate for SWA phase") + parser.add_argument( + "--swa-lr", type=float, default=1e-3, help="learning rate for SWA phase" + ) parser.add_argument( "--swa-anneal-epochs", type=int, @@ -786,7 +767,6 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) add_argparse_args = add_class_args diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index eddf47a7..a59cbe14 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -101,16 +101,16 @@ def train_epoch(self, data_loader): metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() - for batch, (data, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - input_data, target = tensors_subset(data, batch_keys, self.device) - batch_size = input_data.size(0) + x, target = tensors_subset(data, batch_keys, self.device) + batch_size = x.size(0) with amp.autocast(enabled=self.use_amp): - output = self.model(input_data, y=target) + output = self.model(x, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 52474baa..0f6ccd9b 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -106,10 +106,10 @@ def train_epoch(self, data_loader): if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - input_data, target = tensors_subset(data, batch_keys, self.device) - batch_size = input_data.size(0) + audio, target = tensors_subset(data, batch_keys, self.device) + batch_size = audio.size(0) with torch.no_grad(): - feats, feats_lengths = self.feat_extractor(input_data) + feats, feats_lengths = self.feat_extractor(audio) with amp.autocast(enabled=self.use_amp): output = self.model(feats, feats_lengths, y=target) @@ -159,10 +159,10 @@ def validation_epoch(self, data_loader, swa_update_bn=False): self.model.eval() for batch, data in enumerate(data_loader): - input_data, target = tensors_subset(data, batch_keys, self.device) - batch_size = input_data.size(0) + audio, target = tensors_subset(data, batch_keys, self.device) + batch_size = audio.size(0) - 
feats, feats_lengths = self.feat_extractor(input_data) + feats, feats_lengths = self.feat_extractor(audio) with amp.autocast(enabled=self.use_amp): output = self.model(feats, feats_lengths) loss = self.loss(output, target) diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py index 70ee82c8..fe72339f 100644 --- a/hyperion/utils/class_info.py +++ b/hyperion/utils/class_info.py @@ -70,8 +70,33 @@ def load(cls, file_path, sep=None): if ext == "": # if no extension we load as kaldi utt2spk file df = pd.read_csv( - file_path, sep=" ", header=None, names=["id"], dtype={"id": np.str}, + file_path, + sep=" ", + header=None, + names=["id"], + dtype={"id": np.str}, ) return cls(df) return super().load(file_path, sep) + + @classmethod + def cat(cls, tables): + """Concatenates several tables. + + Args: + info_lists: List of InfoTables + + Returns: + InfoTable object concatenation the info_lists. + """ + df_list = [table.df for table in tables] + df = pd.concat(df_list) + assert df["id"].is_unique, """there are duplicated ids in original tables""" + if not df["class_idx"].is_unique: + logging.warning( + """class_idx in concat tables are not unique, + we will assign new class_idx""" + ) + df["class_idx"].drop(columns=["class_idx"], inplace=True) + return cls(df) diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index e6c9e861..0ef81ab6 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -20,21 +20,21 @@ class Dataset: - """ Class that contains all objects - (segments, recordings, features, class_infos) that - conform a dataset - - Attributes: - segments: SegmentSet object or path to it. - classes: Dictionary of ClassInfo objects or paths to then - recordings: Dictionary of RecordingSet objects or paths to then - features: Dictionary of FeatureSet objects or paths to then - enrollments: Dictionary of EnrollmentMap objects or paths to then - trials: Dictionary of TrialKey/TrialNdx/SparseTrialKey objects - or paths to then - sparse_trials: load trial keys using the SparseTrialKey class instead - of TrialKey class. - table_sep: Column separator when reading/writting tables + """Class that contains all objects + (segments, recordings, features, class_infos) that + conform a dataset + + Attributes: + segments: SegmentSet object or path to it. + classes: Dictionary of ClassInfo objects or paths to then + recordings: Dictionary of RecordingSet objects or paths to then + features: Dictionary of FeatureSet objects or paths to then + enrollments: Dictionary of EnrollmentMap objects or paths to then + trials: Dictionary of TrialKey/TrialNdx/SparseTrialKey objects + or paths to then + sparse_trials: load trial keys using the SparseTrialKey class instead + of TrialKey class. + table_sep: Column separator when reading/writting tables """ @@ -70,10 +70,12 @@ def __init__( features, FeatureSet ) self._enrollments, self._enrollments_paths = self._parse_dict_args( - enrollments, EnrollmentMap, + enrollments, + EnrollmentMap, ) self._trials, self._trials_paths = self._parse_dict_args( - trials, (TrialKey, TrialNdx, SparseTrialKey), + trials, + (TrialKey, TrialNdx, SparseTrialKey), ) self.sparse_trials = sparse_trials @@ -217,16 +219,41 @@ def save( dataset_path: PathLike, update_paths: bool = True, table_sep: Optional[str] = None, + force_save_all: bool = False, ): - """Saves all the dataset objects. + """Saves the dataset to disk. 
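+
+        By default only tables that are loaded in memory, point to a different
+        path, or are missing from the output directory are re-written
+        (save_changed); force_save_all=True re-writes every table (save_all).
+        A minimal usage sketch, with hypothetical paths:
+
+            dataset = Dataset.load("data/train")
+            dataset.save("data/train")  # writes only new/changed tables
+            dataset.save("data/train", force_save_all=True)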
Args: - dataset_path: str/Path indicating directory - to save the dataset or .yaml file to save + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save the dataset info. - update_paths: whether to update the file_paths in the - data structures in the DateSet object + update_paths: whether to update the file_paths in the + data structures in the DataSet object + force_save_all: forces saving all tables even if they haven't changed, + otherwise, it only saves tables loaded in memory + and those that are not in the datadirectory + """ + if force_save_all: + self.save_all(dataset_path, update_paths, table_sep) + else: + self.save_changed(dataset_path, update_paths, table_sep) + def save_changed( + self, + dataset_path: PathLike, + update_paths: bool = True, + table_sep: Optional[str] = None, + force_save_all: bool = False, + ): + """Saves the tables that change in disk or tables + that are not in the ouput directory. + + Args: + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save + the dataset info. + update_paths: whether to update the file_paths in the + data structures in the DataSet object """ table_sep = self.table_sep if table_sep is None else table_sep if update_paths: @@ -238,12 +265,139 @@ def save( file_name = f"segments{table_ext}" dataset["segments"] = file_name file_path = dataset_dir / file_name - self.segments().save(file_path, sep=table_sep) + if ( + self._segments is not None + or file_path != self._segments_path + or not file_path.exists() + ): + self.segments(keep_loaded=False).save(file_path, sep=table_sep) + if update_paths: + self._segments_path = file_path + + file_names = {} + for k in self._recordings.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._recordings is not None + or file_path != self._recordings_paths[k] + or not file_path.exists() + ): + v = self.recordings_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._recordings_paths[k] = file_path + + if file_names: + dataset["recordings"] = file_names + + file_names = {} + for k in self._features.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._features is not None + or file_path != self._features_paths[k] + or not file_path.exists() + ): + v = self.features_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._features_paths[k] = file_path + + if file_names: + dataset["features"] = file_names + + file_names = {} + for k, v in self._classes.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._classes is not None + or file_path != self._classes_paths[k] + or not file_path.exists() + ): + v = self.classes_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._classes_paths[k] = file_path + + if file_names: + dataset["classes"] = file_names + + file_names = {} + for k, v in self._enrollments.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._enrollments is not None + or file_path != self._enrollments_paths[k] + or not file_path.exists() + ): + v = self.enrollments_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._enrollments_paths[k] = file_path + + if file_names: + dataset["enrollments"] = file_names + + file_names = {} + for k, v in 
self._trials.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._trials is not None + or file_path != self._trials_paths[k] + or not file_path.exists() + ): + v = self.trials_value(k, keep_loaded=False) + v.save(file_path) + if update_paths: + self._trials_paths[k] = file_path + + if file_names: + dataset["trials"] = file_names + + with open(dataset_file, "w") as f: + yaml.dump(dataset, f) + + def save_all( + self, + dataset_path: PathLike, + update_paths: bool = True, + table_sep: Optional[str] = None, + ): + """Saves all the dataset objects. + + Args: + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save + the dataset info. + update_paths: whether to update the file_paths in the + data structures in the DataSet object + """ + table_sep = self.table_sep if table_sep is None else table_sep + if update_paths: + self.table_sep = table_sep + + table_ext = ".tsv" if table_sep == "\t" else ".csv" + dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) + dataset = {} + file_name = f"segments{table_ext}" + dataset["segments"] = file_name + file_path = dataset_dir / file_name + self.segments(keep_loaded=False).save(file_path, sep=table_sep) if update_paths: self._segments_path = file_path file_names = {} - for k, v in self.recordings(): + for k, v in self.recordings(keep_loaded=False): file_name = k + table_ext file_names[k] = file_name file_path = dataset_dir / file_name @@ -255,7 +409,7 @@ def save( dataset["recordings"] = file_names file_names = {} - for k, v in self.features(): + for k, v in self.features(keep_loaded=False): file_name = k + table_ext file_names[k] = file_name file_path = dataset_dir / file_name @@ -267,7 +421,7 @@ def save( dataset["features"] = file_names file_names = {} - for k, v in self.classes(): + for k, v in self.classes(keep_loaded=False): file_name = k + table_ext file_names[k] = file_name file_path = dataset_dir / file_name @@ -279,7 +433,7 @@ def save( dataset["classes"] = file_names file_names = {} - for k, v in self.enrollments(): + for k, v in self.enrollments(keep_loaded=False): file_name = k + table_ext file_names[k] = file_name file_path = dataset_dir / file_name @@ -291,7 +445,7 @@ def save( dataset["enrollments"] = file_names file_names = {} - for k, v in self.trials(): + for k, v in self.trials(keep_loaded=False): file_name = k + table_ext file_names[k] = file_name file_path = dataset_dir / file_name @@ -329,8 +483,8 @@ def load( """Loads all the dataset objects. Args: - dataset_path: str/Path indicating directory - to save the dataset or .yaml file to save + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save the dataset info. lazy: load data structures lazily when they are needed. 
sparse_trials: load trial keys using the SparseTrialKey class instead of TrialKey class @@ -386,34 +540,64 @@ def load( return dataset - # dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) - # with open(dataset_file, "w") as f: - # dataset = yaml.safe_load(f) - - # assert "segments" in dataset - # segments = SegmentSet.load( - # Dataset.resolve_file_path(dataset_dir, dataset["segments"]) - # ) - # classes = None - # recordings = None - # features = None - # if "classes" in dataset: - # classes = {} - # for k, v in dataset["classes"]: - # classes[k] = ClassInfo.load(Dataset.resolve_file_path(dataset_dir, v)) - - # if "recordings" in dataset: - # recordings = {} - # for k, v in dataset["recordings"]: - # recordings[k] = RecordingSet.load( - # Dataset.resolve_file_path(dataset_dir, v) - # ) - - # if "features" in dataset: - # features = {} - # for k, v in dataset["features"]: - # features[k] = FeatureSet.load(Dataset.resolve_file_path(dataset_dir, v)) - - # dataset = cls(segments, classes, recordings, features) - # if not lazy: - # dataset.update_from_disk() + def add_features(self, features_name: str, features: Union[PathLike, FeatureSet]): + if isinstance(features, (str, Path)): + self._features[features_name] = None + self._features_paths[features_name] = features + elif isinstance(features, FeatureSet): + self._features[features_name] = features + self._features_paths[features_name] = None + else: + raise ValueError() + + def add_recordings( + self, + recordings_name: str, + recordings: Union[PathLike, RecordingSet], + ): + if isinstance(features, (str, Path)): + self._recordings[features_name] = None + self._recordings_paths[recordings_name] = recordings + elif isinstance(recordings, RecordingSet): + self._recordings[recordings_name] = recordings + self._recordings_paths[recordings_name] = None + else: + raise ValueError() + + def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]): + if isinstance(classes, (str, Path)): + self._classes[features_name] = None + self._classes_paths[classes_name] = classes + elif isinstance(classes, ClassInfo): + self._classes[classes_name] = classes + self._classes_paths[classes_name] = None + else: + raise ValueError() + + def add_enrollments( + self, + enrollments_name: str, + enrollments: Union[PathLike, EnrollmentMap], + ): + if isinstance(features, (str, Path)): + self._enrollments[features_name] = None + self._enrollments_paths[enrollments_name] = enrollments + elif isinstance(enrollments, EnrollmentMap): + self._enrollments[enrollments_name] = enrollments + self._enrollments_paths[enrollments_name] = None + else: + raise ValueError() + + def add_trials( + self, + trials_name: str, + trials: Union[PathLike, TrialKey, TrialNdx, SparseTrialKey], + ): + if isinstance(features, (str, Path)): + self._trials[features_name] = None + self._trials_paths[trials_name] = trials + elif isinstance(trials, (TrialKey, TrialNdx, SparseTrialKey)): + self._trials[trials_name] = trials + self._trials_paths[trials_name] = None + else: + raise ValueError() diff --git a/hyperion/utils/enrollment_map.py b/hyperion/utils/enrollment_map.py index 024e5b74..4af69144 100644 --- a/hyperion/utils/enrollment_map.py +++ b/hyperion/utils/enrollment_map.py @@ -18,12 +18,13 @@ class EnrollmentMap(InfoTable): """Class to store the mapping between enrollment id - and segmentids + and segmentids """ def __init__(self, df): if "modelid" in df: df.rename(columns={"modelid": "id"}, inplace=True) + assert "segmentid" in df super().__init__(df) def 
split(self, idx, num_parts): @@ -84,3 +85,17 @@ def load(cls, file_path, sep=None): df = pd.read_csv(file_path, sep=sep) return cls(df) + + @classmethod + def cat(cls, tables): + """Concatenates several tables. + + Args: + info_lists: List of InfoTables + + Returns: + InfoTable object concatenation the info_lists. + """ + df_list = [table.df for table in tables] + df = pd.concat(df_list) + return cls(df) diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index 6bcd4aca..45eab05f 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -176,8 +176,8 @@ def split(self, idx, num_parts, group_by=None): return self.__class__(df) @classmethod - def merge(cls, tables): - """Merges several tables. + def cat(cls, tables): + """Concatenates several tables. Args: info_lists: List of InfoTables @@ -187,6 +187,9 @@ def merge(cls, tables): """ df_list = [table.df for table in tables] df = pd.concat(df_list) + assert df[ + "id" + ].is_unique, """there are duplicated ids in the tables we are concatenating""" return cls(df) def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): From 7ea0eb08c2f74d1e57c4c77f1bba15201967f275 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 4 Jun 2023 06:29:50 +0000 Subject: [PATCH 44/89] update lid training for focal loss and hard negative sampling --- ...c2xlsr300m_ecapatdnn512x3_stage1_v6.0.yaml | 68 +++++++++++++++++ ...c2xlsr300m_ecapatdnn512x3_stage1_v6.2.yaml | 71 ++++++++++++++++++ ...c2xlsr300m_ecapatdnn512x3_stage1_v6.3.yaml | 73 +++++++++++++++++++ ...c2xlsr300m_ecapatdnn512x3_stage1_v6.4.yaml | 70 ++++++++++++++++++ ...2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml | 40 ++++++++++ .../v1/global_conf/config_lid_v6.0_13langs.sh | 42 +++++++++++ .../v1/global_conf/config_lid_v6.2_13langs.sh | 42 +++++++++++ .../v1/global_conf/config_lid_v6.3_13langs.sh | 42 +++++++++++ .../v1/global_conf/config_lid_v6.4_13langs.sh | 42 +++++++++++ .../v1/local/initailize_lid_model.py | 6 +- hyperion/bin/finetune_wav2vec2languageid.py | 27 ++++++- hyperion/bin/train_wav2vec2languageid.py | 32 +++++++- hyperion/torch/losses/__init__.py | 1 + hyperion/torch/losses/focal_loss.py | 48 ++++++++++++ .../hf_wav2vec2rnn_film_transducer.py | 1 + 15 files changed, 599 insertions(+), 6 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.2.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.3.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.4.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh create mode 100644 hyperion/torch/losses/focal_loss.py diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.0.yaml new file mode 100644 index 00000000..dc654278 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.0.yaml @@ -0,0 +1,68 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + 
return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.2.yaml new file mode 100644 index 00000000..962af029 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.2.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + loss: weightedCE + loss_weight_exp: 0.5 # 0~1 + # focal_loss_gamma: 2.0 + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.3.yaml new file mode 100644 index 00000000..3918b04f --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.3.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 
'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + num_hard_prototypes: 8 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + num_hard_prototypes: 8 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + loss: weightedCE + loss_weight_exp: 0.5 # 0~1 + # focal_loss_gamma: 2.0 + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.4.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.4.yaml new file mode 100644 index 00000000..17a13388 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.4.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + num_hard_prototypes: 8 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + num_hard_prototypes: 8 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml new file mode 100644 index 00000000..c40bcb1f --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3_1layer_do0.2.yaml @@ -0,0 +1,40 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + 
resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh new file mode 100644 index 00000000..ebbd7fd1 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.0.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v6.0_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0034.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.0.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v6.0_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh new file mode 100644 index 00000000..57fb5d0b --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.2.yaml +nnet_s1_args="" 
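+# v6.2 uses the same XLS-R 300M + ECAPA-TDNN LID model as v6.0, but the stage-1
+# config above trains with a class-weighted cross-entropy
+# (loss: weightedCE, loss_weight_exp: 0.5) instead of plain cross-entropy.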
+nnet_name=${hf_model_name}_resnet1d_v6.2_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0034.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.2.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v6.2_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh new file mode 100644 index 00000000..d1847910 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.3.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v6.3_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0034.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.3.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v6.3_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh new file mode 100644 index 00000000..88190921 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + 
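+# v6.4 keeps the plain cross-entropy of v6.0 but turns on hard-negative language
+# sampling in the stage-1 config below (num_hard_prototypes: 8); v6.3 combines
+# this sampling with the weightedCE loss of v6.2.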
+nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v6.4.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v6.4_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0034.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.4.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v6.4_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/local/initailize_lid_model.py b/egs/commonvoice/v1/local/initailize_lid_model.py index 9a2c1a06..22e32bed 100644 --- a/egs/commonvoice/v1/local/initailize_lid_model.py +++ b/egs/commonvoice/v1/local/initailize_lid_model.py @@ -5,7 +5,7 @@ # LID_model = "exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned_film.v1.0_13_langs_weighted_8000_bpe.s1_initial/model_ep0000.pth" # output_model = "model_initialized.pth" -# python local/initailize_lid_model.py /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0010.pth /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/resnet1d_nnets/wav2vec2xlsr300m_resnet1d_v4.2_13_langs.s1/model_ep0003.pth /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/resnet1d_nnets/wav2vec2xlsr300m_resnet1d_v4.2_13_langs.s3/model_ep0001.pth +# python local/initailize_lid_model.py /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0008.pth /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/resnet1d_nnets/wav2vec2xlsr300m_resnet1d_v6.0_13_langs.s1/model_ep0034.pth /gspvolume/home/ec2-user/hyperion/egs/commonvoice/v1/exp/resnet1d_nnets/wav2vec2xlsr300m_resnet1d_v6.0_13_langs.s3/model_ep0001.pth ASR_model = torch.load(sys.argv[1]) LID_model = torch.load(sys.argv[2]) @@ -16,6 +16,8 @@ def copy_model_parameters(ASR_model, LID_model): ASR_state_dict = ASR_model["model_state_dict"] LID_state_dict = LID_model["model_state_dict"] + + #ASR_state_dict = {name.replace("module.", ""): param for name, param in ASR_state_dict.items()} update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in LID_state_dict and param.shape == LID_state_dict[name].shape and "hf_feats" in name} # remove feature fuser @@ -46,4 +48,4 @@ def copy_model_parameters(ASR_model, LID_model): -copy_model_parameters(ASR_model, LID_model) \ No newline at end of file +copy_model_parameters(ASR_model, LID_model) diff --git a/hyperion/bin/finetune_wav2vec2languageid.py b/hyperion/bin/finetune_wav2vec2languageid.py index 4ac24e98..0403f84c 100755 --- a/hyperion/bin/finetune_wav2vec2languageid.py +++ b/hyperion/bin/finetune_wav2vec2languageid.py @@ -31,6 +31,11 @@ from hyperion.torch.models import HFWav2Vec2ResNet1dLanguageID from torch.nn.utils.rnn import pad_sequence +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + + model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dLanguageID, # "hf_hubert2resnet1d": HFHubert2ResNet1LanguageID, @@ -94,8 +99,9 @@ def init_data(partition, rank, num_gpus, **kwargs): } if num_gpus > 0 else {}) data_loader = 
torch.utils.data.DataLoader(dataset, batch_sampler=sampler, - **largs, - collate_fn=Language_collate) + **largs) + # , + # collate_fn=Language_collate) return data_loader @@ -114,6 +120,21 @@ def init_model(num_classes, in_model_file, rank, model_class, **kwargs): return model +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + if not train_loader.batch_sampler.hard_prototype_mining: + return + + if rank == 0: + logging.info("setting hard prototypes") + + affinity_matrix = model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + if not val_loader.batch_sampler.hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + def train_model(gpu_id, args): config_logger(args.verbose) @@ -138,6 +159,7 @@ def train_model(gpu_id, args): val_loader = init_data(partition="val", **kwargs) model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + init_hard_prototype_mining(model, train_loader, val_loader, rank) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: @@ -148,6 +170,7 @@ def train_model(gpu_id, args): device=device, metrics=metrics, ddp=world_size > 1, + # loss_weight=train_loader.batch_sampler.class_info["weights"], **trn_args, ) trainer.load_last_checkpoint() diff --git a/hyperion/bin/train_wav2vec2languageid.py b/hyperion/bin/train_wav2vec2languageid.py index 7af47d03..680ddd61 100755 --- a/hyperion/bin/train_wav2vec2languageid.py +++ b/hyperion/bin/train_wav2vec2languageid.py @@ -23,6 +23,7 @@ from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.utils import ddp + from hyperion.torch.trainers import LanguageIDTrainer as Trainer from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory @@ -30,6 +31,11 @@ from hyperion.torch.models import HFWav2Vec2ResNet1dLanguageID from torch.nn.utils.rnn import pad_sequence +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + + model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dLanguageID, # "hf_hubert2resnet1d": HFHubert2ResNet1LanguageID, @@ -93,8 +99,9 @@ def init_data(partition, rank, num_gpus, **kwargs): } if num_gpus > 0 else {}) data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, - **largs, - collate_fn=Language_collate) + **largs) + # , + # collate_fn=Language_collate) return data_loader @@ -109,6 +116,23 @@ def init_model(num_classes, rank, model_class, **kwargs): return model + +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + if not train_loader.batch_sampler.hard_prototype_mining: + return + + if rank == 0: + logging.info("setting hard prototypes") + + affinity_matrix = model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + if not val_loader.batch_sampler.hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + def train_model(gpu_id, args): config_logger(args.verbose) @@ -129,6 +153,7 @@ def train_model(gpu_id, args): # device = "cpu" # world_size=1 + # import pdb; pdb.set_trace() train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) @@ -138,14 +163,17 @@ def train_model(gpu_id, args): if rank == 0: logging.info("trainer args={}".format(trn_args)) metrics = {"acc": CategoricalAccuracy()} + # import pdb; pdb.set_trace() trainer = Trainer( model, device=device, 
metrics=metrics, ddp=world_size > 1, + loss_weight=train_loader.batch_sampler.class_info["weights"], **trn_args, ) trainer.load_last_checkpoint() + init_hard_prototype_mining(trainer.model, train_loader, val_loader, rank) trainer.fit(train_loader, val_loader) ddp.ddp_cleanup() diff --git a/hyperion/torch/losses/__init__.py b/hyperion/torch/losses/__init__.py index bf3ce279..55cc2f52 100644 --- a/hyperion/torch/losses/__init__.py +++ b/hyperion/torch/losses/__init__.py @@ -4,3 +4,4 @@ """ from .bce_with_llr import BCEWithLLR +from .focal_loss import FocalLoss \ No newline at end of file diff --git a/hyperion/torch/losses/focal_loss.py b/hyperion/torch/losses/focal_loss.py new file mode 100644 index 00000000..f2a0d32a --- /dev/null +++ b/hyperion/torch/losses/focal_loss.py @@ -0,0 +1,48 @@ +from torch import nn +import torch +from torch.nn import functional as F +import logging +class FocalLoss(nn.Module): + def __init__(self, alpha=0.25, gamma=2, size_average=True): + """ + Focal loss implementation: -alpha(1-yi)**gamma * ce_loss(xi,yi) + + :param alpha: scalar or list. Class weights. If scalar, the same weight applies for all classes. + :param gamma: scalar. Difficult-to-easy sample regulation parameter. + :param size_average: bool. Whether to average the loss over the batch. + :param device: str. Device to place the tensors. + """ + super(FocalLoss,self).__init__() + self.gamma = gamma + self.size_average = size_average + self.alpha = alpha + logging.info("FocalLoss: alpha={}, gamma={}, size_average={}".format(alpha, gamma, size_average)) + + def forward(self, preds, labels): + """ + Compute the focal loss. + + :param preds: Predicted classes. size:[B,N,C] or [B,C] + :param labels: Actual classes. size:[B,N] or [B] + :return: scalar. Loss value. + """ + preds = preds.view(-1, preds.size(-1)) + preds_logsoft = F.log_softmax(preds, dim=1) + preds_softmax = torch.exp(preds_logsoft) + + preds_softmax = preds_softmax.gather(1, labels.view(-1, 1)) + preds_logsoft = preds_logsoft.gather(1, labels.view(-1, 1)) + + if isinstance(self.alpha, torch.Tensor): + alpha = self.alpha.gather(0, labels.view(-1)) + else: # if alpha is a scalar + alpha = self.alpha + + loss = -torch.mul(torch.pow((1 - preds_softmax), self.gamma), preds_logsoft) + + loss = torch.mul(alpha, loss.t()) + if self.size_average: + loss = loss.mean() + else: + loss = loss.sum() + return loss diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py index 513d193c..9ee37287 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_film_transducer.py @@ -12,6 +12,7 @@ from ...tpm import HFWav2Vec2 from .hf_wav2rnn_film_transducer import HFWav2RNNFiLMTransducer from ..transducer import RNNFiLMTransducer +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID class HFWav2Vec2RNNFiLMTransducer(HFWav2RNNFiLMTransducer): """Class for RNN-T with Wav2Vec2 features From aed329beeb9791c5992363a937bfa42f41b1f294 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 4 Jun 2023 20:38:35 +0000 Subject: [PATCH 45/89] update film transducer decoder for original joiner --- .../narchs/rnn_film_transducer_decoder.py | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index e070f70b..e655581a 100644 --- 
a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -168,7 +168,10 @@ def get_config(self): def _rnnt_loss_torchaudio(self, x: torch.Tensor, x_lengths: torch.Tensor, y: torch.Tensor, y_lengths: torch.Tensor, pred_out: torch.Tensor, lang_embedding: torch.Tensor): - logits = self.joiner(x, pred_out, lang_embedding) + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(x, pred_out) + else: + logits = self.joiner(x, pred_out, lang_embedding) # rnnt_loss requires 0 padded targets # Note: y does not start with SOS y_padded = y.pad(mode="constant", padding_value=0) @@ -194,7 +197,10 @@ def _rnnt_loss_k2(self, x: torch.Tensor, x_lengths: torch.Tensor, boundary[:, 2] = y_lengths boundary[:, 3] = x_lengths - logits = self.joiner(x, pred_out, lang_embedding) + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(x, pred_out) + else: + logits = self.joiner(x, pred_out, lang_embedding) with torch.cuda.amp.autocast(enabled=False): loss = k2.rnnt_loss( @@ -257,7 +263,11 @@ def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, # project_input=False since we applied the decoder's input projections # prior to do_rnnt_pruning (this is an optimization for speed). - logits = self.joiner(am_pruned, lm_pruned, lang_embedding, project_input=False) + + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(am_pruned, lm_pruned, project_input=False) + else: + logits = self.joiner(am_pruned, lm_pruned, lang_embedding, project_input=False) with torch.cuda.amp.autocast(enabled=False): @@ -374,7 +384,11 @@ def decode_greedy(self, while t < T and sym_per_utt < max_sym_per_utt: x_t = x[:, t:t + 1, :] - logits = self.joiner(x_t, pred_out, lang_embedding) # (1, 1, 1, vocab_size) + + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(x_t, pred_out) + else: + logits = self.joiner(x_t, pred_out, lang_embedding) # (1, 1, 1, vocab_size) # logits is log_prob = logits.log_softmax(dim=-1) # (1, 1, 1, vocab_size) @@ -442,7 +456,10 @@ def decode_time_sync_beam_search(self, else: pred_out, pred_state = cache[cached_key] - logits = self.joiner(x_t, pred_out, lang_embedding) + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(x_t, pred_out) + else: + logits = self.joiner(x_t, pred_out, lang_embedding) log_prob = logits.log_softmax(dim=-1) # log_prob is (1, 1, 1, vocab_size) log_prob = log_prob.squeeze() @@ -570,7 +587,10 @@ def decode_align_length_sync_beam_search( else: pred_out, pred_state = cache[cached_key] - logits = self.joiner(x_t, pred_out, lang_embedding) + if self.joiner_args["joiner_type"] == "original_joiner": + logits = self.joiner(x_t, pred_out) + else: + logits = self.joiner(x_t, pred_out, lang_embedding) log_prob = logits.log_softmax(dim=-1) # (1, 1, 1, vocab_size) log_prob = log_prob.squeeze() # (vocab_size,) From 5ec0dc71e66dcfacf2cbcebee3594cd9fd25d6c1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 5 Jun 2023 08:07:13 +0000 Subject: [PATCH 46/89] add different loss for lid --- .../bin/train_wav2vec2rnn_film_transducer.py | 3 ++- .../data/class_weighted_seg_chunk_sampler.py | 2 +- hyperion/torch/trainers/languageid_trainer.py | 27 +++++++++++++++++-- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/hyperion/bin/train_wav2vec2rnn_film_transducer.py b/hyperion/bin/train_wav2vec2rnn_film_transducer.py index f06cc684..2306b467 100755 --- 
a/hyperion/bin/train_wav2vec2rnn_film_transducer.py +++ b/hyperion/bin/train_wav2vec2rnn_film_transducer.py @@ -27,6 +27,7 @@ namespace_to_dict) from torch.nn.utils.rnn import pad_sequence + model_dict = { "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, "hf_wav2vec2rnn_filmed_transducer": HFWav2Vec2RNNFiLMTransducer, @@ -225,7 +226,7 @@ def make_parser(model_class): parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") - parser.add_argument("--in-model-file", required=True) + # parser.add_argument("--in-model-file", required=True) model_class.add_class_args(parser, prefix="model") Trainer.add_class_args(parser, diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 8ec63b6f..afb663d5 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -245,7 +245,7 @@ def set_hard_prototypes(self, affinity_matrix): ).indices def get_hard_prototypes(self, class_idx): - return self.hard_prototypes[class_idx].flatten().numpy() + return self.hard_prototypes[class_idx].flatten().cpu().numpy() def _sample_chunk_length(self): if self.var_batch_size: diff --git a/hyperion/torch/trainers/languageid_trainer.py b/hyperion/torch/trainers/languageid_trainer.py index add56c1e..ef252693 100644 --- a/hyperion/torch/trainers/languageid_trainer.py +++ b/hyperion/torch/trainers/languageid_trainer.py @@ -15,6 +15,8 @@ from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset from .torch_trainer import TorchTrainer +# from ..losses.focal_loss import FocalLoss +# from torchvision.ops.focal_loss import sigmoid_focal_loss class LanguageIDTrainer(TorchTrainer): @@ -78,10 +80,17 @@ def __init__( cpu_offload=False, input_key="x", target_key="language", + loss_weight=None, + loss_weight_exp=0.5, ): - if loss is None: + if loss == "CE" or loss is None: loss = nn.CrossEntropyLoss() + elif loss == "weightedCE": + loss = nn.CrossEntropyLoss(weight=torch.tensor(loss_weight.values, dtype=torch.float).to(device)**(-loss_weight_exp)) + logging.info(torch.tensor(loss_weight.values).to(device)**(-loss_weight_exp)) + elif loss == "focal_loss": + loss = FocalLoss(alpha=torch.tensor(focal_weight.values).to(device)**(-loss_weight_exp), gamma=2, size_average=True) super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) @@ -195,6 +204,11 @@ def validation_epoch(self, data_loader, swa_update_bn=False): logs = ODict((log_tag + k, v) for k, v in logs.items()) return logs + @staticmethod + def filter_args(**kwargs): + args = filter_func_args(LanguageIDTrainer.__init__, kwargs) + return args + @staticmethod def add_class_args(parser, prefix=None, train_modes=None, skip=set()): if prefix is not None: @@ -210,7 +224,16 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): parser.add_argument("--target-key", default="language", help="dict. 
key for nnet targets") - + if "loss" not in skip: + parser.add_argument("--loss", + default=None, + choices=["CE", "weightedCE", "focal_loss"], + help="loss function") + if "loss_weight_exp" not in skip: + parser.add_argument("--loss-weight-exp", + default=0.5, + type=float, + help="focal loss weight exponent") if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) From 22920dca86a8c20afc0764564026d4c9826a096e Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 8 Jun 2023 04:10:02 +0000 Subject: [PATCH 47/89] add new training mode: film-ft, and add lid-film-asr system --- ..._wav2vec2rnn_film_transducer_languageid.py | 274 ++++++++++++++++++ hyperion/torch/models/__init__.py | 2 +- .../hf_wav2rnn_film_transducer.py | 13 + .../narchs/rnn_film_transducer_decoder.py | 16 +- 4 files changed, 302 insertions(+), 3 deletions(-) create mode 100755 hyperion/bin/train_wav2vec2rnn_film_transducer_languageid.py diff --git a/hyperion/bin/train_wav2vec2rnn_film_transducer_languageid.py b/hyperion/bin/train_wav2vec2rnn_film_transducer_languageid.py new file mode 100755 index 00000000..d5a6ad6f --- /dev/null +++ b/hyperion/bin/train_wav2vec2rnn_film_transducer_languageid.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import k2 +import numpy as np +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2RNNFiLMTransducer, + HFWav2Vec2RNNTransducerResnet1D, + HFWav2Vec2RNNFiLMTransducerResnet1D) +from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from torch.nn.utils.rnn import pad_sequence + +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + + +model_dict = { + "hf_wav2vec2rnn_film_transducer_resnet1d": HFWav2Vec2RNNFiLMTransducerResnet1D, +} + + +def transducer_language_collate(batch): + audio = [] + audio_length = [] + target = [] + language = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + language.append(record["language"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + language = [language[k] for k in sort_idx] + language = torch.as_tensor(language) + + # FiLM: add language ID to the input + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + "language": language, + } + return batch + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = 
data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=sampler, + **largs, + collate_fn=transducer_language_collate) + return data_loader + +def init_model(blank_id, vocab_size, num_classes, loss_class_weight, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + # TODO: check model_args + model_args["transducer"]["decoder"]["blank_id"] = blank_id + model_args["transducer"]["decoder"]["vocab_size"] = vocab_size + model_args["languageid"]["num_classes"] = num_classes + model_args["loss_class_weight"] = loss_class_weight + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + #torch.backends.cudnn.deterministic = True + #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.enabled = False + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = torch.device("cuda:{}".format(gpu_id)) + # world_size=1 + + # import pdb; pdb.set_trace() + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + # model = init_model_from_transducer(**kwargs) + model = init_model(train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + list(train_loader.dataset.num_classes.values())[0], + train_loader.batch_sampler.class_info["weights"], + **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + # import pdb; pdb.set_trace() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + 
"--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + parser.link_arguments("data.train.dataset.bpe_model", + "data.val.dataset.bpe_model") + + # parser.add_argument("--in-model-file", required=True) + model_class.add_class_args(parser, prefix="model") + + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Train Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index 419ea742..62215e57 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -13,7 +13,7 @@ HFWav2Vec2RNNFiLMTransducer) from .wav2xvectors import (HFHubert2ResNet1dXVector, HFWav2Vec2ResNet1dXVector, HFWavLM2ResNet1dXVector) -from .wav2transducer_languageid import HFWav2Vec2RNNTransducerResnet1D +from .wav2transducer_languageid import HFWav2Vec2RNNTransducerResnet1D, HFWav2Vec2RNNFiLMTransducerResnet1D from .xvectors.efficient_net_xvector import EfficientNetXVector from .xvectors.resnet1d_xvector import ResNet1dXVector from .xvectors.resnet_xvector import ResNetXVector diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py index 3f44c7c5..24efb44e 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -241,6 +241,12 @@ def infer(self, max_sym_per_utt=max_sym_per_utt) return y + def unfreeze_film(self): + for name, param in self.named_parameters(): + if "film" in name: + logging.info(f"unfreezing {name}") + param.requires_grad = True + def freeze_feat_fuser(self): if self.feat_fuser is None: return @@ -266,6 +272,9 @@ def set_train_mode(self, mode): self.unfreeze() elif mode == "frozen": self.freeze() + elif mode in ["ft-film", 
"ft-film-grad"]: + self.freeze() + self.unfreeze_film() elif mode in ["ft-transducer", "ft-transducer-nograd"]: self.unfreeze() self.freeze_hf_feats() @@ -294,8 +303,10 @@ def _train(self, train_mode: str): if train_mode in ["full", "frozen"]: super()._train(train_mode) elif train_mode in [ + "ft-film", "ft-transducer", "hf-feats-frozen", + "ft-film-grad", "ft-transducer-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", @@ -310,8 +321,10 @@ def valid_train_modes(): return [ "full", "frozen", + "ft-film", "ft-embed-affine", "ft-transducer", + "ft-film-grad", "hf-feats-frozen", "ft-transducer-nograd", "hf-feats-frozen-nograd", diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index e655581a..1ccac6a9 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -28,7 +28,7 @@ @dataclass class Hypothesis: - ys: List[int] # predicted sequences + ys: List[int] # lid_pred sequences log_prob: float # log prob of ys # Optional LSTM predictor state. @@ -78,6 +78,7 @@ def __init__( pruned_warmup_steps: int = 2000, langs_size: int = 13, condition_size: int = 64, + film_type: str = "one-hot", ): super().__init__() @@ -96,12 +97,16 @@ def __init__( self.simple_loss_scale = simple_loss_scale self.pruned_warmup_steps = pruned_warmup_steps self.condition_size = condition_size + self.film_type = film_type self._make_predictor() self._make_joiner() # make embedding layer for language id - self.lang_embedding = nn.Embedding(langs_size, condition_size) + if self.film_type == "one-hot": + self.lang_embedding = nn.Embedding(langs_size, condition_size) + elif self.film_type == "lid_pred": + self.lang_embedding = nn.Linear(langs_size, condition_size) if self.rnnt_loss == "k2_pruned": self.simple_am_proj = nn.Linear(in_feats, vocab_size) self.simple_lm_proj = nn.Linear(self.predictor.out_feats, @@ -161,6 +166,7 @@ def get_config(self): "simple_loss_scale": self.simple_loss_scale, "pruned_warmup_steps": self.pruned_warmup_steps, "condition_size": self.condition_size, + "film_type": self.film_type, } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) @@ -815,6 +821,12 @@ def add_class_args(parser, type=int, required=True, help=("condition vector dimension")) + + parser.add_argument("--film-type", + default="one-hot", + choices=["one-hot", "lid_pred"], + help=("type of the condition of FiLM layer")) + parser.add_argument( "--lm-scale", From e12e9f5f4ffe2439d97911129c93fb1f04fa2f99 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 8 Jun 2023 04:15:57 +0000 Subject: [PATCH 48/89] update configuration --- ...2base_rnnt_film_k2_pruned_stage3_v4.0.yaml | 92 +++++++++++++++++++ ...2base_rnnt_film_k2_pruned_stage3_v4.2.yaml | 92 +++++++++++++++++++ ...g_pruned_filmed_transducer_v4.0_13langs.sh | 45 +++++++++ ...g_pruned_filmed_transducer_v4.2_13langs.sh | 45 +++++++++ 4 files changed, 274 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.0_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.0.yaml 
b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.0.yaml new file mode 100644 index 00000000..48ad726c --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.0.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm_residual + joiner: + hid_feats: 512 + feat_fusion_method: film-weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.yaml new file mode 100644 index 00000000..db1005b1 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.0_13langs.sh new file mode 100644 index 00000000..6391fc98 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.0_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0003.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2_13langs.sh new file mode 100644 index 00000000..5de2bb92 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml 
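
For orientation, the class_weighted_random_bucketing_seg_sampler settings in the stage-3 configs above (weight_mode: "data-prior", weight_exponent: 0.3) amount to temperature-flattened language sampling. A minimal sketch of that weighting, assuming the exponent is applied to the per-language data prior and then renormalized; the segment counts below are hypothetical and only for illustration:

    import numpy as np

    # hypothetical segment counts per language (e.g. en, de, ga-IE)
    counts = np.array([500_000, 120_000, 8_000], dtype=np.float64)
    prior = counts / counts.sum()          # data prior per language
    weight_exponent = 0.3                  # value used in the configs above
    weights = prior ** weight_exponent     # flatten the prior toward uniform
    probs = weights / weights.sum()        # per-language sampling probabilities
    print(probs)                           # rare languages get a larger share than their raw prior
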
+ +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.2_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.2.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From 7e1fdf8f84630fd55216da00fc084173e236d80a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 8 Jun 2023 04:24:35 +0000 Subject: [PATCH 49/89] update model --- hyperion/torch/models/wav2transducer_languageid/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hyperion/torch/models/wav2transducer_languageid/__init__.py b/hyperion/torch/models/wav2transducer_languageid/__init__.py index 98ebfdc7..bc785608 100644 --- a/hyperion/torch/models/wav2transducer_languageid/__init__.py +++ b/hyperion/torch/models/wav2transducer_languageid/__init__.py @@ -4,4 +4,5 @@ """ -from .hf_wav2vec2rnn_transducer_languageid import HFWav2Vec2RNNTransducerResnet1D \ No newline at end of file +from .hf_wav2vec2rnn_transducer_languageid import HFWav2Vec2RNNTransducerResnet1D +from .hf_wav2vec2rnn_film_transducer_languageid import HFWav2Vec2RNNFiLMTransducerResnet1D \ No newline at end of file From 4dfe23cd188acef302e5f8f73cf07ea917606296 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 8 Jun 2023 04:25:40 +0000 Subject: [PATCH 50/89] update model for film_transducer_lid --- .../hf_wav2rnn_film_transducer_languageid.py | 578 ++++++++++++++++++ ..._wav2vec2rnn_film_transducer_languageid.py | 171 ++++++ 2 files changed, 749 insertions(+) create mode 100644 hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py create mode 100644 hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py new file mode 100644 index 00000000..d967702a --- /dev/null +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py @@ -0,0 +1,578 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import contextlib +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn 
+from jsonargparse import ActionParser, ArgumentParser + +from ....utils import HypDataClass +from ...torch_model import TorchModel +from ...utils import remove_silence +from ..transducer import RNNTransducer, RNNFiLMTransducer, RNNTransducerOutput +from .hf_wav2rnn_transducer_languageid import RNNTransducerLanguageIDOutput +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID +from ...layer_blocks import FiLM + + +class HFWav2RNNFiLMTransducerLanguageID(TorchModel): + """Abstract Base class for combined transducer language identification models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + transducer: transducer model object. + languageid: language identification model object. + feat_fusion_start: the input to the combined model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__(self, + hf_feats: TorchModel, + transducer: Union[Dict, TorchModel], + languageid: Union[Dict, TorchModel], + feat_fusion_start_transducer: int = 0, + feat_fusion_start_lid: int = 0, + feat_fusion_method_transducer: str = "film-weighted-avg", + feat_fusion_method_lid: str = "weighted-avg", + loss_lid_type: str = "weightedCE", + loss_class_weight: Optional[torch.Tensor] = None, + loss_class_weight_exp= 1.0, + loss_weight_transducer: float = 0.005, + loss_weight_lid: float = 1.0, + lid_length: float = 3.0, + ): + + super().__init__() + self.hf_feats = hf_feats + if isinstance(transducer, dict): + transducer["decoder"]["in_feats"] = hf_feats.hidden_size + #transducer["joiner"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + + transducer["encoder"] = None + transducer = RNNFiLMTransducer(**transducer) + else: + assert isinstance(transducer, RNNFiLMTransducer) + if transducer.encoder is None: + assert transducer.decoder.in_feats == hf_feats.hidden_size + #assert transducer.joiner.in_feats == hf_feats.hidden_size + + if isinstance(languageid, dict): + languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size + if "class_name" in languageid: + del languageid["class_name"] + languageid = ResNet1dLanguageID(**languageid) + else: + assert isinstance(languageid, ResNet1dLanguageID) + assert languageid.encoder_net.in_feats == hf_feats.hidden_size + + + self.transducer = transducer + self.languageid = languageid + self.feat_fusion_start_transducer = feat_fusion_start_transducer + self.feat_fusion_start_lid = feat_fusion_start_lid + self.feat_fusion_method_transducer = feat_fusion_method_transducer + self.feat_fusion_method_lid = feat_fusion_method_lid + self.loss_lid_type = loss_lid_type + self.loss_class_weight = loss_class_weight + self.loss_class_weight_exp = loss_class_weight_exp + + if loss_lid_type == "CE" or loss_lid_type is None: + self.loss_lid = nn.CrossEntropyLoss() + elif loss_lid_type == "weightedCE": + self.loss_lid = nn.CrossEntropyLoss(weight=torch.tensor(loss_class_weight.values, dtype=torch.float)**(-loss_class_weight_exp)) + logging.info(torch.tensor(loss_class_weight.values)**(-loss_class_weight_exp)) + elif loss_lid_type == "focal_loss": + self.loss_lid = FocalLoss(alpha=torch.tensor(loss_class_weight.values)**(-loss_class_weight_exp), gamma=2, size_average=True) + + self.loss_weight_transducer = loss_weight_transducer + self.loss_weight_lid = loss_weight_lid + self.lid_length = lid_length + 
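        # Minimal illustrative sketch of the "weightedCE" option configured above, assuming
        # loss_class_weight is the sampler's class_info["weights"] column (per-language priors);
        # the priors below are hypothetical and only for illustration:
        #
        #   priors = torch.tensor([0.5, 0.01])   # frequent vs. rare language
        #   w = priors ** (-1.0)                 # loss_class_weight_exp = 1.0 (the default above)
        #   # w -> tensor([  2., 100.]); rare languages contribute more to the CE loss
        #   loss_lid = nn.CrossEntropyLoss(weight=w)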
self._hf_context = contextlib.nullcontext() + self.transducer_fuser, self.films = self._make_fuser(self.feat_fusion_method_transducer, self.feat_fusion_start_transducer) + self.languageid_fuser, _ = self._make_fuser(self.feat_fusion_method_lid, self.feat_fusion_start_lid) + + def _make_fuser(self, method, start): + feat_fuser = None + films = None + if method == "last": + return feat_fuser, films + num_layers = self.hf_feats.num_encoder_layers + 1 - start + layer_dim = self.hf_feats.hidden_size + if method == "film-weighted-avg": + films = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size) for _ in range(num_layers)]) + feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif method == "film-fused-feature": + feat_fuser = nn.Parameter(torch.zeros(num_layers)) + film = FiLM(layer_dim, self.transducer.decoder.condition_size) + elif method == "weighted-avg": + feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif method == "linear": + feat_fuser = nn.Linear(num_layers, 1, bias=False) + feat_fuser.weight.data = torch.ones(1, + num_layers) / num_layers + elif method == "cat": + feat_fuser = nn.Linear(num_layers * layer_dim, + layer_dim, + bias=False) + + return feat_fuser, films + + def _fuse_transducer_hid_feats(self, hid_feats, lang): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. + lang: language id Tensor. + + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + lang_condition = self.transducer.decoder.lang_embedding(lang) + hid_feats = hid_feats[self.feat_fusion_start_transducer:] + if self.feat_fusion_method_transducer == "film-weighted-avg": + film_hid_feats = tuple(self.films[i](hid_feats[i], lang_condition) for i in range(len(self.films))) + film_hid_feats = torch.stack(film_hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) + feats = torch.sum(film_hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method_transducer == "film-fused-feature": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + feats = self.films(feats, lang_condition) + elif self.feat_fusion_method_transducer == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method_transducer == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.transducer_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method_transducer == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.transducer_fuser(hid_feats) + elif self.feat_fusion_method_transducer == "last": + feats = hid_feats[-1] + + return feats + + + def _fuse_lid_hid_feats(self, hid_feats): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. 
+ + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + hid_feats = hid_feats[self.feat_fusion_start_lid:] + if self.feat_fusion_method_lid == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.languageid_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method_lid == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.languageid_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method_lid == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.languageid_fuser(hid_feats) + elif self.feat_fusion_method_lid == "last": + feats = hid_feats[-1] + + return feats + + def forward_lid_feats(self, + x, + x_lengths, + lang=None, + return_feat_layers=None, + chunk_length=0, + detach_chunks=False): + with self._hf_context: + hf_output = self.hf_feats( + x, + x_lengths, + return_hid_states=True, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) + feat_lengths = hf_output["hidden_states_lengths"] + + hid_feats = hf_output["hidden_states"] + feats = self._fuse_lid_hid_feats(hid_feats) + + + feats = feats.transpose(1, 2) + + return feats, hid_feats, feat_lengths + + def forward( + self, + x, + x_lengths=None, + text=None, + languageid=None, + return_feat_layers=None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, + ): + """Forward function. If returns the logits posteriors of the classes. + It can also returns the hidden representations in the wav2vec feature extractor, + the x-vector encoder and the + classification head. In this case the ouput variable is a dictionary. + + Args: + x: input features tensor with shape=(batch, in_feats, time) + x_lengths: time lengths of the features with shape=(batch,) + y: target classes torch.long tensor with shape=(batch,) + return_feat_layers: list of integers indicating, which wav2vec layers + we should return. If None, no wav2vec layers are returned. + return_enc_layers: list of integers indicating, which encoder layers + we should return. If None, no encoder layers are returned. + return_logits: if True, it adds the logits to the output dictionary. 
+ Returns: + Dataclass with losses, "h_enc" (list of hidden encoder layers), + "h_feats" (wav2vec features) + """ + feats_languageid, hid_feats, feat_lengths = self.forward_lid_feats( + x, x_lengths, return_feat_layers) + + lid_len = int(self.lid_length * 50) + min_len = torch.min(feat_lengths).item() + if min_len > lid_len: + lid_start = torch.randint(0, min_len - lid_len + 1, (1,)).item() + feats_languageid = feats_languageid[:, :, lid_start: lid_start + lid_len] + + + lid_logits = self.languageid( + feats_languageid, + None, + languageid, + return_enc_layers=return_enc_layers, + return_classif_layers=return_classif_layers, + return_logits=return_logits, + ) + + loss_lid = self.loss_lid(lid_logits, languageid) + + + feats_transducer = self._fuse_transducer_hid_feats(hid_feats, lid_logits) # (N, T, C) + + trans_output = self.transducer( + feats_transducer, + feat_lengths, + text, + lid_logits + ) + + if return_feat_layers: + trans_output.h_feats = [ + f.transpose(1, 2) for i, f in enumerate(hid_feats) + if i in return_feat_layers + ] + output = RNNTransducerLanguageIDOutput(loss=self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid, + loss_transducer=trans_output.loss, + loss_lid=loss_lid, + loss_transducer_simple=trans_output.loss_simple, + loss_transducer_pruned=trans_output.loss_pruned, + h_feats=trans_output.h_feats, + logits=lid_logits if return_logits else None) + return output + + def infer(self, + x: torch.Tensor, + x_lengths: torch.Tensor, + decoding_method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000): + """ + ASR tokens inference + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search + max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. + max_sym_per_utt: maximimum number of symbols in a single utterance. + Returns: + List of list of integer indexes of the recognizer's symbols. 
+ """ + + + feats_languageid, hid_feats, feat_lengths = self.forward_lid_feats( + x, x_lengths, return_feat_layers) + + + lid = self.languageid( + feats_languageid.float(), + feat_lengths, + None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, + ) + + feats_transducer = self._fuse_transducer_hid_feats(hid_feats, lid) # (N, T, C) + + + text = self.transducer.infer(feats_transducer, + feat_lengths, + decoding_method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + + return text, lid + + # def freeze_feat_fuser(self): + # if self.feat_fuser is None: + # return + + # if self.feat_fusion_method_transducer == "weighted-avg": + # self.feat_fuser.requires_grad = False + # return + + # for param in self.feat_fuser.parameters(): + # param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode in ["ft-transducer", "ft-transducer-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_feat_fuser() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + else: + raise ValueError(f"invalid train_mode={mode}") + + logging.info("train mode set to %s", mode) + + if "nograd" in mode: + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext() + + self._train_mode = mode + + def _train(self, train_mode: str): + + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode in [ + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ]: + self.hf_feats.train() + self.transducer._train("full") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "hf_feats", + "transducer", + "feat_fusion_start_transducer", + "feat_fusion_start_lid", + "feat_fusion_method_transducer", + "feat_fusion_method_lid", + "loss_lid_type", + "loss_class_weight", + "loss_class_weight_exp", + "loss_weight_transducer", + "loss_weight_lid", + "languageid", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + def get_config(self): + hf_cfg = self.hf_feats.get_config() + tran_cfg = self.transducer.get_config() + lid_cfg = self.languageid.get_config() + del hf_cfg["class_name"] + del tran_cfg["class_name"] + del lid_cfg["class_name"] + config = { + "hf_feats": hf_cfg, + "transducer": tran_cfg, + "languageid": lid_cfg, + "feat_fusion_start_transducer": self.feat_fusion_start_transducer, + "feat_fusion_start_lid": self.feat_fusion_start_lid, + "feat_fusion_method_transducer": self.feat_fusion_method_transducer, + "feat_fusion_method_lid": self.feat_fusion_method_lid, + "loss_lid_type": self.loss_lid_type, + "loss_class_weight": self.loss_class_weight, + "loss_class_weight_exp": self.loss_class_weight_exp, + 
"loss_weight_transducer": self.loss_weight_transducer, + "loss_weight_lid": self.loss_weight_lid, + "lid_length": self.lid_length, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def change_config(self, hf_feats, transducer, languageid): + logging.info("changing hf wav2transducer config") + self.hf_feats.change_config(**hf_feats) + self.transducer.change_config(**transducer) + self.languageid.change_config(**languageid) + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--feat-fusion-start-transducer", + default=0, + type=int, + help=""" + the input to transducer model will fuse the wav2vec + layers from feat_fusion_start_transducer to + the wav2vec num_layers""", + ) + parser.add_argument( + "--feat-fusion-start-lid", + default=0, + type=int, + help=""" + the input to lid model will fuse the wav2vec + layers from feat_fusion_start_lid to + the wav2vec num_layers""", + ) + + parser.add_argument( + "--feat-fusion-method-transducer", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]"), + ) + parser.add_argument( + "--feat-fusion-method-lid", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]"), + ) + + parser.add_argument( + "--loss-lid-type", + default="weightedCE", + choices=["CE", "weightedCE", "focal_loss"], + help=("loss type for language identification"), + ) + parser.add_argument( + "--loss-class-weight", + default=None, + type=str, + help=("class weight for language identification"), + ) + parser.add_argument( + "--loss-class-weight-exp", + default=1.0, + type=float, + help=("class weight exponent for language identification"), + ) + parser.add_argument( + "--loss-weight-transducer", + default=0.005, + type=float, + help=""" + The weight of the transducer loss + """, + ) + + parser.add_argument( + "--loss-weight-lid", + default=1.0, + type=float, + help=""" + The weight of the lid loss + """, + ) + + parser.add_argument( + "--lid-length", + default=3.0, + type=float, + help=""" + The length of the chunks for language id + """, + ) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) + + @staticmethod + def add_infer_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNFiLMTransducer.add_infer_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_infer_args(**kwargs): + return RNNFiLMTransducer.filter_infer_args(**kwargs) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py new file mode 100644 index 00000000..e012f17a --- /dev/null +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py @@ -0,0 +1,171 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Dict, Optional, Union + +import torch +import 
torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFWav2Vec2 +from ..transducer import RNNFiLMTransducer +from ..xvectors import ResNet1dXVector as ResNet1dLanguageID +from ..wav2languageid import HFWav2Vec2ResNet1dLanguageID +from ..wav2transducer import HFWav2Vec2RNNFiLMTransducer + + +from .hf_wav2rnn_film_transducer_languageid import HFWav2RNNFiLMTransducerLanguageID + + +class HFWav2Vec2RNNFiLMTransducerResnet1D(HFWav2RNNFiLMTransducerLanguageID): + """Class for RNN-T with Wav2Vec2 features + + Attributes: + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + transducer: Transducer configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, RNNFiLMTransducer], + languageid: Union[Dict, ResNet1dLanguageID], + feat_fusion_start_transducer: int = 0, + feat_fusion_start_lid: int = 0, + feat_fusion_method_transducer: str = "weighted-avg", + feat_fusion_method_lid: str = "weighted-avg", + loss_lid_type: str = "weightedCE", + loss_class_weight: Optional[torch.Tensor] = None, + loss_class_weight_exp: float = 1.0, + loss_weight_transducer: float = 0.005, + loss_weight_lid: float = 1.0, + lid_length: float = 3.0, + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + # if isinstance(languageid, dict): + # languageid["resnet_enc"]["in_feats"] = hf_feats.hidden_size + # if "class_name" in languageid: + # del languageid["class_name"] + # languageid = ResNet1dLanguageID(**languageid) + # else: + # assert isinstance(languageid, ResNet1dLanguageID) + # assert languageid.encoder_net.in_feats == hf_feats.hidden_size + + # hf_feats = wav2transducer.hf_feats + # transducer = wav2transducer.transducer + # languageid = wav2languageid.languageid + + + super().__init__(hf_feats, transducer, languageid, + feat_fusion_start_transducer=feat_fusion_start_transducer, + feat_fusion_start_lid=feat_fusion_start_lid, + feat_fusion_method_transducer=feat_fusion_method_transducer, + feat_fusion_method_lid=feat_fusion_method_lid, + loss_lid_type=loss_lid_type, + loss_class_weight=loss_class_weight, + loss_class_weight_exp=loss_class_weight_exp, + loss_weight_transducer=loss_weight_transducer, + loss_weight_lid=loss_weight_lid, + lid_length=lid_length) + + + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2RNNFiLMTransducerLanguageID.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNFiLMTransducer.filter_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + child_args = ResNet1dLanguageID.filter_args(**kwargs["languageid"]) + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + RNNFiLMTransducer.add_class_args(parser, prefix="transducer") + # HFWav2RNNFiLMTransducer.add_class_args(parser) + ResNet1dLanguageID.add_class_args(parser, 
prefix="languageid") + HFWav2RNNFiLMTransducerLanguageID.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + + valid_args = ( + "loss_weight_transducer", + "loss_weight_lid", + "lid_length", + ) + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNFiLMTransducer.filter_finetune_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + child_args = ResNet1dLanguageID.filter_finetune_args(**kwargs["languageid"]) + base_args["languageid"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--loss-weight-transducer", + default=0.005, + type=float, + help=""" + The weight of the transducer loss + """, + ) + + parser.add_argument( + "--loss-weight-lid", + default=1.0, + type=float, + help=""" + The weight of the lid loss + """, + ) + + parser.add_argument( + "--lid-length", + default=3.0, + type=float, + help=""" + The length of the chunks for language id + """, + ) + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + RNNFiLMTransducer.add_finetune_args(parser, prefix="transducer") + ResNet1dLanguageID.add_finetune_args(parser, prefix="languageid") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) From 9e59d74fdd3b6ee36572286aa4f38637a8bb0c8e Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 9 Jun 2023 04:03:21 +0000 Subject: [PATCH 51/89] add activation option for film --- hyperion/torch/layer_blocks/film_blocks.py | 24 +++++++++++++++---- .../layer_blocks/transducer_film_joiner.py | 5 ++-- .../layer_blocks/transducer_film_predictor.py | 8 +++++-- .../hf_wav2rnn_film_transducer.py | 6 ++--- .../narchs/rnn_film_transducer_decoder.py | 24 ++++++++++++++----- 5 files changed, 48 insertions(+), 19 deletions(-) diff --git a/hyperion/torch/layer_blocks/film_blocks.py b/hyperion/torch/layer_blocks/film_blocks.py index 9503fcfe..7d22416f 100644 --- a/hyperion/torch/layer_blocks/film_blocks.py +++ b/hyperion/torch/layer_blocks/film_blocks.py @@ -2,10 +2,20 @@ import torch.nn as nn class FiLM(nn.Module): - def __init__(self, input_size, condition_size): + def __init__(self, input_size, condition_size, film_type="linear"): # condition_size: the size of the language id vector # input_size: the size of the RNN input to the FiLM layer super(FiLM, self).__init__() + # if film_type == "tanh": + # self.linear_scale = nn.Sequential( + # nn.Linear(condition_size, input_size), + # nn.Tanh() + # ) + # self.linear_shift = nn.Sequential( + # nn.Linear(condition_size, input_size), + # nn.Tanh() + # ) + # elif film_type == "linear": self.linear_scale = nn.Linear(condition_size, input_size) self.linear_shift = nn.Linear(condition_size, input_size) @@ -24,7 +34,7 @@ def forward(self, x, lang_condition): class RNNWithFiLM(nn.Module): - def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm"): + def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm", film_type="tanh"): super(RNNWithFiLM, self).__init__() self.input_size = input_size self.hidden_size = hidden_size @@ -32,11 +42,14 @@ def __init__(self, input_size, hidden_size, num_layers, dropout, 
condition_size, self.dropout = dropout self.batch_first = batch_first self.rnn_type = rnn_type + self.film_type = film_type if self.rnn_type == "lstm": self.lstms = nn.ModuleList([nn.LSTM(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) elif self.rnn_type == "gru": self.grus = nn.ModuleList([nn.GRU(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) - self.films = nn.ModuleList([FiLM(hidden_size, condition_size) for _ in range(num_layers)]) + + self.films = nn.ModuleList([FiLM(hidden_size, condition_size, film_type) for _ in range(num_layers)]) + self.dropout_layer = nn.Dropout(dropout) def forward(self, x, states, lang_condition): @@ -64,7 +77,7 @@ def forward(self, x, states, lang_condition): class RNNWithFiLMResidual(nn.Module): - def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm_residual"): + def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm_residual", film_type="linear"): super(RNNWithFiLMResidual, self).__init__() self.input_size = input_size self.hidden_size = hidden_size @@ -76,7 +89,8 @@ def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, self.lstms = nn.ModuleList([nn.LSTM(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) elif self.rnn_type == "gru_residual": self.grus = nn.ModuleList([nn.GRU(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) - self.films = nn.ModuleList([FiLM(hidden_size, condition_size) for _ in range(num_layers)]) + self.film_type = film_type + self.films = nn.ModuleList([FiLM(hidden_size, condition_size, film_type) for _ in range(num_layers)]) self.dropout_layer = nn.Dropout(dropout) def forward(self, x, states, lang_condition): diff --git a/hyperion/torch/layer_blocks/transducer_film_joiner.py b/hyperion/torch/layer_blocks/transducer_film_joiner.py index 7fdae60d..02a9dfdf 100644 --- a/hyperion/torch/layer_blocks/transducer_film_joiner.py +++ b/hyperion/torch/layer_blocks/transducer_film_joiner.py @@ -21,7 +21,7 @@ class TransducerFiLMJoiner(nn.Module): vocab_size: vocabulary size """ - def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: int, condition_size: int): + def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: int, condition_size: int, film_type: str = "linear"): super().__init__() self.enc_feats = enc_feats @@ -32,8 +32,7 @@ def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: self.enc_proj = nn.Linear(enc_feats, hid_feats) self.pred_proj = nn.Linear(pred_feats, hid_feats) self.output = nn.Linear(hid_feats, vocab_size) - - self.film = FiLM(hid_feats, condition_size) + self.film = FiLM(hid_feats, condition_size, film_type) def get_config(self): config = { diff --git a/hyperion/torch/layer_blocks/transducer_film_predictor.py b/hyperion/torch/layer_blocks/transducer_film_predictor.py index cb628a2c..dc7a7ae4 100644 --- a/hyperion/torch/layer_blocks/transducer_film_predictor.py +++ b/hyperion/torch/layer_blocks/transducer_film_predictor.py @@ -38,6 +38,7 @@ def __init__(self, embed_dropout_rate: float = 0.0, rnn_dropout_rate: float = 0.0, rnn_type: str = "lstm", + film_type: str = "linear", blank_id: int = 0): super().__init__() self.embedding = nn.Embedding( @@ -54,7 +55,8 @@ def __init__(self, 
dropout=rnn_dropout_rate, condition_size=condition_size, batch_first=True, - rnn_type=rnn_type + rnn_type=rnn_type, + film_type=film_type ) elif rnn_type in ["lstm_residual","gru_residual"]: self.rnn = RNNWithFiLMResidual( @@ -64,7 +66,8 @@ def __init__(self, dropout=rnn_dropout_rate, condition_size=condition_size, batch_first=True, - rnn_type=rnn_type + rnn_type=rnn_type, + film_type=film_type ) else: raise Exception(f"Unknown RNN type {rnn_type}") @@ -97,6 +100,7 @@ def get_config(self): "embed_dropout_rate": self.embed_dropout_rate, "rnn_dropout_rate": self.rnn_dropout_rate, "rnn_type": self.rnn_type, + "film_type": self.film_type, "blank_id": self.blank_id, } return config diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py index 24efb44e..84f2239c 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -65,11 +65,11 @@ def _make_fuser(self): num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start layer_dim = self.hf_feats.hidden_size if self.feat_fusion_method == "film-weighted-avg": - self.films = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size) for _ in range(num_layers)]) + self.films = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size, self.transducer.decoder.film_type) for _ in range(num_layers)]) self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif self.feat_fusion_method == "film-fused-feature": self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) - self.film = FiLM(layer_dim, self.transducer.decoder.condition_size) + self.film = FiLM(layer_dim, self.transducer.decoder.condition_size, self.transducer.decoder.film_type) elif self.feat_fusion_method == "weighted-avg": self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif self.feat_fusion_method == "linear": @@ -251,7 +251,7 @@ def freeze_feat_fuser(self): if self.feat_fuser is None: return - if self.feat_fusion_method == "weighted-avg": + if self.feat_fusion_method in ["weighted-avg", "film-weighted-avg", "film-fused-feature"]: self.feat_fuser.requires_grad = False return diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 1ccac6a9..6a5c0845 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -78,7 +78,8 @@ def __init__( pruned_warmup_steps: int = 2000, langs_size: int = 13, condition_size: int = 64, - film_type: str = "one-hot", + film_cond_type: str = "one-hot", + film_type: str = "linear", ): super().__init__() @@ -97,15 +98,16 @@ def __init__( self.simple_loss_scale = simple_loss_scale self.pruned_warmup_steps = pruned_warmup_steps self.condition_size = condition_size + self.film_cond_type = film_cond_type self.film_type = film_type self._make_predictor() self._make_joiner() # make embedding layer for language id - if self.film_type == "one-hot": + if self.film_cond_type == "one-hot": self.lang_embedding = nn.Embedding(langs_size, condition_size) - elif self.film_type == "lid_pred": + elif self.film_cond_type == "lid_pred": self.lang_embedding = nn.Linear(langs_size, condition_size) if self.rnnt_loss == "k2_pruned": self.simple_am_proj = nn.Linear(in_feats, vocab_size) @@ -140,7 +142,7 @@ def _make_joiner(self): pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] 
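+            # FiLM conditioning (sketch): given joiner activations x and a
+            # language condition vector c of size condition_size, the FiLM block
+            # computes roughly
+            #   y = linear_scale(c) * x + linear_shift(c)
+            # i.e. a per-feature affine modulation of x by the language embedding.
+            # film_type selects plain Linear scale/shift projections vs. Linear
+            # followed by Tanh (the Tanh variant is enabled later in this series).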
self.joiner = FiLMJoiner(self.in_feats, pred_feats, hid_feats, - self.vocab_size, self.condition_size) + self.vocab_size, self.condition_size, self.film_type) elif joiner_type == "original_joiner": pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] @@ -166,6 +168,7 @@ def get_config(self): "simple_loss_scale": self.simple_loss_scale, "pruned_warmup_steps": self.pruned_warmup_steps, "condition_size": self.condition_size, + "film_cond_type": self.film_cond_type, "film_type": self.film_type, } base_config = super().get_config() @@ -722,6 +725,12 @@ def add_pred_args(parser): help= """type of recurrent network for thep predictor in [lstm, gru]""") + pred_parser.add_argument("--film-type", + default="linear", + choices=["linear", "tanh"], + help=("type of the FiLM layer")) + + pred_parser.add_argument("--num-layers", default=2, type=int, @@ -822,12 +831,15 @@ def add_class_args(parser, required=True, help=("condition vector dimension")) - parser.add_argument("--film-type", + parser.add_argument("--film-cond-type", default="one-hot", choices=["one-hot", "lid_pred"], help=("type of the condition of FiLM layer")) - + parser.add_argument("--film-type", + default="linear", + choices=["linear", "tanh"], + help=("type of the FiLM layer")) parser.add_argument( "--lm-scale", default=0.25, From 1f56469f639ecabc80de1bd57f8d66d70d236809 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 9 Jun 2023 04:06:52 +0000 Subject: [PATCH 52/89] add configuration --- ...ase_rnnt_film_k2_pruned_stage3_v4.2.1.yaml | 92 +++++++++++++++++++ ...ase_rnnt_film_k2_pruned_stage4_v4.2.1.yaml | 76 +++++++++++++++ ...pruned_filmed_transducer_v4.2.1_13langs.sh | 45 +++++++++ 3 files changed, 213 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.1.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.1.yaml new file mode 100644 index 00000000..d6c995e8 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.1.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
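+      # Sampler notes (these options recur in all the configs below):
+      # max_batch_length / max_audio_length appear to be seconds of audio per
+      # batch / per utterance, weight_exponent (0.3) smooths the per-language
+      # sampling weights derived from the data prior, and num_chunks_per_seg_epoch
+      # sets what fraction of the data defines one epoch.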
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml new file mode 100644 index 00000000..aaf5dedb --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15 + max_audio_length: 12. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15 + max_audio_length: 12. 
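+      # Stage-4 note: the model section below only overrides the decoder pruning
+      # options; the rest of the model is presumably loaded from the stage-3 FiLM
+      # checkpoint, and train_mode: full presumably also unfreezes the wav2vec2
+      # encoder (stage 3 used hf-feats-frozen-nograd).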
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + transducer: + decoder: + prune_range: 15 + reduction: mean +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh new file mode 100644 index 00000000..d209d421 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.2.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.2.1_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0012.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.2.1.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From 35cb6f3b7c043a97ca1952b6dc437df85ddbc20a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 11 Jun 2023 13:34:40 +0000 Subject: [PATCH 53/89] add config for film-asr-lid model --- ...uned_filmed_transducer_lid_v1.0_13langs.sh | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh new file mode 100644 index 00000000..8d9e95d3 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh @@ -0,0 +1,43 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model 
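+# (Note: the WavLM description above appears to be a leftover recipe header;
+# this config, like the others in this series, uses wav2vec2-xls-r-300m.)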
+hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_film_transducer_resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0007.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From 250bacf66edd358dad77751c6b3010f720b0a919 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 13 Jun 2023 07:21:15 +0000 Subject: [PATCH 54/89] add new configs for film model --- ...2base_rnnt_film_k2_pruned_stage3_v4.3.yaml | 92 ++++++++++++++++++ ...2base_rnnt_film_k2_pruned_stage3_v4.4.yaml | 94 +++++++++++++++++++ hyperion/torch/layer_blocks/film_blocks.py | 24 ++--- hyperion/torch/trainers/torch_trainer.py | 3 + 4 files changed, 201 insertions(+), 12 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.3.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.3.yaml new file mode 100644 index 00000000..e436c876 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.3.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
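+      # v4.3 (below): switches feature fusion to film-fused-feature, raises the
+      # learning rate to 1e-3, and trains with train_mode: ft-film, which
+      # presumably updates only the FiLM conditioning parameters while the rest
+      # of the network stays frozen.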
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-fused-feature + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml new file mode 100644 index 00000000..72a4c6a6 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml @@ -0,0 +1,94 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
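+      # v4.4 (below): sets film_type: tanh for both the decoder and the
+      # predictor, matching the Tanh-wrapped scale/shift projections enabled in
+      # film_blocks.py in this same commit, and goes back to film-weighted-avg
+      # feature fusion.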
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + film_type: tanh + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + film_type: tanh + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/hyperion/torch/layer_blocks/film_blocks.py b/hyperion/torch/layer_blocks/film_blocks.py index 7d22416f..00ee1a10 100644 --- a/hyperion/torch/layer_blocks/film_blocks.py +++ b/hyperion/torch/layer_blocks/film_blocks.py @@ -6,18 +6,18 @@ def __init__(self, input_size, condition_size, film_type="linear"): # condition_size: the size of the language id vector # input_size: the size of the RNN input to the FiLM layer super(FiLM, self).__init__() - # if film_type == "tanh": - # self.linear_scale = nn.Sequential( - # nn.Linear(condition_size, input_size), - # nn.Tanh() - # ) - # self.linear_shift = nn.Sequential( - # nn.Linear(condition_size, input_size), - # nn.Tanh() - # ) - # elif film_type == "linear": - self.linear_scale = nn.Linear(condition_size, input_size) - self.linear_shift = nn.Linear(condition_size, input_size) + if film_type == "tanh": + self.linear_scale = nn.Sequential( + nn.Linear(condition_size, input_size), + nn.Tanh() + ) + self.linear_shift = nn.Sequential( + nn.Linear(condition_size, input_size), + nn.Tanh() + ) + elif film_type == "linear": + self.linear_scale = nn.Linear(condition_size, input_size) + self.linear_shift = nn.Linear(condition_size, input_size) def forward(self, x, lang_condition): # import pdb; pdb.set_trace() diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 7ae7c50e..f98ff2b9 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -245,6 +245,9 @@ def fit(self, train_data, val_data=None): val_logs = {} self.loggers.on_train_begin(epochs=self.epochs) + if self.cur_epoch == 0: + self.save_checkpoint() + # exit() for epoch in range(self.cur_epoch, self.epochs): self.set_epoch(train_data) self.loggers.on_epoch_begin(epoch, batches=len(train_data)) From 410100c482fb4b1a4b8f98bc44b5e0da57f58f7c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 13 Jun 2023 07:27:08 +0000 Subject: [PATCH 55/89] update config --- ...g_pruned_filmed_transducer_v4.3_13langs.sh | 45 +++++++++++++++++++ ...g_pruned_filmed_transducer_v4.4_13langs.sh | 45 +++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.3_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.4_13langs.sh diff --git 
a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.3_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.3_13langs.sh new file mode 100644 index 00000000..0134e84f --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.3_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.3_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.3.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.4_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.4_13langs.sh new file mode 100644 index 00000000..99b5d16c --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.4_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.4_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.4.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 
+nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From 2a8c3c4c81d37ac3e34ee5a1098553b63abfc0e5 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 18 Jun 2023 11:35:28 +0000 Subject: [PATCH 56/89] update config for film ASR --- ...c2xlsr300m_ecapadnn1024x3_stage1_v5.1.yaml | 65 +++++++++++++++++++ ...g_pruned_filmed_transducer_v5.1_13langs.sh | 45 +++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v5.1.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v5.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v5.1.yaml new file mode 100644 index 00000000..a15272d4 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v5.1.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 128 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3_do0.5.yaml +trainer: + optim: + opt_type: sgd + lr: 0.15 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 4200 + #hold_steps: 1500 + decay_steps: 16000 + hold_steps: 32000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh new file mode 100644 index 00000000..ab3d1ec8 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model 
+bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v5.1_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v5.1.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v5.1.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From 19078699acbf5eb48dfa34aaebb5ed7eea8a58f0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 18 Jun 2023 17:12:32 +0000 Subject: [PATCH 57/89] add config for film ASR --- ...2base_rnnt_film_k2_pruned_stage3_v5.1.yaml | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml new file mode 100644 index 00000000..8947cfd0 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
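+      # v5.1 (below): reduces the FiLM condition_size from 256 to 128 and uses a
+      # longer hold (hold_steps: 90000) with a gentler decay (decay_rate: 0.8),
+      # still fine-tuning only the FiLM parameters (train_mode: ft-film).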
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-fused-feature + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.8 + decay_steps: 45000 + hold_steps: 90000 + min_lr: 4e-5 + warmup_steps: 3000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + From 87822f6d5522ab51ee58fa464b97403711df63be Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 23 Jun 2023 02:03:11 +0000 Subject: [PATCH 58/89] add bias initialization --- ...2base_rnnt_film_k2_pruned_stage3_v5.6.yaml | 92 +++++++++++++++++++ ...g_pruned_filmed_transducer_v5.6_13langs.sh | 45 +++++++++ .../v1/local/initailize_film_model_bias.py | 67 ++++++++++++++ 3 files changed, 204 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.6.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.6_13langs.sh create mode 100644 egs/commonvoice/v1/local/initailize_film_model_bias.py diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.6.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.6.yaml new file mode 100644 index 00000000..a3f25ffd --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.6.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. 
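+      # v5.6 (below) drops the FiLM fine-tuning LR to 1e-4 and appears intended
+      # to start from a checkpoint produced by local/initailize_film_model_bias.py
+      # (added later in this patch), which copies a pretrained non-FiLM transducer
+      # into the FiLM model and initializes the FiLM projections to the identity
+      # (scale weight 0, scale bias 1, shift weight/bias 0).
+      # Usage sketch:
+      #   python local/initailize_film_model_bias.py \
+      #     <pretrained_model.pth> <film_model_init.pth> <output_model.pth>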
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-fused-feature + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.8 + decay_steps: 45000 + hold_steps: 40000 + min_lr: 4e-5 + warmup_steps: 3000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.6_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.6_13langs.sh new file mode 100644 index 00000000..f0db5fb6 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.6_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.6.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v5.6_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0003.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v5.6.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage5_v5.6.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/local/initailize_film_model_bias.py b/egs/commonvoice/v1/local/initailize_film_model_bias.py new file mode 100644 index 00000000..6abedf57 --- /dev/null +++ b/egs/commonvoice/v1/local/initailize_film_model_bias.py @@ -0,0 +1,67 @@ +import torch +import sys + +# arguments example +# pretrained_model = 'exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe.s2/model_ep0010.pth' +# film_model = 
"exp/transducer_nnets/wav2vec2xlsr300m_rnnt_k2_pruned_film.v1.0_13_langs_weighted_8000_bpe.s1_initial/model_ep0000.pth" +# output_model = "model_initialized.pth" + +pretrained_model = torch.load(sys.argv[1]) +film_model = torch.load(sys.argv[2]) + +output_model = sys.argv[3] + + +def update_film_lstm_parameters(film_state_dict, pretrained_state_dict): + for i in range(2): + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".weight_ih_l0"] = pretrained_state_dict['module.transducer.decoder.predictor.rnn.weight_ih_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".weight_hh_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.weight_hh_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".bias_ih_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.bias_ih_l' + str(i)].clone() + film_state_dict["module.transducer.decoder.predictor.rnn.lstms." + str(i) + ".bias_hh_l0"]= pretrained_state_dict['module.transducer.decoder.predictor.rnn.bias_hh_l' + str(i)].clone() + return film_state_dict + +def copy_model_parameters(pretrained_model, film_model): + pretrained_state_dict = pretrained_model["model_state_dict"] + film_state_dict = film_model["model_state_dict"] + update_state_dict = {name: param for name, param in pretrained_state_dict.items() if name in film_state_dict and param.shape == film_state_dict[name].shape} + + film_update_state_dict = {} + for name, param in film_state_dict.items(): + if "linear_scale.weight" in name: + film_update_state_dict[name] = torch.zeros_like(param) + elif "linear_scale.bias" in name: + film_update_state_dict[name] = torch.ones_like(param) + elif "linear_shift.weight" in name or "linear_shift.bias" in name: + film_update_state_dict[name] = torch.zeros_like(param) + # import pdb; pdb.set_trace() + new_film_state_dict = film_state_dict.copy() + new_film_state_dict.update(update_state_dict) + new_film_state_dict.update(film_update_state_dict) + + + new_film_state_dict = update_film_lstm_parameters(new_film_state_dict, pretrained_state_dict) + + film_model["model_state_dict"] = new_film_state_dict + + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in film_state_dict.items(): + if torch.all(torch.eq(param, new_film_state_dict[name])): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + + for name, param in pretrained_state_dict.items(): + if name not in changed_parameters: + unloaded_parameters.append(name) + + print(f"Unchanged parameters: {unchanged_parameters}") + print(f"Unloaded parameters: {unloaded_parameters}") + print(f"Changed parameters: {changed_parameters}") + film_model["epoch"] =1 + torch.save(film_model, output_model) + + + +unchanged_parameters = copy_model_parameters(pretrained_model, film_model) \ No newline at end of file From 27eea766c242253946179401c9dfc7ac2092313a Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sat, 24 Jun 2023 03:27:20 -0400 Subject: [PATCH 59/89] add new config --- ...2base_rnnt_film_k2_pruned_stage3_v6.0.yaml | 92 +++++++++++++++++++ ...g_pruned_filmed_transducer_v6.0_13langs.sh | 45 +++++++++ 2 files changed, 137 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh diff --git 
a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml new file mode 100644 index 00000000..4a72296d --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-fused-feature + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 4500 + hold_steps: 4000 + min_lr: 4e-5 + warmup_steps: 3000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + + diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh new file mode 100644 index 00000000..71d38168 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh @@ -0,0 +1,45 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml +nnet_s1_args="" + 
+nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v6.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0003.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v6.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0003.pth + +nnet_s3_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage5_v6.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From f809855cb14126a714fba03d7df15cef9f799c0e Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 28 Jun 2023 10:10:27 +0000 Subject: [PATCH 60/89] first config for film-lid model --- ...ucer_ecapadnn512x3_1layer_stage1_v1.0.yaml | 139 ++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0.yaml diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0.yaml new file mode 100644 index 00000000..da03a499 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0.yaml @@ -0,0 +1,139 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 8 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 15. + max_audio_length: 15. 
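+      # Joint FiLM-ASR + LID model: film_cond_type: lid_pred (below) appears to
+      # derive the FiLM condition from the language-ID branch's predictions
+      # through a Linear(langs_size, condition_size) embedding, as added to the
+      # decoder earlier in this series, instead of a one-hot language label.
+      # The two losses are weighted below with loss_weight_transducer: 0.1 and
+      # loss_weight_lid: 1.0.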
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 8 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + film_cond_type: lid_pred + reduction: mean + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 256 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm_residual + joiner: + hid_feats: 512 + languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 0.1 + loss_weight_lid: 1.0 + lid_length: 3.0 + + feat_fusion_method_transducer: film-weighted-avg + feat_fusion_method_lid: weighted-avg + feat_fusion_start_transducer: 2 + feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 180000 + hold_steps: 60000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + \ No newline at end of file From 8c73fa4b388b4a8597b6002c89f3fdd4a5dcd543 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 28 Jun 2023 10:28:45 +0000 Subject: [PATCH 61/89] add run script for film-asr-lid --- .../v1/run_025_train_film_asr_lid.sh | 141 ++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100755 egs/commonvoice/v1/run_025_train_film_asr_lid.sh diff --git a/egs/commonvoice/v1/run_025_train_film_asr_lid.sh b/egs/commonvoice/v1/run_025_train_film_asr_lid.sh new file mode 100755 index 00000000..8b213cfe --- /dev/null +++ b/egs/commonvoice/v1/run_025_train_film_asr_lid.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +# export CUDA_VISIBLE_DEVICES=0 + +#ml purge +#module load namd/2.14-cuda-smp +#module load cuda/11.6.0 +#ml +#nvidia-smi +#export CUDA_VISIBLE_DEVICES=0,1,2,3 +#export CONV_RSH=ssh +#export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH + + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_dir=data/${nnet_data}/ +val_dir=data/${dev_data}/ + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" + extra_args="--data.val.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ + train_wav2vec2rnn_film_transducer_languageid.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1238 \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer_languageid.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s2_dir $args \ + --in-model-transducer $nnet_transducer \ + --in-model-lid $nnet_lid \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.bpe-model $bpe_model \ + 
--data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s3_dir $args \ + --in-model-file $nnet_s2 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu +fi + From 982499e4de0b7de9cd8bc9c60f8acaebf77876b8 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Fri, 30 Jun 2023 06:49:38 -0400 Subject: [PATCH 62/89] update joint-training code --- ...ucer_ecapadnn512x3_1layer_stage1_v2.0.yaml | 139 ++++++++++++++++++ ...uned_filmed_transducer_lid_v2.0_13langs.sh | 43 ++++++ .../hf_wav2rnn_film_transducer_languageid.py | 35 ++++- .../narchs/rnn_film_transducer_decoder.py | 11 +- 4 files changed, 217 insertions(+), 11 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.0_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml new file mode 100644 index 00000000..0931c052 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml @@ -0,0 +1,139 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + film_cond_type: lid_pred_embed + reduction: mean + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + lid_length: 3.0 + + feat_fusion_method_transducer: film-fused-feature + feat_fusion_method_lid: weighted-avg + feat_fusion_start_transducer: 2 + feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 9000 + hold_steps: 6000 + min_lr: 4e-5 + warmup_steps: 3000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.0_13langs.sh new file mode 100644 index 00000000..6fe79ec1 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.0_13langs.sh @@ -0,0 +1,43 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_film_transducer_resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name 
+nnet_s1=$nnet_s1_dir/model_ep0007.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py index d967702a..8e29bc84 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py @@ -140,7 +140,8 @@ def _fuse_transducer_hid_feats(self, hid_feats, lang): # There is only one layer of features return hid_feats[0] - lang_condition = self.transducer.decoder.lang_embedding(lang) + if self.transducer.decoder.film_cond_type in ["one-hot", "lid_pred"]: + lang_condition = self.transducer.decoder.lang_embedding(lang) hid_feats = hid_feats[self.feat_fusion_start_transducer:] if self.feat_fusion_method_transducer == "film-weighted-avg": film_hid_feats = tuple(self.films[i](hid_feats[i], lang_condition) for i in range(len(self.films))) @@ -230,7 +231,7 @@ def forward( languageid=None, return_feat_layers=None, return_enc_layers=None, - return_classif_layers=None, + return_classif_layers=[0], return_logits=True, ): """Forward function. If returns the logits posteriors of the classes. @@ -261,7 +262,7 @@ def forward( feats_languageid = feats_languageid[:, :, lid_start: lid_start + lid_len] - lid_logits = self.languageid( + output = self.languageid( feats_languageid, None, languageid, @@ -269,17 +270,21 @@ def forward( return_classif_layers=return_classif_layers, return_logits=return_logits, ) + # output["h_classif"] = h_classif + # output["logits"] = y_pred - loss_lid = self.loss_lid(lid_logits, languageid) + #loss_lid = self.loss_lid(lid_logits, languageid) + loss_lid = self.loss_lid(output["logits"], languageid) - - feats_transducer = self._fuse_transducer_hid_feats(hid_feats, lid_logits) # (N, T, C) + # feats_transducer = self._fuse_transducer_hid_feats(hid_feats, lid_logits) # (N, T, C) + feats_transducer = self._fuse_transducer_hid_feats(hid_feats, output["h_classif"]) # (N, T, C) trans_output = self.transducer( feats_transducer, feat_lengths, text, - lid_logits + output["h_classif"] + # lid_logits ) if return_feat_layers: @@ -293,7 +298,8 @@ def forward( loss_transducer_simple=trans_output.loss_simple, loss_transducer_pruned=trans_output.loss_pruned, h_feats=trans_output.h_feats, - logits=lid_logits if return_logits else None) + logits=output["logits"] if return_logits else None) + # logits=lid_logits if return_logits else None) return output def infer(self, @@ -341,6 +347,12 @@ def infer(self, return text, lid + def unfreeze_film(self): + for name, param in self.named_parameters(): + if "film" in name: + logging.info(f"unfreezing {name}") + param.requires_grad = True + # def freeze_feat_fuser(self): # if self.feat_fuser is None: # return @@ -366,6 +378,9 @@ def set_train_mode(self, mode): self.unfreeze() elif mode == "frozen": self.freeze() + elif mode in ["ft-film", "ft-film-grad"]: + self.freeze() + self.unfreeze_film() elif mode in 
["ft-transducer", "ft-transducer-nograd"]: self.unfreeze() self.freeze_hf_feats() @@ -394,8 +409,10 @@ def _train(self, train_mode: str): if train_mode in ["full", "frozen"]: super()._train(train_mode) elif train_mode in [ + "ft-film", "ft-transducer", "hf-feats-frozen", + "ft-film-grad", "ft-transducer-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", @@ -410,8 +427,10 @@ def valid_train_modes(): return [ "full", "frozen", + "ft-film", "ft-embed-affine", "ft-transducer", + "ft-film-grad", "hf-feats-frozen", "ft-transducer-nograd", "hf-feats-frozen-nograd", diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 6a5c0845..f2cfad35 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -109,6 +109,9 @@ def __init__( self.lang_embedding = nn.Embedding(langs_size, condition_size) elif self.film_cond_type == "lid_pred": self.lang_embedding = nn.Linear(langs_size, condition_size) + elif self.film_cond_type == "lid_pred_embed": + # self.lang_embedding = nn.Linear(langs_size, condition_size) + pass if self.rnnt_loss == "k2_pruned": self.simple_am_proj = nn.Linear(in_feats, vocab_size) self.simple_lm_proj = nn.Linear(self.predictor.out_feats, @@ -309,7 +312,8 @@ def forward( self, x: torch.Tensor, x_lengths: torch.Tensor, y: k2.RaggedTensor, lang: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: # embed lang - lang_embedding = self.lang_embedding(lang) + if self.film_cond_type in ["one-hot", "lid_pred"]: + lang_embedding = self.lang_embedding(lang) # get y_lengths row_splits = y.shape.row_splits(1) y_lengths = row_splits[1:] - row_splits[:-1] @@ -342,7 +346,8 @@ def decode(self, max_sym_per_utt: int = 1000, ) -> List[int]: # embed lang - lang_embedding = self.lang_embedding(lang) + if self.film_cond_type in ["one-hot", "lid_pred"]: + lang_embedding = self.lang_embedding(lang) if method == "time_sync_beam_search": return self.decode_time_sync_beam_search(x, lang_embedding, @@ -833,7 +838,7 @@ def add_class_args(parser, parser.add_argument("--film-cond-type", default="one-hot", - choices=["one-hot", "lid_pred"], + choices=["one-hot", "lid_pred", "lid_pred_embed"], help=("type of the condition of FiLM layer")) parser.add_argument("--film-type", From 63a2bd994c961b6c438bda454cc66a8695d1b797 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 30 Jun 2023 09:38:03 -0400 Subject: [PATCH 63/89] added config 2.0 to vox v2 --- ...un_031_attack_type_verif_and_noveltydet.sh | 2 +- egs/voxceleb/v1.1/README.md | 18 +- ...rain_idrnd_resnet100_xvec_stage1_v3.0.yaml | 2 +- ...train_res2net50w26s8_xvec_stage1_v3.0.yaml | 2 +- ...train_res2net50w26s8_xvec_stage2_v3.0.yaml | 3 +- egs/voxceleb/v1.1/run_030_extract_xvectors.sh | 4 +- egs/voxceleb/v1.1/run_040_eval_be.sh | 2 +- egs/voxceleb/v1.2/run_001_prepare_data.sh | 34 +- egs/voxceleb/v2/README.md | 149 +----- ...lmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++ ...lmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml | 63 +++ ...lmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml | 73 +++ .../wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml | 45 ++ .../wavlmbaseplus_ecapatdnn512x3_v2.0.yaml | 44 ++ ...onfig_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | 54 ++ egs/voxceleb/v2/run_030_extract_xvectors.sh | 16 +- egs/voxceleb/v2/run_040_eval_be.sh | 294 +++++++++- hyperion/bin/adv_finetune_xvector_from_wav.py | 5 +- hyperion/bin/apply_mvn_select_frames.py | 9 +- hyperion/bin/audio_to_duration.py | 5 +- 
hyperion/bin/compute_energy_vad.py | 9 +- hyperion/bin/compute_mfcc_feats.py | 9 +- hyperion/bin/copy_feats.py | 1 - hyperion/bin/decode_wav2transducer.py | 12 +- hyperion/bin/decode_wav2vec2rnn_transducer.py | 5 +- ...l_xvec_cosine_scoring_from_adv_test_wav.py | 9 +- ...osine_scoring_from_adv_test_wav_wavegan.py | 10 +- ...l_xvec_cosine_scoring_from_art_test_wav.py | 18 +- .../eval_xvec_cosine_scoring_from_test_wav.py | 9 +- ...sine_scoring_from_transfer_adv_test_wav.py | 5 +- ...sine_scoring_from_transfer_art_test_wav.py | 18 +- hyperion/bin/eval_xvec_logits_from_wav.py | 9 +- hyperion/bin/extract_wav2vec2xvectors.py | 38 +- hyperion/bin/extract_xvectors_from_feats.py | 5 +- hyperion/bin/extract_xvectors_from_wav.py | 9 +- .../extract_xvectors_slidwin_from_feats.py | 11 +- .../bin/extract_xvectors_slidwin_from_wav.py | 11 +- hyperion/bin/finetune_wav2vec2transducer.py | 5 +- hyperion/bin/finetune_wav2vec2xvector.py | 18 +- .../bin/finetune_xvector_dfr_from_feats.py | 5 +- hyperion/bin/finetune_xvector_dfr_from_wav.py | 5 +- hyperion/bin/finetune_xvector_from_feats.py | 5 +- hyperion/bin/finetune_xvector_from_wav.py | 9 +- .../generate_adv_attacks_xvector_classif.py | 11 +- .../bin/generate_adv_attacks_xvector_verif.py | 11 +- hyperion/bin/hyperion_dataset.py | 23 +- hyperion/bin/hyperion_tables.py | 22 +- hyperion/bin/make_babble_noise_audio_files.py | 7 +- hyperion/bin/pack_wav_rirs.py | 9 +- hyperion/bin/plot_embedding_tsne.py | 5 +- hyperion/bin/plot_embedding_tsne_per_class.py | 5 +- hyperion/bin/prepare_data.py | 5 +- hyperion/bin/preprocess_audio_files.py | 7 +- .../split_dataset_into_trials_and_cohort.py | 68 +++ hyperion/bin/train_wav2rnn_transducer.py | 5 +- hyperion/bin/train_wav2vec2rnn_transducer.py | 5 +- hyperion/bin/train_wav2vec2transducer.py | 5 +- hyperion/bin/train_wav2vec2xvector.py | 5 +- hyperion/bin/train_xvector_from_feats.py | 5 +- hyperion/bin/train_xvector_from_wav.py | 9 +- hyperion/data_prep/__init__.py | 1 + hyperion/data_prep/voxceleb1.py | 2 +- hyperion/data_prep/voxceleb2.py | 2 +- hyperion/data_prep/voxsrc22.py | 21 +- .../data/class_weighted_seg_chunk_sampler.py | 2 +- .../models/wav2xvectors/hf_wav2xvector.py | 100 ++-- hyperion/torch/torch_model.py | 34 +- hyperion/torch/tpm/hf/hf_hubert.py | 32 ++ hyperion/torch/tpm/hf/hf_wav2vec2.py | 6 + hyperion/torch/tpm/hf/hf_wav2vec_base.py | 84 ++- hyperion/torch/tpm/hf/hf_wavlm.py | 32 ++ hyperion/torch/trainers/torch_trainer.py | 12 +- hyperion/utils/dataset.py | 500 ++++++++++++++---- hyperion/utils/segment_set.py | 10 +- 74 files changed, 1535 insertions(+), 628 deletions(-) create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh create mode 100755 hyperion/bin/split_dataset_into_trials_and_cohort.py diff --git a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh index 4ce703ba..3b93fabd 100755 --- a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh +++ b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh @@ -293,7 +293,7 @@ if [ $stage -le 
13 ]; then awk '!/benign/' $list_someknown_dir/train/utt2spk > $list_someknown_dir/train_nobenign/utt2spk steps_backend/train_be_v1.sh --cmd "$train_cmd" \ --plda-type splda \ - --y-dim 6 \ + --y-dim 5 \ $sign_dir/train/xvector.scp \ $list_someknown_dir/train_nobenign \ $be_dir diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md index 73b9bb4e..3b9eeaa9 100644 --- a/egs/voxceleb/v1.1/README.md +++ b/egs/voxceleb/v1.1/README.md @@ -111,8 +111,11 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | | | | Cosine + AS-Norm | 0.72 | 0.046 | 0.070 | | | | | Cosine + QMF | 0.67 | 0.039 | 0.074 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.040 | 0.065 | -| | | | Cosine + AS-Norm | 0.52 | 0.33 | 0.045 | +| | | | Cosine + AS-Norm | 0.52 | 0.033 | 0.045 | | | | | Cosine + QMF | 0.45 | 0.027 | 0.043 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.60 | 0.043 | 0.071 | +| | | | Cosine + AS-Norm | 0.53 | 0.034 | 0.063 | +| | | | Cosine + QMF | 0.49 | 0.033 | 0.054 | ### VoxCeleb 1 Entire-Clean trial list @@ -143,8 +146,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.71 | 0.044 | 0.076| | | | | Cosine + AS-Norm | 0.66 | 0.040 | 0.069 | | | | | Cosine + QMF | 0.63 | 0.037 | 0.067 | - - +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.75 | 0.047 | 0.077 | +| | | | Cosine + AS-Norm | 0.70 | 0.042 | 0.072 | +| | | | Cosine + QMF | 0.68 | 0.040 | 0.069 | ### VoxCeleb 1 Hard-Clean trial list @@ -174,7 +178,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.30 | 0.076 | 0.125 | | | | | Cosine + AS-Norm | 1.15 | 0.066 | 0.109 | | | | | Cosine + QMF | 1.11 | 0.065 | 0.103 | - +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.41 | 0.081 | 0.132 | +| | | | Cosine + AS-Norm | 1.28 | 0.071 | 0.116 | +| | | | Cosine + QMF | 1.21 | 0.069 | 0.113 | ### VoxSRC2022 dev @@ -205,6 +211,10 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.92 | 0.124 | 0.208 | | | | | Cosine + AS-Norm | 1.71 | 0.109 | 0.212 | | | | | Cosine + QMF | 1.62 | 0.103 | 0.192 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.96 | 0.124 | 0.211 | +| | | | Cosine + AS-Norm | 1.79 | 0.118 | 0239 | +| | | | Cosine + QMF | 1.68 | 0.114 | 0.216 | + ## Results before 2023 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml index 9e302200..1016087d 100644 --- a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml @@ -68,5 +68,5 @@ trainer: grad_clip: 250 use_amp: true log_interval: 1000 - epochs: 35 + epochs: 30 eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml index 40fb362e..e98d6c13 100644 --- a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml @@ -68,5 +68,5 @@ trainer: grad_clip: 250 use_amp: true log_interval: 1000 - epochs: 35 + epochs: 30 eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml index 469e166b..5c9af011 100644 --- a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml @@ -44,7 +44,8 @@ model: margin_warmup_epochs: 0 intertop_margin: 0.1 override_dropouts: true - dropout_rate: 0.0 + # dropout_rate: 0.0 + dropout_rate: 0.2 trainer: optim: opt_type: sgd diff --git a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh index 8c0949f4..f933a7b2 100755 --- a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh +++ b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh @@ -8,7 +8,7 @@ set -e stage=1 -nnet_stage=1 +nnet_stage=2 config_file=default_config.sh use_gpu=false xvec_chunk_length=12800 @@ -85,4 +85,4 @@ if [ $stage -le 2 ]; then done fi -exit + diff --git a/egs/voxceleb/v1.1/run_040_eval_be.sh b/egs/voxceleb/v1.1/run_040_eval_be.sh index 0780584c..6bdbdf92 100755 --- a/egs/voxceleb/v1.1/run_040_eval_be.sh +++ b/egs/voxceleb/v1.1/run_040_eval_be.sh @@ -8,7 +8,7 @@ set -e stage=1 -nnet_stage=1 +nnet_stage=2 config_file=default_config.sh diff --git a/egs/voxceleb/v1.2/run_001_prepare_data.sh b/egs/voxceleb/v1.2/run_001_prepare_data.sh index f956bc8c..c151e270 100755 --- a/egs/voxceleb/v1.2/run_001_prepare_data.sh +++ b/egs/voxceleb/v1.2/run_001_prepare_data.sh @@ -16,26 +16,31 @@ config_file=default_config.sh if [ $stage -le 1 ];then # Prepare the VoxCeleb2 dataset for training. 
- hyp_utils/conda_env.sh \ - prepare_data.py voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ - --cat-videos --use-kaldi-ids \ - --output-dir data/voxceleb2cat_train + prepare_data.py voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ + --cat-videos --use-kaldi-ids \ + --output-dir data/voxceleb2cat_train fi if [ $stage -le 2 ];then # prepare voxceleb1 for test - #hyp_utils/conda_env.sh \ - prepare_data.py voxceleb1 --task test --corpus-dir $voxceleb1_root \ - --use-kaldi-ids \ - --output-dir data/voxceleb1_test + # hyp_utils/conda_env.sh + prepare_data.py voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test #local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data fi -exit + if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then - local/prepare_voxsrc22_dev.py \ - --vox1-corpus-dir $voxceleb1_root \ - --voxsrc22-corpus-dir $voxsrc22_root \ - --output-dir data/voxsrc22_dev + prepare_data.py voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_dev + # local/prepare_voxsrc22_dev.py \ + # --vox1-corpus-dir $voxceleb1_root \ + # --voxsrc22-corpus-dir $voxsrc22_root \ + # --output-dir data/voxsrc22_dev + prepare_data.py voxsrc22 --subset test --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_test fi # if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then @@ -46,5 +51,6 @@ fi if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then # # split vox2 into 2 parts, for cohort and qmf training - local/make_vox2_trials.py --data-dir data/voxceleb2cat_train + split_dataset_into_trials_and_cohort.py --data-dir data/voxceleb2cat_train + #local/make_vox2_trials.py --data-dir data/voxceleb2cat_train fi diff --git a/egs/voxceleb/v2/README.md b/egs/voxceleb/v2/README.md index 5b5b93e5..c64a4b41 100644 --- a/egs/voxceleb/v2/README.md +++ b/egs/voxceleb/v2/README.md @@ -1,24 +1,9 @@ -# VoxCeleb V1.1 +# VoxCeleb V2 -Recipe for the VoxCeleb Speaker Verification Task +Recipe for the VoxCeleb Speaker Verification Task using Wav2Vec2, WavLM or Hubert models from HuggingFace as feature extractors ## Differences w.r.t VoxCeleb V1 recipe -In recipe version V1: - - We compute speech augmentations and acoustic features offline and dump them to disk. - - Augmentation is performed using Kaldi scripts and wav-reverbate tool - - Babble noise is created on-the-fly when computing features by mixing 3-7 single speaker files. - -In this recipe: - - We compute speech augmentations and acoustic features are computed always on-the-fly, - we don't dump any features to disk. - - Augmentation is performed using Hyperin SpeechAugment class. - - The behavior of this class is controlled - by the the configuration file `conf/reverb_noise_aug.yml`, - which mimics the proportions of noise and RIR types, and SNRs used in the V1 or the recipe. - - Babble noise is created offline by mixing 3-10 single speaker files. 
- - ## Citing ## Training Data @@ -41,15 +26,14 @@ In this recipe: ## Usage - Run the run_0*.sh scripts in sequence - - By default it will use Light ResNet (16 base channels) - - For better performance use full ResNet (64 base channels) using `config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` file as + - By default it will use + - For better performance use ```bash run_011_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh run_030_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh ``` - - To train with mixed precision training use config file `config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` ## Recipe Steps: @@ -73,7 +57,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr - Creates training and validation lists for x-vector training - `run_011_train_xvector.sh` - - Trains the x-vector network + - Trains the x-vector model on frozen wav2vec features + - Finetunes wav2vec+x-vector model + - Large margin finetuning of wav2vec+x-vector model - `run_030_extract_xvectors.sh` - Extracts x-vectors for VoxCeleb2 or VoxCeleb2+augmentation for PLDA training @@ -89,117 +75,30 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.00 | 0.129 | 0.216 | -| | | | Cosine | 2.04 | 0.138 | 0.210 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.35 | 0.091 | 0.159 | -| | | | Cosine | 1.22 | 0.082 | 0.129 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.074 | 0.124 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.090 | 0.160 | -| | | | Cosine | 1.44 | 0.100 | 0.173 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.091 | 0.143 | -| | | | Cosine | 1.17 | 0.081 | 0.110 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 1.37 | 0.104 | 0.179 | -| | | | Cosine | 1.31 | 0.080 | 0.139 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 1.29 | 0.088 | 0.129 | -| | | | Cosine | 1.23 | 0.083 | 0.136 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.095 | 0.156 | -| | | | Cosine | 1.29 | 0.089 | 0.146 | -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.084 | 0.136 | -| | | | Cosine | 1.18 | 0.078 | 0.115 | -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.084 | 0.145 | -| | | | Cosine | 1.12 | 0.073 | 0.131 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 1.53 | 0.104 | 0.189 | -| | | | Cosine | 1.31 | 0.084 | 0.132 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 0.98 | 0.066 | 0.116 | -| | | | Cosine | 1.12 | 0.071 | 0.103 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.05 | 0.077 | 0.123 | -| | | | Cosine | 0.96 | 0.065 | 0.110 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.04 | 0.071 | 0.118 | -| | | | Cosine | 0.93 | 0.067 | 0.108 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 0.90 | 0.067 | 0.118 | -| | | | Cosine | 0.85 | 0.060 | 0.094 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.44 | 0.102 | 0.169 | -| | | | Cosine | 1.29 | 0.084 | 0.140 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.071 | 0.116 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.074 | 0.116 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.09 | 0.081 | 0.150 | - +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.060 | 0.116 | +| | | | Cosine + AS-Norm | 0.81 | 0.058 | 0.108 | +| | | | Cosine + QMF | 0.75 | 0.054 | 0.086 | ### VoxCeleb 1 Entire-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.86 | 0.124 | 0.210 | -| | | | Cosine | 1.93 | 0.122 | 0.201 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.43 | 0.091 | 0.159 | -| | | | Cosine | 1.24 | 0.080 | 0.136 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.077 | 0.132 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.084 | 0.150 | -| | | | Cosine | 1.30 | 0.082 | 0.150 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.083 | 0.146 | -| | | | Cosine | 1.09 | 0.071 | 0.124 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 1.45 | 0.097 | 0.165 | -| | | | Cosine | 1.15 | 0.076 | 0.132 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 1.47 | 0.094 | 0.165 | -| | | | Cosine | 1.27 | 0.082 | 0.148 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.31 | 0.086 | 0.149 | -| | | | Cosine | 1.22 | 0.079 | 0.134 | -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.082 | 0.145 | -| | | | Cosine | 1.16 | 0.074 | 0.130 | -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.077 | 0.136 | -| | | | Cosine | 1.11 | 0.071 | 0.125 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 1.46 | 0.097 | 0.173 | -| | | | Cosine | 1.24 | 0.080 | 0.140 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.071 | 0.127 | -| | | | Cosine | 1.05 | 0.067 | 0.117 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.078 | 0.134 | -| | | | Cosine | 1.05 | 0.069 | 0.121 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.18 | 0.075 | 0.131 | -| | | | Cosine | 0.98 | 0.063 | 0.110 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | Res2Net50 width=26x8 | + SWA | PLDA | 1.17 | 0.072 | 0.123 | -| | | | Cosine | 0.94 | 0.061 | 0.107 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.56 | 0.095 | 0.166 | -| | | | Cosine | 1.27 | 0.079 | 0.142 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.19 | 0.077 | 0.137 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.073 | 0.129 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | TSE-Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.068 | 0.120 | - +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.81 | 0.051 | 0.087 | +| | | | Cosine + AS-Norm | 0.78 | 0.047 | 0.083 | +| | | | Cosine + QMF | 0.75 | 0.046 | 0.076 | ### VoxCeleb 1 Hard-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 3.29 | 0.195 | 0.318 | -| | | | Cosine | 3.27 | 0.188 | 0.303 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.66 | 0.160 | 0.258 | -| | | | Cosine | 2.32 | 0.139 | 0.232 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 2.19 | 0.133 | 0.215 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 2.33 | 0.139 | 0.227 | -| | | | Cosine | 2.33 | 0.142 | 0.235 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.46 | 0.142 | 0.237 | -| | | | Cosine | 2.14 | 0.126 | 0.203 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 2.57 | 0.153 | 0.255 | -| | | | Cosine | 2.11 | 0.127 | 0.205 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 2.64 | 0.157 | 0.244 | -| | | | Cosine | 2.33 | 0.141 | 0.232 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 2.42 | 0.144 | 0.245 | -| | | | Cosine | 2.26 | 0.133 | 0.224 -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.39 | 0.141 | 0.235 | -| | | | Cosine | 2.17 | 0.128 | 0.215 -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.28 | 0.131 | 0.225 | -| | | | Cosine | 2.11 | 0.124 | 0.204 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 2.77 | 0.172 | 0.271 | -| | | | Cosine | 2.45 | 0.141 | 0.225 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 2.07 | 0.124 | 0.201 | -| | | | Cosine | 1.95 | 0.113 | 0.181 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 2.34 | 0.136 | 0.230 | -| | | | Cosine | 1.99 | 0.119 | 0.196 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 2.18 | 0.127 | 0.211 | -| | | | Cosine | 1.89 | 0.112 | 0.184 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 2.14 | 0.125 | 0.209 | -| | | | Cosine | 1.84 | 0.110 | 0.186 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 2.78 | 0.156 | 0.252 | -| | | | Cosine | 2.26 | 0.134 | 0.214 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 2.24 | 0.134 | 0.221 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.20 | 0.132 | 0.219 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.02 | 0.123 | 0.203 | +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.73 | 0.113 | 0.182 | +| | | | Cosine + AS-Norm | 1.63 | 0.100 | 0.160 | +| | | | Cosine + QMF | 1.56 | 0.096 | 0.155 | + +### VoxSRC2022 dev + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.60 | 0.163 | 0.257 | +| | | | Cosine + AS-Norm | 2.43 | 0.150 | 0.244 | +| | | | Cosine + QMF | 2.31 | 0.143 | 0.232 | diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..bd3e7f86 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml new file mode 
100644 index 00000000..eed0ad1f --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..d66d6877 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..d7e3388f --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 9 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + 
resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..b2430d97 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..373535c2 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/run_030_extract_xvectors.sh b/egs/voxceleb/v2/run_030_extract_xvectors.sh index 67122f85..16f29841 100755 --- a/egs/voxceleb/v2/run_030_extract_xvectors.sh +++ b/egs/voxceleb/v2/run_030_extract_xvectors.sh @@ -7,10 +7,10 @@ . 
./path.sh set -e -stage=2 +stage=1 +nnet_stage=3 config_file=default_config.sh use_gpu=false -nnet_stage=3 hf_chunk_length=120 #seconds xvec_chunk_length=120 #seconds . parse_options.sh || exit 1; @@ -36,20 +36,20 @@ fi xvector_dir=exp/xvectors/$nnet_name -if [ $stage -le 1 ]; then +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then # Extract xvectors for training LDA/PLDA for name in voxceleb2cat_train do if [ $plda_num_augs -eq 0 ]; then steps_xvec/extract_wav2vec2xvectors.sh \ --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ - --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ + --random-utt-length true --min-utt-length 2 --max-utt-length 30 \ $nnet data/${name} \ $xvector_dir/${name} else steps_xvec/extract_wav2vec2xvectors.sh \ --cmd "$xvec_cmd" --nj 300 ${xvec_args} \ - --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ + --random-utt-length true --min-utt-length 2 --max-utt-length 30 \ --aug-config $plda_aug_config --num-augs $plda_num_augs \ $nnet data/${name} \ $xvector_dir/${name}_augx${plda_num_augs} \ @@ -60,7 +60,10 @@ fi if [ $stage -le 2 ]; then # Extracts x-vectors for evaluation - for name in voxceleb1_test + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data do num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') nj=$(($num_spk < 100 ? $num_spk:100)) @@ -71,4 +74,3 @@ if [ $stage -le 2 ]; then done fi -exit diff --git a/egs/voxceleb/v2/run_040_eval_be.sh b/egs/voxceleb/v2/run_040_eval_be.sh index ac561344..0982abeb 100755 --- a/egs/voxceleb/v2/run_040_eval_be.sh +++ b/egs/voxceleb/v2/run_040_eval_be.sh @@ -7,10 +7,10 @@ . ./path.sh set -e -# By default we evaluate the nnet after finetuning stage 3 and only with cosine scoring -stage=3 -config_file=default_config.sh +stage=1 nnet_stage=3 +config_file=default_config.sh + . parse_options.sh || exit 1; . 
$config_file @@ -25,6 +25,15 @@ elif [ $nnet_stage -eq 2 ];then elif [ $nnet_stage -eq 3 ];then nnet=$nnet_s3 nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name fi plda_label=${plda_type}y${plda_y_dim}_v1 @@ -35,8 +44,12 @@ be_dir=exp/be/$nnet_name/$be_name score_dir=exp/scores/$nnet_name/${be_name} score_plda_dir=$score_dir/plda score_cosine_dir=exp/scores/$nnet_name/cosine +score_cosine_snorm_dir=exp/scores/$nnet_name/cosine_snorm +score_cosine_qmf_dir=exp/scores/$nnet_name/cosine_qmf -if [ $stage -le 1 ]; then + +if [ "$do_plda" == "true" ];then + if [ $stage -le 1 ]; then echo "Train PLDA on Voxceleb2" steps_be/train_be_v1.sh \ --cmd "$train_cmd" \ @@ -45,14 +58,12 @@ if [ $stage -le 1 ]; then --y_dim $plda_y_dim --z_dim $plda_z_dim \ $xvector_dir/$plda_data/xvector.scp \ data/$plda_data \ - $be_dir & - - wait -fi - - -if [ $stage -le 2 ];then - + $be_dir + + fi + + + if [ $stage -le 2 ];then echo "Eval Voxceleb 1 with LDA+CentWhiten+LNorm+PLDA" steps_be/eval_be_v1.sh \ --cmd "$train_cmd" --plda_type $plda_type \ @@ -62,7 +73,7 @@ if [ $stage -le 2 ];then $be_dir/lda_lnorm.h5 \ $be_dir/plda.h5 \ $score_plda_dir/voxceleb1_scores - + $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir @@ -72,32 +83,267 @@ if [ $stage -le 2 ];then cat $f echo "" done - + fi fi -score_plda_dir=$score_cosine_dir + if [ $stage -le 3 ];then - echo "Eval Voxceleb 1 with Cosine scoring" + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos.sh \ + --cmd "$train_cmd" \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $score_cosine_dir/voxceleb1_scores + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_dir + + for f in $(ls $score_cosine_dir/*_results); + do + echo $f + cat $f + echo "" + done + +fi + +if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + + echo "Eval voxsrc2 with Cosine scoring" steps_be/eval_be_cos.sh --cmd "$train_cmd" \ - data/voxceleb1_test/trials \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $score_plda_dir/voxceleb1_scores + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + $score_cosine_dir/voxsrc22_dev_scores & - $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir + # steps_be/eval_be_cos.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # $score_cosine_dir/voxsrc22_test_scores - for f in $(ls $score_plda_dir/*_results); + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_dir + + for f in $(ls $score_cosine_dir/voxsrc22_dev_results); + do + echo $f + cat $f + echo "" + done + +fi + + +if [ "$do_snorm" == "true" ];then + if [ $stage -le 5 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" + steps_be/eval_be_cos_snorm.sh \ + --cmd "$train_cmd --mem 22G" --coh-nbest 1000 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + 
$xvector_dir/voxceleb1_test/xvector.scp \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_snorm_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_snorm_dir + + for f in $(ls $score_cosine_snorm_dir/*_results); + do + echo $f + cat $f + echo "" + done + fi + + if [ $stage -le 6 ];then + echo "Eval voxsrc2 with Cosine scoring" + steps_be/eval_be_cos_snorm.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_snorm_dir/voxsrc22_dev_scores & + + # steps_be/eval_be_cos_snorm.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # data/voxceleb2cat_train/utt2spk \ + # $xvector_dir/voxceleb2cat_train/xvector.scp \ + # $score_cosine_snorm_dir/voxsrc22_test_scores + + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_snorm_dir + + for f in $(ls $score_cosine_snorm_dir/voxsrc22_dev_results); + do + echo $f + cat $f + echo "" + done + fi +fi + + +if [ "$do_qmf" == "true" ];then + if [ $stage -le 7 ];then + awk '{ print $1, $2*100}' \ + $xvector_dir/voxceleb2cat_train/utt2speech_dur \ + > $xvector_dir/voxceleb2cat_train/utt2num_frames + + echo "Train QMF in Vox2" + steps_be/train_be_cos_qmf.sh \ + --cmd "$train_cmd" --coh-nbest 1000 \ + data/voxceleb2cat_train/trials \ + data/voxceleb2cat_train/utt2model \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $xvector_dir/voxceleb2cat_train/utt2num_frames \ + data/voxceleb2cat_train/snorm_utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/voxceleb2_qmf_scores + + fi + + if [ $stage -le 8 ];then + awk '{ print $1, $2*100}' \ + $xvector_dir/voxceleb1_test/utt2speech_dur \ + > $xvector_dir/voxceleb1_test/utt2num_frames + + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos_qmf.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $xvector_dir/voxceleb1_test/utt2num_frames \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/qmf.h5 \ + $score_cosine_qmf_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_snorm.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _snorm + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_qmf.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _qmf + + for f in $(ls $score_cosine_qmf_dir/voxceleb1{,_snorm,_qmf}_[oeh]_clean_results); + do + echo $f + cat $f + echo "" + done + + fi + + if [ $stage -le 9 ];then + awk '{ print $1, $2*100}' \ + $xvector_dir/voxsrc22_dev/utt2speech_dur \ + > $xvector_dir/voxsrc22_dev/utt2num_frames + + echo "Eval voxsrc2 with Cosine scoring" + steps_be/eval_be_cos_qmf.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxsrc22_dev/trials \ + 
data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + $xvector_dir/voxsrc22_dev/utt2num_frames \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/qmf.h5 \ + $score_cosine_qmf_dir/voxsrc22_dev_scores & + + # awk '{ print $1, $2*100}' \ + # $xvector_dir/voxsrc22_test/utt2speech_dur \ + # > $xvector_dir/voxsrc22_test/utt2num_frames + # steps_be/eval_be_cos_qmf.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # $xvector_dir/voxsrc22_test/utt2num_frames \ + # data/voxceleb2cat_train/utt2spk \ + # $xvector_dir/voxceleb2cat_train/xvector.scp \ + # $score_cosine_qmf_dir/qmf.h5 \ + # $score_cosine_qmf_dir/voxsrc22_test_scores + + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_snorm.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _snorm + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_qmf.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _qmf + + for f in $(ls $score_cosine_qmf_dir/voxsrc22_dev{,_snorm,_qmf}_results); do echo $f cat $f echo "" done + fi + +fi + +if [ "$do_pca" != "true" ];then + exit 0 +fi + + +be_name=pca_r${pca_var_r} + +xvector_dir=exp/xvectors/$nnet_name +be_dir=exp/be/$nnet_name/$be_name +score_dir=exp/scores/$nnet_name/${be_name} +score_cosine_dir=exp/scores/$nnet_name/$be_name/cosine +score_cosine_snorm_dir=exp/scores/$nnet_name/$be_name/cosine_snorm +score_cosine_qmf_dir=exp/scores/$nnet_name/$be_name/cosine_qmf + +be_dir=exp/be/$nnet_name/ +score_be_dir=$score_dir/pca_r${pca_var_r} + +if [ $stage -le 10 ]; then + echo "Train projection on Voxceleb2" + $train_cmd $be_dir/log/train_be.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_proj_v1.py \ + --v-file scp:$xvector_dir/$plda_data/xvector.scp \ + --train-list data/$plda_data/utt2spk \ + --output-path $be_dir \ + --pca.pca-var-r $pca_var_r fi -exit +if [ $stage -le 11 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos.sh \ + --cmd "$train_cmd" \ + --preproc-file $be_dir/preproc.h5 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $score_cosine_dir/voxceleb1_scores + $train_cmd --mem 10G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_dir + + for f in $(ls $score_cosine_dir/*_results); + do + echo $f + cat $f + echo "" + done + +fi diff --git a/hyperion/bin/adv_finetune_xvector_from_wav.py b/hyperion/bin/adv_finetune_xvector_from_wav.py index 7be882e0..f45b84a0 100755 --- a/hyperion/bin/adv_finetune_xvector_from_wav.py +++ b/hyperion/bin/adv_finetune_xvector_from_wav.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -32,6 +29,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorAdvTrainerFromWav as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) xvec_dict = { 
"resnet": RXVec, diff --git a/hyperion/bin/apply_mvn_select_frames.py b/hyperion/bin/apply_mvn_select_frames.py index f5a3ce15..bdf53786 100755 --- a/hyperion/bin/apply_mvn_select_frames.py +++ b/hyperion/bin/apply_mvn_select_frames.py @@ -10,13 +10,6 @@ import time import numpy as np -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import RandomAccessDataReaderFactory as RDRF @@ -25,6 +18,8 @@ from hyperion.np.feats import MeanVarianceNorm as MVN from hyperion.utils import Utt2Info from hyperion.utils.kaldi_matrix import compression_methods +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def process_feats( diff --git a/hyperion/bin/audio_to_duration.py b/hyperion/bin/audio_to_duration.py index 38e8dff2..ac8852a4 100755 --- a/hyperion/bin/audio_to_duration.py +++ b/hyperion/bin/audio_to_duration.py @@ -9,12 +9,11 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - from hyperion.hyp_defs import config_logger from hyperion.io import SequentialAudioReader as AR from hyperion.utils import SegmentSet +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def audio_to_duration(audio_file, output_file, **kwargs): diff --git a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py index 058f982a..e9773fff 100755 --- a/hyperion/bin/compute_energy_vad.py +++ b/hyperion/bin/compute_energy_vad.py @@ -9,17 +9,12 @@ import time import numpy as np -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.np.feats import EnergyVAD +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def compute_vad(input_path, output_path, write_num_frames, **kwargs): diff --git a/hyperion/bin/compute_mfcc_feats.py b/hyperion/bin/compute_mfcc_feats.py index ca6e26f7..442e4141 100755 --- a/hyperion/bin/compute_mfcc_feats.py +++ b/hyperion/bin/compute_mfcc_feats.py @@ -9,19 +9,14 @@ import time import numpy as np -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import compression_methods from hyperion.np.feats import MFCC +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def compute_mfcc_feats( diff --git a/hyperion/bin/copy_feats.py b/hyperion/bin/copy_feats.py index 0385cc55..4549caec 100755 --- a/hyperion/bin/copy_feats.py +++ b/hyperion/bin/copy_feats.py @@ -12,7 +12,6 @@ import time import numpy as np - from hyperion.hyp_defs import config_logger from hyperion.io import CopyFeats as CF diff --git a/hyperion/bin/decode_wav2transducer.py b/hyperion/bin/decode_wav2transducer.py index c7de38f1..972b247c 100755 --- a/hyperion/bin/decode_wav2transducer.py +++ b/hyperion/bin/decode_wav2transducer.py @@ -13,13 +13,6 @@ import numpy as np import pandas as pd import sentencepiece as spm -from jsonargparse import ( 
- ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -27,10 +20,13 @@ from hyperion.io import SequentialAudioReader as AR from hyperion.np.augment import SpeechAugment from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search +from hyperion.torch.models.wav2transducer.beam_search import (beam_search, + greedy_search) from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/decode_wav2vec2rnn_transducer.py b/hyperion/bin/decode_wav2vec2rnn_transducer.py index 8ef8d414..4fdc3140 100755 --- a/hyperion/bin/decode_wav2vec2rnn_transducer.py +++ b/hyperion/bin/decode_wav2vec2rnn_transducer.py @@ -13,9 +13,6 @@ import numpy as np import pandas as pd import sentencepiece as spm -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -29,6 +26,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py index 50fd5088..7c9d4104 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py @@ -10,13 +10,6 @@ import numpy as np import pandas as pd -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -33,6 +26,8 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) class MyModel(nn.Module): diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py index 5697404d..fb0d402c 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py @@ -7,19 +7,11 @@ import os import sys import time - # [Added Sonal May21] from pathlib import Path import numpy as np import pandas as pd -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -37,6 +29,8 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) torch.backends.cudnn.enabled = False diff --git 
a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py index 0ca1f740..2d5baa17 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py @@ -11,17 +11,10 @@ import numpy as np import pandas as pd -from art.classifiers import PyTorchClassifier -from art.estimators.classification import PyTorchClassifier -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn +from art.classifiers import PyTorchClassifier +from art.estimators.classification import PyTorchClassifier from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -29,15 +22,16 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import ( - ARTAttackFactory as AttackFactory, -) +from hyperion.torch.adv_attacks.art_attack_factory import \ + ARTAttackFactory as AttackFactory from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py index 49a762af..76af5d75 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py @@ -10,13 +10,6 @@ import time import numpy as np -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -31,6 +24,8 @@ from hyperion.torch.utils.misc import l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py index b2c111ca..f33402a1 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py @@ -10,9 +10,6 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -29,6 +26,8 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) class MyModel(nn.Module): diff --git 
a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py index 8b6c8dae..f94dc497 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py @@ -11,17 +11,10 @@ import numpy as np import pandas as pd -from art.classifiers import PyTorchClassifier -from art.estimators.classification import PyTorchClassifier -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn +from art.classifiers import PyTorchClassifier +from art.estimators.classification import PyTorchClassifier from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -29,15 +22,16 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import ( - ARTAttackFactory as AttackFactory, -) +from hyperion.torch.adv_attacks.art_attack_factory import \ + ARTAttackFactory as AttackFactory from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) class MyModel(nn.Module): diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py index 98ba76b5..9efbd6dd 100755 --- a/hyperion/bin/eval_xvec_logits_from_wav.py +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -11,13 +11,6 @@ import numpy as np import pandas as pd -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF @@ -28,6 +21,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index c4c4676f..6f7d269e 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -11,15 +11,8 @@ import numpy as np import pandas as pd -import torchaudio.transforms as tat -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch +import torchaudio.transforms as tat from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -28,26 +21,8 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info - -resamplers = {} - - -def get_resampler(source_fs, target_fs): - if source_fs in resamplers: - return resamplers[source_fs] - - resampler = tat.Resample( - int(source_fs), - int(target_fs), - 
lowpass_filter_width=64, - rolloff=0.9475937167399596, - resampling_method="kaiser_window", - beta=14.769656459379492, - ) - resampler_f = lambda x: resampler(torch.from_numpy(x)).numpy() - resamplers[source_fs] = resampler_f - return resampler_f - +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) resamplers = {} @@ -168,7 +143,10 @@ def extract_xvectors( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) + v_reader = VRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) while not reader.eof(): t1 = time.time() @@ -240,7 +218,7 @@ def extract_xvectors( writer.write([key], [y]) if write_speech_dur is not None: keys.append(key) - info.append(str(x.shape[1] * fs)) + info.append(str(x.shape[1] / fs)) t8 = time.time() read_time = t2 - t1 diff --git a/hyperion/bin/extract_xvectors_from_feats.py b/hyperion/bin/extract_xvectors_from_feats.py index 926e0bcc..13ad4277 100755 --- a/hyperion/bin/extract_xvectors_from_feats.py +++ b/hyperion/bin/extract_xvectors_from_feats.py @@ -10,9 +10,6 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF @@ -22,6 +19,8 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index f49a5fb0..577bbae7 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -11,13 +11,6 @@ import numpy as np import pandas as pd -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF @@ -28,6 +21,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py index eaf0a5cc..a54c4d64 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_feats.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py @@ -10,15 +10,8 @@ import time import numpy as np -import yaml -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch +import yaml from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF @@ -27,6 +20,8 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index 9dc0aa2c..8939ba91 100755 --- 
a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -11,15 +11,8 @@ import numpy as np import pandas as pd -import yaml -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch +import yaml from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -29,6 +22,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/finetune_wav2vec2transducer.py b/hyperion/bin/finetune_wav2vec2transducer.py index df267e72..6f17f800 100755 --- a/hyperion/bin/finetune_wav2vec2transducer.py +++ b/hyperion/bin/finetune_wav2vec2transducer.py @@ -12,9 +12,6 @@ import k2 import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -25,6 +22,8 @@ from hyperion.torch.models import HFWav2Vec2Transducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from torch.nn.utils.rnn import pad_sequence model_dict = { diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py index b3edd9b5..fc3c7084 100755 --- a/hyperion/bin/finetune_wav2vec2xvector.py +++ b/hyperion/bin/finetune_wav2vec2xvector.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -26,6 +23,8 @@ HFWavLM2ResNet1dXVector) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, @@ -79,7 +78,12 @@ def init_model(num_classes, in_model_file, rank, **kwargs): def init_hard_prototype_mining(model, train_loader, val_loader, rank): - if not train_loader.batch_sampler.hard_prototype_mining: + try: + hard_prototype_mining = train_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: return if rank == 0: @@ -118,7 +122,11 @@ def train_model(gpu_id, args): logging.info("trainer args={}".format(trn_args)) metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( - model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, ) trainer.load_last_checkpoint() trainer.fit(train_loader, val_loader) diff --git a/hyperion/bin/finetune_xvector_dfr_from_feats.py b/hyperion/bin/finetune_xvector_dfr_from_feats.py index 2ac01025..17cafb85 100755 --- a/hyperion/bin/finetune_xvector_dfr_from_feats.py +++ b/hyperion/bin/finetune_xvector_dfr_from_feats.py @@ -12,9 +12,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import 
torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -25,6 +22,8 @@ from hyperion.torch.models import XVector as XVec from hyperion.torch.trainers import XVectorTrainerDeepFeatReg as Trainer from hyperion.torch.utils import ddp, open_device +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): diff --git a/hyperion/bin/finetune_xvector_dfr_from_wav.py b/hyperion/bin/finetune_xvector_dfr_from_wav.py index ff97d3ca..f7832a47 100755 --- a/hyperion/bin/finetune_xvector_dfr_from_wav.py +++ b/hyperion/bin/finetune_xvector_dfr_from_wav.py @@ -10,9 +10,6 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -24,6 +21,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerDeepFeatRegFromWav as Trainer from hyperion.torch.utils import ddp, open_device +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_data( diff --git a/hyperion/bin/finetune_xvector_from_feats.py b/hyperion/bin/finetune_xvector_from_feats.py index 7a1fb5a9..ac9c2d0b 100755 --- a/hyperion/bin/finetune_xvector_from_feats.py +++ b/hyperion/bin/finetune_xvector_from_feats.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML @@ -23,6 +20,8 @@ from hyperion.torch.models import XVector as XVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp, open_device +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py index 7d602709..1c7cbe58 100755 --- a/hyperion/bin/finetune_xvector_from_wav.py +++ b/hyperion/bin/finetune_xvector_from_wav.py @@ -10,13 +10,6 @@ import time from pathlib import Path -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML @@ -32,6 +25,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) xvec_dict = { "resnet": RXVec, diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index 8c6f38a6..209915c5 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -11,16 +11,9 @@ import numpy as np import pandas as pd -import yaml -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn +import yaml from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as 
AW from hyperion.io import RandomAccessAudioReader as AR @@ -31,6 +24,8 @@ from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialNdx, Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def read_utt_list(list_file, class2int_file, part_idx, num_parts): diff --git a/hyperion/bin/generate_adv_attacks_xvector_verif.py b/hyperion/bin/generate_adv_attacks_xvector_verif.py index fbd3a5fb..363e3afc 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_verif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_verif.py @@ -11,16 +11,9 @@ import numpy as np import pandas as pd -import yaml -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn +import yaml from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -35,6 +28,8 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) class MyModel(nn.Module): diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py index 9e7bac5c..c5a3f6b9 100644 --- a/hyperion/bin/hyperion_dataset.py +++ b/hyperion/bin/hyperion_dataset.py @@ -4,27 +4,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -from typing import Optional, Union, List from pathlib import Path - -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) +from typing import List, Optional, Union from hyperion.hyp_defs import config_logger -from hyperion.utils import ( - PathLike, - Dataset, - InfoTable, - RecordingSet, - FeatureSet, - ClassInfo, - EnrollmentMap, - SegmentSet, -) +from hyperion.utils import (ClassInfo, Dataset, EnrollmentMap, FeatureSet, + InfoTable, PathLike, RecordingSet, SegmentSet) +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) subcommands = ["add_features"] # table_dict = { diff --git a/hyperion/bin/hyperion_tables.py b/hyperion/bin/hyperion_tables.py index a79a1dca..5a5f0b4f 100755 --- a/hyperion/bin/hyperion_tables.py +++ b/hyperion/bin/hyperion_tables.py @@ -4,26 +4,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -from typing import Optional, Union, List from pathlib import Path - -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) +from typing import List, Optional, Union from hyperion.hyp_defs import config_logger -from hyperion.utils import ( - PathLike, - InfoTable, - RecordingSet, - FeatureSet, - ClassInfo, - EnrollmentMap, - SegmentSet, -) +from hyperion.utils import (ClassInfo, EnrollmentMap, FeatureSet, InfoTable, + PathLike, RecordingSet, SegmentSet) +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) subcommands = ["cat"] table_dict = { diff --git a/hyperion/bin/make_babble_noise_audio_files.py b/hyperion/bin/make_babble_noise_audio_files.py index 972ff01f..4a356037 100755 --- a/hyperion/bin/make_babble_noise_audio_files.py +++ b/hyperion/bin/make_babble_noise_audio_files.py @@ -10,15 +10,14 @@ import time import numpy as 
np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from scipy import ndimage, signal - from hyperion.hyp_defs import config_logger from hyperion.io import AudioWriter as Writer from hyperion.io import RandomAccessAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from scipy import ndimage, signal def make_noise(xs): diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py index 4aafa075..78ac59c1 100755 --- a/hyperion/bin/pack_wav_rirs.py +++ b/hyperion/bin/pack_wav_rirs.py @@ -10,16 +10,11 @@ import time import numpy as np -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def pack_wav_rirs(input_path, output_spec, **kwargs): diff --git a/hyperion/bin/plot_embedding_tsne.py b/hyperion/bin/plot_embedding_tsne.py index e011dfe8..e2157e3e 100755 --- a/hyperion/bin/plot_embedding_tsne.py +++ b/hyperion/bin/plot_embedding_tsne.py @@ -13,13 +13,12 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, - ArgumentParser, namespace_to_dict) - from hyperion.hyp_defs import config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.transforms import PCA, LNorm, SklTSNE from hyperion.utils import SegmentSet +from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, + ArgumentParser, namespace_to_dict) matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] diff --git a/hyperion/bin/plot_embedding_tsne_per_class.py b/hyperion/bin/plot_embedding_tsne_per_class.py index 6f35f074..6af0202c 100755 --- a/hyperion/bin/plot_embedding_tsne_per_class.py +++ b/hyperion/bin/plot_embedding_tsne_per_class.py @@ -13,15 +13,14 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, - ArgumentParser, namespace_to_dict) - from hyperion.hyp_defs import config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.clustering import AHC from hyperion.np.transforms import PCA, LNorm, SklTSNE from hyperion.utils import SegmentSet from hyperion.utils.math import cosine_scoring +from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, + ArgumentParser, namespace_to_dict) matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] diff --git a/hyperion/bin/prepare_data.py b/hyperion/bin/prepare_data.py index 4105f482..e90ad0f7 100755 --- a/hyperion/bin/prepare_data.py +++ b/hyperion/bin/prepare_data.py @@ -6,11 +6,10 @@ import logging from pathlib import Path -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - from hyperion.data_prep import DataPrep from hyperion.hyp_defs import config_logger +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def make_parser(data_prep_class): diff --git a/hyperion/bin/preprocess_audio_files.py b/hyperion/bin/preprocess_audio_files.py index 2f4e5cbc..e8adfd16 100755 --- a/hyperion/bin/preprocess_audio_files.py +++ 
b/hyperion/bin/preprocess_audio_files.py @@ -10,15 +10,14 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from scipy import ndimage, signal - from hyperion.hyp_defs import config_logger from hyperion.io import AudioWriter as Writer from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from scipy import ndimage, signal def process_vad(vad, length, fs, dilation, erosion): diff --git a/hyperion/bin/split_dataset_into_trials_and_cohort.py b/hyperion/bin/split_dataset_into_trials_and_cohort.py new file mode 100755 index 00000000..24ec10bf --- /dev/null +++ b/hyperion/bin/split_dataset_into_trials_and_cohort.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path + +from hyperion.hyp_defs import config_logger +from hyperion.utils import Dataset +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + +if __name__ == "__main__": + + parser = ArgumentParser( + description=( + """Split speakers in dataset into test speaker to create ASV trials and + cohort speakers for S-Norm""" + ) + ) + + parser.add_argument("--data-dir", required=True, help="Path to dataset") + parser.add_argument( + "--num-1k-tar-trials", type=int, default=30, help="thousands of target trials" + ) + parser.add_argument( + "--num-trial-speakers", + type=int, + default=1000, + help="number of speakers to create trials", + ) + parser.add_argument( + "--intra-gender", + default=True, + action=ActionYesNo, + help="Whether we create intra gender trials or not", + ) + parser.add_argument("--seed", type=int, default=1123, help="random seed") + parser.add_argument( + "--trials-dir", default=None, help="Path to output trials dataset" + ) + parser.add_argument( + "--cohort-dir", default=None, help="Path to output cohort dataset" + ) + + args = parser.parse_args() + config_logger(1) + data_dir = args.data_dir + cohort_dir = args.cohort_dir + cohort_dir = f"{data_dir}_cohort" if cohort_dir is None else cohort_dir + trials_dir = args.trials_dir + trials_dir = f"{data_dir}_trials" if trials_dir is None else trials_dir + + del args.data_dir + del args.cohort_dir + del args.trials_dir + args = namespace_to_dict(args) + + dataset = Dataset.load(data_dir) + trials_dataset, cohort_dataset = dataset.split_into_trials_and_cohort(**args) + trials_dataset.save(trials_dir) + cohort_dataset.save(cohort_dir) diff --git a/hyperion/bin/train_wav2rnn_transducer.py b/hyperion/bin/train_wav2rnn_transducer.py index 26fcf72c..8930b299 100755 --- a/hyperion/bin/train_wav2rnn_transducer.py +++ b/hyperion/bin/train_wav2rnn_transducer.py @@ -12,9 +12,6 @@ import k2 import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -23,6 +20,8 @@ from hyperion.torch.models import Wav2RNNRNNTransducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from torch.nn.utils.rnn import pad_sequence 
model_dict = { diff --git a/hyperion/bin/train_wav2vec2rnn_transducer.py b/hyperion/bin/train_wav2vec2rnn_transducer.py index 5daffb6d..7018c406 100755 --- a/hyperion/bin/train_wav2vec2rnn_transducer.py +++ b/hyperion/bin/train_wav2vec2rnn_transducer.py @@ -12,9 +12,6 @@ import k2 import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -25,6 +22,8 @@ HFWav2Vec2RNNTransducer) from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from torch.nn.utils.rnn import pad_sequence model_dict = { diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index ce53be86..55f3b996 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -12,9 +12,6 @@ import k2 import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -24,6 +21,8 @@ from hyperion.torch.models import HFWav2Vec2Transducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from torch.nn.utils.rnn import pad_sequence model_dict = { diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index 5e7ecafa..8e1653b1 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -25,6 +22,8 @@ HFWavLM2ResNet1dXVector) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, diff --git a/hyperion/bin/train_xvector_from_feats.py b/hyperion/bin/train_xvector_from_feats.py index 7f4ab0fa..71bba080 100755 --- a/hyperion/bin/train_xvector_from_feats.py +++ b/hyperion/bin/train_xvector_from_feats.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -28,6 +25,8 @@ from hyperion.torch.models import TransformerXVectorV1 as TFXVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) xvec_dict = { "resnet": RXVec, diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py index a210d429..b2e36cac 100755 --- a/hyperion/bin/train_xvector_from_wav.py +++ b/hyperion/bin/train_xvector_from_wav.py @@ -8,13 +8,6 @@ import os from pathlib import Path -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch from hyperion.hyp_defs 
import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD @@ -29,6 +22,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) xvec_dict = { "resnet": RXVec, diff --git a/hyperion/data_prep/__init__.py b/hyperion/data_prep/__init__.py index 9ae59246..e978e219 100644 --- a/hyperion/data_prep/__init__.py +++ b/hyperion/data_prep/__init__.py @@ -6,3 +6,4 @@ from .data_prep import DataPrep from .voxceleb2 import VoxCeleb2DataPrep from .voxceleb1 import VoxCeleb1DataPrep +from .voxsrc22 import VoxSRC22DataPrep diff --git a/hyperion/data_prep/voxceleb1.py b/hyperion/data_prep/voxceleb1.py index c23b64ff..b3958605 100644 --- a/hyperion/data_prep/voxceleb1.py +++ b/hyperion/data_prep/voxceleb1.py @@ -330,7 +330,7 @@ def prepare(self): logging.info("making dataset") dataset = Dataset( segments, - classes={"speaker": speakers, "languages": languages}, + classes={"speaker": speakers, "language_est": languages}, recordings={"recordings": recs}, enrollments=enrollments, trials=trials, diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py index bef34ec9..29ad3e44 100644 --- a/hyperion/data_prep/voxceleb2.py +++ b/hyperion/data_prep/voxceleb2.py @@ -251,7 +251,7 @@ def prepare(self): logging.info("making dataset") dataset = Dataset( segments, - {"speaker": speakers, "languages": languages}, + {"speaker": speakers, "language_est": languages}, {"recordings": recs}, ) logging.info("saving dataset at %s", self.output_dir) diff --git a/hyperion/data_prep/voxsrc22.py b/hyperion/data_prep/voxsrc22.py index 1999262a..79369149 100644 --- a/hyperion/data_prep/voxsrc22.py +++ b/hyperion/data_prep/voxsrc22.py @@ -53,7 +53,7 @@ def __init__( @staticmethod def dataset_name(): - return "voxceleb2" + return "voxsrc22" @staticmethod def add_class_args(parser): @@ -117,11 +117,13 @@ def prepare_track12_dev(self): vox1_segmentid.append(s) vox1_rec_files = [ - glob.glob(f"{self.vox1_corpus_dir}/**/{s}") for s in vox1_segmentid - ] - vox22_rec_files = [ - glob.glob(f"{self.corpus_dir}/**/{s}") for s in vox22_segmentid + glob.glob(f"{self.vox1_corpus_dir}/**/{s}")[0] for s in vox1_segmentid ] + # vox22_rec_files = [ + # glob.glob(f"{self.corpus_dir}/**/{s}")[0] for s in vox22_segmentid + # ] + vox22_rec_files = [f"{self.corpus_dir}/{s}" for s in vox22_segmentid] + rec_ids = vox22_segmentid + vox1_segmentid rec_files = vox22_rec_files + vox1_rec_files @@ -135,7 +137,11 @@ def prepare_track12_dev(self): recs["target_sample_freq"] = self.target_sample_freq logging.info("making SegmentsSet") - segments = pd.DataFrame({"id": rec_ids,}) + segments = pd.DataFrame( + { + "id": rec_ids, + } + ) segments = SegmentSet(segments) segments.sort() @@ -150,7 +156,8 @@ def prepare_track12_dev(self): logging.info("saving dataset at %s", self.output_dir) dataset.save(self.output_dir) logging.info( - "datasets containts %d segments", len(segments), + "datasets containts %d segments", + len(segments), ) # wav_file = voxsrc22_corpus_dir / file_id diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 7fbfbd71..6ee00307 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -205,7 +205,7 @@ def _set_class_weights(self): 
self.class_info.set_uniform_weights() elif self.weight_mode == "data-prior": weights = self.class_info["total_duration"].values - self.class_info.set_weights(self, weights) + self.class_info.set_weights(weights) if self.weight_exponent != 1.0: self.class_info.exp_weights(self.weight_exponent) diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index 5599fa1e..c2bcdf99 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -26,11 +26,9 @@ class HFWav2XVector(TorchModel): than one layer is used. """ - def __init__(self, - hf_feats, - xvector, - feat_fusion_start=0, - feat_fusion_method="weighted-avg"): + def __init__( + self, hf_feats, xvector, feat_fusion_start=0, feat_fusion_method="weighted-avg" + ): super().__init__() self.hf_feats = hf_feats @@ -51,12 +49,9 @@ def _make_fuser(self): self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif self.feat_fusion_method == "linear": self.feat_fuser = nn.Linear(num_layers, 1, bias=False) - self.feat_fuser.weight.data = torch.ones(1, - num_layers) / num_layers + self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers elif self.feat_fusion_method == "cat": - self.feat_fuser = nn.Linear(num_layers * layer_dim, - layer_dim, - bias=False) + self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) def _fuse_hid_feats(self, hid_feats): """Fuses the hidden features from the Wav2Vec model. @@ -71,7 +66,7 @@ def _fuse_hid_feats(self, hid_feats): # There is only one layer of features return hid_feats[0] - hid_feats = hid_feats[self.feat_fusion_start:] + hid_feats = hid_feats[self.feat_fusion_start :] if self.feat_fusion_method == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) @@ -125,14 +120,14 @@ def rebuild_output_layer( num_subcenters=num_subcenters, ) - def forward_feats(self, - x, - x_lengths, - return_feat_layers=None, - chunk_length=0, - detach_chunks=False): - return_hid_states = (False if return_feat_layers is None - and self.feat_fusion_method == "last" else True) + def forward_feats( + self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False + ): + return_hid_states = ( + False + if return_feat_layers is None and self.feat_fusion_method == "last" + else True + ) with self._hf_context: hf_output = self.hf_feats( x, @@ -154,7 +149,8 @@ def forward_feats(self, # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) # as the hidden features of the x-vector encoder. 
hid_feats = [ - f.transpose(1, 2) for i, f in enumerate(hid_feats) + f.transpose(1, 2) + for i, f in enumerate(hid_feats) if i in return_feat_layers ] else: @@ -194,7 +190,8 @@ def forward( "h_classif" (list hidden classification head layers), "h_feats" (wav2vec features) """ feats, hid_feats, feat_lengths = self.forward_feats( - x, x_lengths, return_feat_layers) + x, x_lengths, return_feat_layers + ) output = self.xvector( feats, feat_lengths, @@ -230,16 +227,17 @@ def extract_embed( x, x_lengths = remove_silence(x, x_lengths) feats, _, feat_lengths = self.forward_feats( - x, - x_lengths, - chunk_length=hf_chunk_length, - detach_chunks=detach_chunks) - xvec_chunk_length = int(xvec_chunk_length * - self.hf_feats.sample_frequency * - feats.size(-1) // x.size(-1)) - return self.xvector.extract_embed(feats, feat_lengths, - xvec_chunk_length, embed_layer, - detach_chunks) + x, x_lengths, chunk_length=hf_chunk_length, detach_chunks=detach_chunks + ) + xvec_chunk_length = int( + xvec_chunk_length + * self.hf_feats.sample_frequency + * feats.size(-1) + // x.size(-1) + ) + return self.xvector.extract_embed( + feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks + ) def freeze_feat_fuser(self): if self.feat_fuser is None: @@ -258,6 +256,23 @@ def freeze_hf_feats(self): def freeze_hf_feature_encoder(self): self.hf_feats.freeze_feature_encoder() + def has_param_groups(self): + return self.hf_feats.has_param_groups() + + def trainable_param_groups(self): + if not self.has_param_groups(): + return self.trainable_parameters() + + param_groups = self.hf_feats.trainable_param_groups() + if self.feat_fusion_method == "weighted-avg": + if self.feat_fuser.requires_grad: + param_groups.append({"params": self.feat_fuser}) + else: + param_groups.append({"params": self.feat_fuser.parameters()}) + + param_groups.append({"params": self.xvector.trainable_parameters()}) + return param_groups + def set_train_mode(self, mode): if mode == self._train_mode: return @@ -302,11 +317,11 @@ def _train(self, train_mode: str): self.hf_feats.train() self.xvector._train("ft-embed_affine") elif train_mode in [ - "ft-xvector", - "hf-feats-frozen", - "ft-xvector-nograd", - "hf-feats-frozen-nograd", - "hf-feat-extractor-frozen", + "ft-xvector", + "hf-feats-frozen", + "ft-xvector-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", ]: self.hf_feats.train() self.xvector._train("full") @@ -369,16 +384,19 @@ def add_class_args(parser, prefix=None, skip=set()): "--feat-fusion-start", default=0, type=int, - help= - ("the input to x-vector model will fuse the wav2vec layers from feat_fusion_start to" - "the wav2vec num_layers"), + help=( + "the input to x-vector model will fuse the wav2vec layers from feat_fusion_start to" + "the wav2vec num_layers" + ), ) parser.add_argument( "--feat-fusion-method", default="weighted-avg", choices=["weighted-avg", "linear", "cat", "last"], - help=("method to fuse the hidden layers from the wav2vec model " - "in [weighted-avg, cat]"), + help=( + "method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, cat]" + ), ) if prefix is not None: diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py index 65e5884d..0cb887ca 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -13,8 +13,8 @@ class TorchModel(nn.Module): - """Base class for all Pytorch Models and NNet architectures - """ + """Base class for all Pytorch Models and NNet architectures""" + registry = {} def __init_subclass__(cls, **kwargs): @@ -45,6 
+45,12 @@ def non_trainable_parameters(self, recurse: bool = True): if not param.requires_grad: yield param + def has_param_groups(self): + return False + + def trainable_param_groups(self): + return self.trainable_parameters() + def freeze(self): for param in self.parameters(): param.requires_grad = False @@ -109,10 +115,9 @@ def save(self, file_path): os.makedirs(file_dir, exist_ok=True) config = self.get_config() - torch.save({ - "model_cfg": self.get_config(), - "model_state_dict": self.state_dict() - }) + torch.save( + {"model_cfg": self.get_config(), "model_state_dict": self.state_dict()} + ) @staticmethod def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None): @@ -132,8 +137,7 @@ def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None): @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = TorchModel._load_cfg_state_dict( - file_path, cfg, state_dict) + cfg, state_dict = TorchModel._load_cfg_state_dict(file_path, cfg, state_dict) model = cls(**cfg) if state_dict is not None: @@ -148,14 +152,15 @@ def get_loss(self): @property def device(self): - devices = {param.device - for param in self.parameters() - } | {buf.device - for buf in self.buffers()} + devices = {param.device for param in self.parameters()} | { + buf.device for buf in self.buffers() + } if len(devices) != 1: raise RuntimeError( "Cannot determine device: {} different devices found".format( - len(devices))) + len(devices) + ) + ) return next(iter(devices)) @@ -217,5 +222,4 @@ def auto_load(file_path, extra_objs={}, map_location=None): # if it failed the 3 trials raise exception raise err # remove module prefix when is trained with dataparallel - state_dict = ODict( - (p.sub("", k), v) for k, v in state_dict.items()) + state_dict = ODict((p.sub("", k), v) for k, v in state_dict.items()) diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py index b2198924..2957e433 100644 --- a/hyperion/torch/tpm/hf/hf_hubert.py +++ b/hyperion/torch/tpm/hf/hf_hubert.py @@ -135,6 +135,8 @@ class HFHubert(HFWav2VecBase): chunk by chunk, if it is too long to fit in GPU. right_encoder_context: (`int`): future context frames used by the transformer encoder. sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. + encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. 
""" def __init__( @@ -182,6 +184,8 @@ def __init__( left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, ): super().__init__( @@ -199,6 +203,8 @@ def __init__( left_encoder_context=left_encoder_context, right_encoder_context=right_encoder_context, sample_frequency=sample_frequency, + feat_extract_lr=feat_extract_lr, + encoder_lr=encoder_lr, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -287,6 +293,32 @@ def num_encoder_layers(self): def hidden_size(self): return self.hf_config.hidden_size + def change_dropouts( + self, + hidden_dropout: float = 0.1, + activation_dropout: float = 0.1, + attention_dropout: float = 0.1, + feat_proj_dropout: float = 0.1, + **kwargs, + ): + import transformers.models.hubert.modeling_hubert as t + + self.hf_model.config.hidden_dropout = hidden_dropout + self.hf_model.config.activation_dropout = activation_dropout + self.hf_model.config.attention_dropout = attention_dropout + self.hf_model.config.feat_proj_dropout = feat_proj_dropout + + self.hf_model.feature_projection.dropout.p = feat_proj_dropout + for module in self.hf_model.encoder.modules(): + if isinstance(module, nn.Dropout): + module.p = hidden_dropout + + for module in self.hf_model.encoder.modules(): + if isinstance(module, t.HubertAttention): + module.dropout = activation_dropout + if isinstance(module, t.HubertFeatureProjection): + module.intermediate_dropout.p = activation_dropout + def drop_upper_layers(self, max_layers: int): if max_layers >= self.hf_config.num_hidden_layers: return diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index e1f21153..26da7beb 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -148,6 +148,8 @@ class HFWav2Vec2(HFWav2VecBase): chunk by chunk, if it is too long to fit in GPU. right_encoder_context: (`int`): future context frames used by the transformer encoder. sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. + encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. """ def __init__( @@ -200,6 +202,8 @@ def __init__( left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, ): super().__init__( @@ -217,6 +221,8 @@ def __init__( left_encoder_context=left_encoder_context, right_encoder_context=right_encoder_context, sample_frequency=sample_frequency, + feat_extract_lr=feat_extract_lr, + encoder_lr=encoder_lr, ) if pretrained_model_path is not None and not ignore_pretrained: diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index b0a815c7..a9c4ddef 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -53,6 +53,8 @@ class HFWav2VecBase(TorchModel): chunk by chunk, if it is too long to fit in GPU. right_encoder_context: (`int`): future context frames used by the transformer encoder. sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. 
+ encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. """ def __init__( @@ -71,6 +73,8 @@ def __init__( left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, ): super().__init__() self.pretrained_model_path = pretrained_model_path @@ -84,6 +88,8 @@ def __init__( self.override_spec_augment = override_spec_augment self.right_encoder_context = right_encoder_context self.left_encoder_context = left_encoder_context + self.feat_extract_lr = feat_extract_lr + self.encoder_lr = encoder_lr if pretrained_model_path is not None and not ignore_pretrained: rank = ddp_get_rank() @@ -215,7 +221,14 @@ def out_shape(self, in_shape): C = self.hf_model.config.hidden_size return (in_shape[0], out_length, C) - def change_config(self, override_dropouts, override_spec_augment, **kwargs): + def change_config( + self, + override_dropouts: bool, + override_spec_augment: bool, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, + **kwargs, + ): if override_spec_augment: logging.info("overriding speech augment") self.change_spec_augment(**kwargs) @@ -224,6 +237,9 @@ def change_config(self, override_dropouts, override_spec_augment, **kwargs): logging.info("overriding hf model dropouts") self.change_dropouts(**kwargs) + self.feat_extract_lr = feat_extract_lr + self.encoder_lr = encoder_lr + def change_spec_augment( self, apply_spec_augment: bool = True, @@ -249,6 +265,35 @@ def change_dropouts(self, **kwargs): def freeze_feature_encoder(self): self.hf_model.freeze_feature_encoder() + def has_param_groups(self): + return self.feat_extract_lr is not None or self.encoder_lr is not None + + def trainable_param_groups(self): + if not self.has_param_groups(): + return self.trainable_parameters() + + if self.feat_extract_lr == self.encoder_lr: + return [{"params": self.trainable_parameters(), "lr": self.encoder_lr}] + + param_groups = [ + {"params": self.hf_model.feature_extractor.parameters()}, + {"params": self.hf_model.feature_projection.parameters()}, + {"params": self.hf_model.encoder.parameters()}, + ] + if self.hf_model.adapter is not None: + param_groups.append({"params": self.hf_model.adapter.parameters()}) + + if self.feat_extract_lr is not None: + param_groups[0]["lr"] = self.feat_extract_lr + param_groups[1]["lr"] = self.feat_extract_lr + + if self.encoder_lr is not None: + param_groups[2]["lr"] = self.encoder_lr + if len(param_groups) == 4: + param_groups[3]["lr"] = self.encoder_lr + + return param_groups + @property def hf_config(self): return self.hf_model.config @@ -570,7 +615,6 @@ def add_class_args(parser, prefix=None, skip=set()): help=("file path or HuggingFace Hub path to pre-trained model"), ) - parser.add_argument( "--normalize-input", default=True, @@ -659,6 +703,24 @@ def add_class_args(parser, prefix=None, skip=set()): "when the signal is evaluated chunk by chunk." ), ) + parser.add_argument( + "--feat-extractor-lr", + default=None, + type=float, + help=( + "lr for conv feature extractor, it serves to set a lr " + "different than the global one." + ), + ) + parser.add_argument( + "--encoder-lr", + default=None, + type=float, + help=( + "lr for transformer encoder, it serves to set a lr " + "different than the global one." 
+ ), + ) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @@ -696,6 +758,24 @@ def add_finetune_args(parser, prefix=None, skip=set()): "arguments instead of the defaults in the pretrained model." ), ) + parser.add_argument( + "--feat-extractor-lr", + default=None, + type=float, + help=( + "lr for conv feature extractor, it serves to set a lr " + "different than the global one." + ), + ) + parser.add_argument( + "--encoder-lr", + default=None, + type=float, + help=( + "lr for transformer encoder, it serves to set a lr " + "different than the global one." + ), + ) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py index 0d5c5ad3..e1b67d81 100644 --- a/hyperion/torch/tpm/hf/hf_wavlm.py +++ b/hyperion/torch/tpm/hf/hf_wavlm.py @@ -148,6 +148,8 @@ class HFWavLM(HFWav2VecBase): chunk by chunk, if it is too long to fit in GPU. right_encoder_context: (`int`): future context frames used by the transformer encoder. sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. + encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. """ def __init__( @@ -200,6 +202,8 @@ def __init__( left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, ): super().__init__( @@ -217,6 +221,8 @@ def __init__( left_encoder_context=left_encoder_context, right_encoder_context=right_encoder_context, sample_frequency=sample_frequency, + feat_extract_lr=feat_extract_lr, + encoder_lr=encoder_lr, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -310,6 +316,32 @@ def num_encoder_layers(self): def hidden_size(self): return self.hf_config.hidden_size + def change_dropouts( + self, + hidden_dropout: float = 0.1, + activation_dropout: float = 0.1, + attention_dropout: float = 0.1, + feat_proj_dropout: float = 0.1, + **kwargs, + ): + import transformers.models.wavlm.modeling_wavlm as t + + self.hf_model.config.hidden_dropout = hidden_dropout + self.hf_model.config.activation_dropout = activation_dropout + self.hf_model.config.attention_dropout = attention_dropout + self.hf_model.config.feat_proj_dropout = feat_proj_dropout + + self.hf_model.feature_projection.dropout.p = feat_proj_dropout + for module in self.hf_model.encoder.modules(): + if isinstance(module, nn.Dropout): + module.p = hidden_dropout + + for module in self.hf_model.encoder.modules(): + if isinstance(module, t.WavLMAttention): + module.dropout = activation_dropout + if isinstance(module, t.WavLMFeatureProjection): + module.intermediate_dropout.p = activation_dropout + def drop_upper_layers(self, max_layers: int): if max_layers >= self.hf_config.num_hidden_layers: return diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index c8565d1d..5e41747c 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -163,7 +163,9 @@ def __init__( oss = False if ddp_type == DDPType.DDP else True self.optimizer = self._make_optimizer(optim, self.model, oss=oss) self.model = TorchDDP( - self.model, device_ids=[device], output_device=device, + self.model, + device_ids=[device], + output_device=device, ) elif 
ddp_type == DDPType.OSS_SHARDED_DDP: self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) @@ -424,7 +426,9 @@ def _make_optimizer(self, optim, model, oss=False): opt_args["oss"] = oss if self.rank == 0: logging.info("optimizer args={}".format(opt_args)) - optimizer = OF.create(model.parameters(), **opt_args) + + # optimizer = OF.create(model.parameters(), **opt_args) + optimizer = OF.create(model.trainable_param_groups(), **opt_args) return optimizer def _make_lr_sched(self, lr_sched, optim): @@ -458,8 +462,8 @@ def _default_loggers(self, log_interval, use_tensorboard, use_wandb, wandb): def _get_lr(self): """Returns the current learning rate to show in the loggers""" - for param_group in self.optimizer.param_groups: - return param_group["lr"] + lrs = [param_group["lr"] for param_group in self.optimizer.param_groups] + return max(lrs) def _compute_grad_acc_steps(self, data_loader): if self.eff_batch_size is None: diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index 0ef81ab6..d1d969fb 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -2,10 +2,13 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - +import logging from pathlib import Path from typing import Dict, Optional, Union - +from copy import deepcopy +import math +import numpy as np +import pandas as pd import yaml from .class_info import ClassInfo @@ -93,10 +96,13 @@ def _parse_dict_args(self, data, types): return objects, paths + def clone(self): + return deepcopy(self) + def segments(self, keep_loaded: bool = True): if self._segments is None: assert self._segments_path is not None - segments = SegmentSet.load(self.segments_path, sep=self.table_sep) + segments = SegmentSet.load(self._segments_path, sep=self.table_sep) if keep_loaded: self._segments = segments return segments @@ -111,6 +117,7 @@ def recordings_value(self, key: str, keep_loaded: bool = True): ) if keep_loaded: self._recordings[key] = recordings + return recordings return self._recordings[key] @@ -120,6 +127,7 @@ def features_value(self, key: str, keep_loaded: bool = True): features = FeatureSet.load(self._features_paths[key], sep=self.table_sep) if keep_loaded: self._features[key] = features + return features return self._features[key] @@ -129,6 +137,7 @@ def classes_value(self, key: str, keep_loaded: bool = True): classes = ClassInfo.load(self._classes_paths[key], self.table_sep) if keep_loaded: self._classes[key] = classes + return classes return self._classes[key] @@ -140,6 +149,7 @@ def enrollments_value(self, key: str, keep_loaded: bool = True): ) if keep_loaded: self._enrollments[key] = enrollments + return enrollments return self._enrollments[key] @@ -156,6 +166,7 @@ def trials_value(self, key: str, keep_loaded: bool = True): if keep_loaded: self._trials[key] = trials + return trials return self._trials[key] @@ -194,6 +205,49 @@ def trials(self, keep_loaded: bool = True): for key in self._trials.keys(): yield key, self.trials_value(key, keep_loaded) + # def add_recordings(self, recordings: Dict[str, Union[RecordingSet, PathLike]]): + # recordings, recordings_paths = self._parse_dict_args(recordings, RecordingSet) + # if self._recordings is None: + # self._recordings = self._recordings_paths = {} + # self._recordings.update(recordings) + # self._recordings_paths.update(recordings_paths) + + # def add_features(self, features: Dict[str, Union[FeatureSet, PathLike]]): + # features, features_paths = self._parse_dict_args(features, 
FeatureSet) + # if self._features is None: + # self._features = self._features_paths = {} + # self._features.update(features) + # self._features_paths.update(features_paths) + + # def add_classes(self, classes: Dict[str, Union[ClassInfo, PathLike]]): + # classes, classes_paths = self._parse_dict_args(classes, ClassInfo) + # if self._classes is None: + # self._classes = self._classes_paths = {} + # self._classes.update(classes) + # self._classes_paths.update(classes_paths) + + # def add_enrollments(self, enrollments: Dict[str, Union[EnrollmentMap, PathLike]]): + # enrollments, enrollments_paths = self._parse_dict_args( + # enrollments, + # EnrollmentMap, + # ) + # if self._enrollments is None: + # self._enrollments = self._enrollments_paths = {} + # self._enrollments.update(enrollments) + # self._enrollments_paths.update(enrollments_paths) + + # def add_trials( + # self, trials: Dict[str, Union[TrialKey, TrialNdx, SparseTrialKey, PathLike]] + # ): + # trials, trials_paths = self._parse_dict_args( + # trials, + # (TrialKey, TrialNdx, SparseTrialKey), + # ) + # if self._trials is None: + # self._trials = self._trials_paths = {} + # self._trials.update(trials) + # self._trials_paths.update(trials_paths) + @staticmethod def resolve_dataset_path(dataset_path): dataset_path = Path(dataset_path) @@ -209,6 +263,8 @@ def resolve_dataset_path(dataset_path): @staticmethod def resolve_file_path(dataset_dir, file_path): + dataset_dir = Path(dataset_dir) + file_path = Path(file_path) if file_path.is_file(): return file_path @@ -274,95 +330,100 @@ def save_changed( if update_paths: self._segments_path = file_path - file_names = {} - for k in self._recordings.keys(): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._recordings is not None - or file_path != self._recordings_paths[k] - or not file_path.exists() - ): - v = self.recordings_value(k, keep_loaded=False) - v.save(file_path, sep=table_sep) - if update_paths: - self._recordings_paths[k] = file_path - - if file_names: - dataset["recordings"] = file_names - - file_names = {} - for k in self._features.keys(): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._features is not None - or file_path != self._features_paths[k] - or not file_path.exists() - ): - v = self.features_value(k, keep_loaded=False) - v.save(file_path, sep=table_sep) - if update_paths: - self._features_paths[k] = file_path - - if file_names: - dataset["features"] = file_names - - file_names = {} - for k, v in self._classes.keys(): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._classes is not None - or file_path != self._classes_paths[k] - or not file_path.exists() - ): - v = self.classes_value(k, keep_loaded=False) - v.save(file_path, sep=table_sep) - if update_paths: - self._classes_paths[k] = file_path - - if file_names: - dataset["classes"] = file_names - - file_names = {} - for k, v in self._enrollments.keys(): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._enrollments is not None - or file_path != self._enrollments_paths[k] - or not file_path.exists() - ): - v = self.enrollments_value(k, keep_loaded=False) - v.save(file_path, sep=table_sep) - if update_paths: - self._enrollments_paths[k] = file_path - - if file_names: - dataset["enrollments"] = file_names - - file_names = {} - for k, v in self._trials.keys(): - file_name = k + table_ext - 
file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._trials is not None - or file_path != self._trials_paths[k] - or not file_path.exists() - ): - v = self.trials_value(k, keep_loaded=False) - v.save(file_path) - if update_paths: - self._trials_paths[k] = file_path - - if file_names: - dataset["trials"] = file_names + if self._recordings is not None: + file_names = {} + for k in self._recordings.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._recordings[k] is not None + or file_path != self._recordings_paths[k] + or not file_path.exists() + ): + v = self.recordings_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._recordings_paths[k] = file_path + + if file_names: + dataset["recordings"] = file_names + + if self._features is not None: + file_names = {} + for k in self._features.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._features[k] is not None + or file_path != self._features_paths[k] + or not file_path.exists() + ): + v = self.features_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._features_paths[k] = file_path + + if file_names: + dataset["features"] = file_names + + if self._classes is not None: + file_names = {} + for k in self._classes.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._classes[k] is not None + or file_path != self._classes_paths[k] + or not file_path.exists() + ): + v = self.classes_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._classes_paths[k] = file_path + + if file_names: + dataset["classes"] = file_names + + if self._enrollments is not None: + file_names = {} + for k in self._enrollments.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._enrollments[k] is not None + or file_path != self._enrollments_paths[k] + or not file_path.exists() + ): + v = self.enrollments_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._enrollments_paths[k] = file_path + + if file_names: + dataset["enrollments"] = file_names + + if self._trials is not None: + file_names = {} + for k in self._trials.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._trials[k] is not None + or file_path != self._trials_paths[k] + or not file_path.exists() + ): + v = self.trials_value(k, keep_loaded=False) + v.save(file_path) + if update_paths: + self._trials_paths[k] = file_path + + if file_names: + dataset["trials"] = file_names with open(dataset_file, "w") as f: yaml.dump(dataset, f) @@ -491,7 +552,7 @@ def load( """ dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) - with open(dataset_file, "w") as f: + with open(dataset_file, "r") as f: dataset = yaml.safe_load(f) assert "segments" in dataset @@ -503,27 +564,27 @@ def load( trials = None if "classes" in dataset: classes = {} - for k, v in dataset["classes"]: + for k, v in dataset["classes"].items(): classes[k] = Dataset.resolve_file_path(dataset_dir, v) if "recordings" in dataset: recordings = {} - for k, v in dataset["recordings"]: + for k, v in dataset["recordings"].items(): recordings[k] = Dataset.resolve_file_path(dataset_dir, v) if "features" in dataset: features = {} - for k, v in dataset["features"]: + 
for k, v in dataset["features"].items(): features[k] = Dataset.resolve_file_path(dataset_dir, v) if "enrollments" in dataset: enrollments = {} - for k, v in dataset["enrollments"]: + for k, v in dataset["enrollments"].items(): enrollments[k] = Dataset.resolve_file_path(dataset_dir, v) if "trials" in dataset: trials = {} - for k, v in dataset["trials"]: + for k, v in dataset["trials"].items(): trials[k] = Dataset.resolve_file_path(dataset_dir, v) dataset = cls( @@ -541,6 +602,10 @@ def load( return dataset def add_features(self, features_name: str, features: Union[PathLike, FeatureSet]): + if self._features is None: + self._features = {} + self._features_paths = {} + if isinstance(features, (str, Path)): self._features[features_name] = None self._features_paths[features_name] = features @@ -555,6 +620,10 @@ def add_recordings( recordings_name: str, recordings: Union[PathLike, RecordingSet], ): + if self._recordings is None: + self._recordings = {} + self._recordings_paths = {} + if isinstance(features, (str, Path)): self._recordings[features_name] = None self._recordings_paths[recordings_name] = recordings @@ -565,6 +634,10 @@ def add_recordings( raise ValueError() def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]): + if self._classes is None: + self._classes = {} + self._classes_paths = {} + if isinstance(classes, (str, Path)): self._classes[features_name] = None self._classes_paths[classes_name] = classes @@ -579,8 +652,12 @@ def add_enrollments( enrollments_name: str, enrollments: Union[PathLike, EnrollmentMap], ): - if isinstance(features, (str, Path)): - self._enrollments[features_name] = None + if self._enrollments is None: + self._enrollments = {} + self._enrollments_paths = {} + + if isinstance(enrollments, (str, Path)): + self._enrollments[enrollments_name] = None self._enrollments_paths[enrollments_name] = enrollments elif isinstance(enrollments, EnrollmentMap): self._enrollments[enrollments_name] = enrollments @@ -593,7 +670,11 @@ def add_trials( trials_name: str, trials: Union[PathLike, TrialKey, TrialNdx, SparseTrialKey], ): - if isinstance(features, (str, Path)): + if self._trials is None: + self._trials = {} + self._trials_paths = {} + + if isinstance(trials, (str, Path)): self._trials[features_name] = None self._trials_paths[trials_name] = trials elif isinstance(trials, (TrialKey, TrialNdx, SparseTrialKey)): @@ -601,3 +682,220 @@ def add_trials( self._trials_paths[trials_name] = None else: raise ValueError() + + def remove_features(self, features_name: str): + if self._features_paths[features_name] is not None: + file_path = Path(self._features_paths[features_name]) + if file_path.is_file(): + file_path.unlink() + + del self._features[features_name] + del self._features_paths[features_name] + + def remove_recordings( + self, + recordings_name: str, + ): + if self._recordingsr_paths[recordings_name] is not None: + file_path = Path(self._recordings_paths[recordings_name]) + if file_path.is_file(): + file_path.unlink() + + del self._recordings[recordings_name] + del self._recordings_paths[recordings_name] + + def remove_classes(self, classes_name: str): + if self._classes_paths[classes_name] is not None: + file_path = Path(self._classes_paths[classes_name]) + if file_path.is_file(): + file_path.unlink() + + del self._classes[classes_name] + del self._classes_paths[classes_name] + + def remove_enrollments( + self, + enrollments_name: str, + ): + if self._enrollments_paths[enrollments_name] is not None: + file_path = 
Path(self._enrollments_paths[enrollments_name]) + if file_path.is_file(): + file_path.unlink() + + del self._enrollments[enrollments_name] + del self._enrollments_paths[enrollments_name] + + def remove_trials( + self, + trials_name: str, + ): + if self._trials_paths[trials_name] is not None: + file_path = Path(self._trials_paths[trials_name]) + if file_path.is_file(): + file_path.unlink() + + del self._trials[trials_name] + del self._trials_paths[trials_name] + + def set_segments(self, segments: Union[PathLike, SegmentSet]): + if isinstance(segments, SegmentSet): + self._segments = segments + else: + self._segments_path = segments + + def clean(self): + rec_ids = self.segments().recording_ids() + for k, table in self.recordings(): + table = table.loc[table["id"].isin(rec_ids)].copy() + self._recordings[k] = RecordingSet(table) + + ids = self.segments()["id"].values + for k, table in self.features(): + table = table.loc[table["id"].isin(ids)].copy() + self._features[k] = FeatureSet(table) + + for k, table in self.classes(): + class_ids = self.segments()[k].unique() + table = table[table["id"].isin(class_ids)].copy() + self._classes[k] = ClassInfo(table) + + remove_keys = [] + for k, table in self.enrollments(): + table = table.loc[table["segmentid"].isin(ids)].copy() + if len(table) > 0: + self._enrollments[k] = EnrollmentMap(table) + else: + remove_keys.append(k) + + for k in remove_keys: + self.remove_enrollments(k) + + remove_keys = [] + for k, key in self.trials(): + keep_ids = [cur_id for cur_id in key.seg_set if cur_id in ids] + if keep_ids: + key = key.filter(key.model_set, keep_ids, keep=True) + self._trials[k] = key + else: + remove_keys.append(k) + + for k in remove_keys: + self.remove_trials(k) + + def _split_into_trials_and_cohort( + self, + segments: SegmentSet, + num_tar_trials: int, + num_trial_speakers: int, + seed: int, + ): + # select test speakers + rng = np.random.RandomState(seed=seed) + + spks = segments["speaker"].unique() + trial_spks = rng.choice(spks, size=(num_trial_speakers,), replace=False) + snorm_segments = SegmentSet(segments[~segments["speaker"].isin(trial_spks)]) + + trial_segments = segments[segments["speaker"].isin(trial_spks)] + # solution of 2nd degree eq. + # num_spks * n (n-1) /2 = num_trials + num_segs_per_spk = int( + math.ceil((1 + math.sqrt(1 + 8 * num_tar_trials // num_trial_speakers)) / 2) + ) + + n = num_trial_speakers * num_segs_per_spk + seg_ids = rng.choice(trial_segments["id"], size=(n,), replace=False) + trial_segments = SegmentSet(segments[segments["id"].isin(seg_ids)]) + seg_ids = trial_segments["id"].values + class_ids = trial_segments["speaker"].values + tar = np.zeros((n - 1, n), dtype=bool) + non = np.zeros((n - 1, n), dtype=bool) + + ntar = 0 + nnon = 0 + for i in range(n - 1): + for j in range(i + 1, n): + if class_ids[i] == class_ids[j]: + tar[i, j] = True + else: + non[i, j] = True + + logging.info("Got ntar=%d and nnon=%d", tar.sum(), non.sum()) + trials = TrialKey(seg_ids[:-1], seg_ids, tar, non) + df_enr = pd.DataFrame({"id": seg_ids[:-1], "segmentid": seg_ids[:-1]}) + enrollments = EnrollmentMap(df_enr) + return trials, enrollments, snorm_segments + + def split_into_trials_and_cohort( + self, + num_1k_tar_trials: int, + num_trial_speakers: int, + intra_gender: bool = True, + trials_name="trials_qmf", + seed=1123, + ): + """When training quality measure fusion in, e.g., VoxCeleb recipe. 
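The per-speaker segment count in _split_into_trials_and_cohort above solves n(n-1)/2 >= num_tar_trials / num_trial_speakers for n. A quick worked example with illustrative numbers:

    # e.g. 15000 target trials spread over 500 trial speakers -> 30 target trials per speaker
    # n(n-1)/2 >= 30  =>  n >= (1 + sqrt(1 + 8 * 30)) / 2 = (1 + sqrt(241)) / 2 ~= 8.26
    # num_segs_per_spk = ceil(8.26) = 9, i.e. 9 * 8 / 2 = 36 >= 30 same-speaker pairs per speaker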
+ We split the data into 2 parts: + 1) used to calculate SV scores to train the fusion + 2) cohort used to calculate the S-Norm parameters used in the QMF. + + The trials_file will be stored in the current dataset + A new dataset is created with only the cohort speakers + + Args: + num_1k_tar_trials: num of 1000 target trials. + num_trial_speakers: number of spks used to create trials. + intra_gender: if True, no cross gender trials are done. + + Returns: + Dataset used for trials with trial list. + Dataset used for cohort. + """ + num_tar_trials = num_1k_tar_trials * 1000 + if intra_gender: + num_tar_trials = num_tar_trials // 2 + num_trial_speakers = num_trial_speakers // 2 + segments = self.segments() + segments_male = SegmentSet(segments[segments["gender"] == "m"]) + segments_female = SegmentSet(segments[segments["gender"] == "f"]) + trials_male, enroll_male, cohort_male = self._split_into_trials_and_cohort( + segments_male, + num_tar_trials, + num_trial_speakers, + seed, + ) + ( + trials_female, + enroll_female, + cohort_female, + ) = self._split_into_trials_and_cohort( + segments_female, + num_tar_trials, + num_trial_speakers, + seed, + ) + trials = TrialKey.merge([trials_male, trials_female]) + enroll = EnrollmentMap.cat([enroll_male, enroll_female]) + cohort = SegmentSet.cat([cohort_male, cohort_female]) + else: + segments = self.segments() + trials, enroll, cohort = self._split_into_trials_and_cohort( + segments, + num_tar_trials, + num_trial_speakers, + seed, + ) + + dataset_trials = self.clone() + segments = self.segments() + trials_segments = SegmentSet(segments.loc[segments["id"].isin(trials.seg_set)]) + dataset_trials.set_segments(trials_segments) + dataset_trials.add_trials("trials", trials) + dataset_trials.add_enrollments("enrollments", enroll) + dataset_trials.clean() + + dataset_cohort = self.clone() + dataset_cohort.set_segments(cohort) + dataset_cohort.clean() + + return dataset_trials, dataset_cohort diff --git a/hyperion/utils/segment_set.py b/hyperion/utils/segment_set.py index 1852d25d..6aef5bb2 100644 --- a/hyperion/utils/segment_set.py +++ b/hyperion/utils/segment_set.py @@ -8,7 +8,7 @@ class SegmentSet(InfoTable): """Class to store information about a speech segment - Internally, it uses a pandas table. + Internally, it uses a pandas table. 
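A hedged usage sketch for the split_into_trials_and_cohort method above (the Dataset variable and argument values are illustrative; the segments table is assumed to carry "speaker" and "gender" columns as the method requires):

    # dset is a hyperion Dataset prepared for speaker verification
    dset_trials, dset_cohort = dset.split_into_trials_and_cohort(
        num_1k_tar_trials=30,      # ~30k target trials
        num_trial_speakers=1000,
        intra_gender=True,
    )
    # dset_trials carries the new "trials" key and "enrollments" map and is cleaned of
    # unused recordings/classes; dset_cohort keeps the held-out speakers for S-Norm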
""" def __init__(self, df): @@ -29,7 +29,13 @@ def has_time_marks(self): def has_recording_ids(self): return "recording_id" in self.df - def recording_ids(self, ids): + def recording_ids(self, ids=None): + if ids is None: + if "recording_id" in self.df: + return self.df["recording_id"] + else: + return self.df["id"] + if "recording_id" in self.df: return self.df.loc[ids, "recording_id"] From ac71e9aed4e1a5b490ddc0f37dfa31ec4e3b2d31 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 1 Jul 2023 08:38:42 +0000 Subject: [PATCH 64/89] update joint-training of LID-FILM-ASR --- ...ucer_ecapadnn512x3_1layer_stage1_v2.1.yaml | 140 +++++++++++++++ ...ucer_ecapadnn512x3_1layer_stage2_v2.1.yaml | 91 ++++++++++ ...uned_filmed_transducer_lid_v2.1_13langs.sh | 43 +++++ .../initailize_joint_film_lid_model_bias.py | 74 ++++++++ .../v1/run_025_train_film_asr_lid.sh | 18 +- ...wav2vec2rnn_film_transducer_languageid.py} | 168 ++++++++---------- hyperion/torch/data/audio_dataset.py | 2 +- hyperion/torch/layer_blocks/film_blocks.py | 17 +- .../layer_blocks/transducer_film_joiner.py | 22 ++- .../layer_blocks/transducer_film_predictor.py | 8 +- .../hf_wav2rnn_film_transducer_languageid.py | 88 +++++++-- .../hf_wav2rnn_transducer_languageid.py | 1 + ..._wav2vec2rnn_film_transducer_languageid.py | 30 ++++ .../narchs/rnn_film_transducer_decoder.py | 37 ++-- 14 files changed, 593 insertions(+), 146 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh create mode 100644 egs/commonvoice/v1/local/initailize_joint_film_lid_model_bias.py rename hyperion/bin/{finetune_wav2vec2transducer_languageid.py => finetune_wav2vec2rnn_film_transducer_languageid.py} (60%) diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1.yaml new file mode 100644 index 00000000..7347e8b4 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1.yaml @@ -0,0 +1,140 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + film_cond_type: lid_pred_embed + reduction: mean + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.3 + hid_act: swish + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + loss_weight_embed: 10 + lid_length: 3.0 + + feat_fusion_method_transducer: film-fused-feature + feat_fusion_method_lid: weighted-avg + feat_fusion_start_transducer: 2 + feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml new file mode 100644 index 00000000..377ea296 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml @@ -0,0 +1,91 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + transducer: + decoder: + prune_range: 15 + override_dropouts: false + languageid: + cos_scale: 32.0 + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 10.0 + loss_weight_embed: 10 + lid_length: 3.0 + + # feat_fusion_method_transducer: film-fused-feature + # feat_fusion_method_lid: weighted-avg + # feat_fusion_start_transducer: 2 + # feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh new file mode 100644 index 00000000..b0e39914 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh @@ -0,0 +1,43 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_film_transducer_resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0009.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v2.1.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/local/initailize_joint_film_lid_model_bias.py b/egs/commonvoice/v1/local/initailize_joint_film_lid_model_bias.py new file mode 100644 index 00000000..3bc5148f --- /dev/null +++ b/egs/commonvoice/v1/local/initailize_joint_film_lid_model_bias.py @@ -0,0 +1,74 @@ +import torch +import sys +# arguments example +# + 
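The script takes four positional arguments, read below via sys.argv; a hedged invocation sketch (checkpoint paths are hypothetical):

    # python local/initailize_joint_film_lid_model_bias.py \
    #        exp/asr_nnets/model_ep0010.pth \    # sys.argv[1]: pretrained ASR transducer checkpoint
    #        exp/lid_nnets/model_ep0012.pth \    # sys.argv[2]: pretrained LID checkpoint
    #        exp/joint_nnets/model_ep0001.pth \  # sys.argv[3]: joint FiLM model to initialize
    #        exp/joint_nnets/model_init.pth      # sys.argv[4]: output checkpoint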
+ASR_model = torch.load(sys.argv[1]) +LID_model = torch.load(sys.argv[2]) +joint_model = torch.load(sys.argv[3]) + +output_model = sys.argv[4] + + +def check_update_parameters(joint_state_dict, new_joint_state_dict): + shape_changed_parameters = [] + unchanged_parameters = [] + changed_parameters = [] + unloaded_parameters = [] + for name, param in joint_state_dict.items(): + new_param = new_joint_state_dict[name].to(param.device) + if param.shape != new_param.shape: + shape_changed_parameters.append(name) + elif torch.all(torch.eq(param, new_param)): + unchanged_parameters.append(name) + else: + changed_parameters.append(name) + print("Shape changed parameters: {}".format(shape_changed_parameters)) + print("Unchanged parameters: {}".format(unchanged_parameters)) + print("Changed parameters: {}".format(changed_parameters)) + + + +def copy_model_parameters(ASR_model, LID_model, joint_model, output_model): + ASR_state_dict = ASR_model["model_state_dict"] + LID_state_dict = LID_model["model_state_dict"] + + LID_state_dict = {"module." + name: param for name, param in LID_state_dict.items()} + + joint_state_dict = joint_model["model_state_dict"] + + hf_feats_update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "hf_feats" in name} + transducer_update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and ("transducer" in name or "film" in name)} + languageid_update_state_dict = {name: param for name, param in LID_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "languageid" in name} + + + film_update_state_dict = {} + for name, param in joint_state_dict.items(): + if "linear_scale.weight" in name and "lid_film" in name: + film_update_state_dict[name] = torch.zeros_like(param) + elif "linear_scale.bias" in name and "lid_film" in name: + film_update_state_dict[name] = torch.ones_like(param) + elif ("linear_shift.weight" in name or "linear_shift.bias" in name) and "lid_film" in name: + film_update_state_dict[name] = torch.zeros_like(param) + + new_joint_state_dict = joint_state_dict.copy() + new_joint_state_dict.update(hf_feats_update_state_dict) + new_joint_state_dict.update(transducer_update_state_dict) + new_joint_state_dict.update(languageid_update_state_dict) + new_joint_state_dict.update(film_update_state_dict) + + # import pdb;pdb.set_trace() + + new_joint_state_dict["module.transducer_fuser"] = ASR_state_dict["module.feat_fuser"] + new_joint_state_dict["module.languageid_fuser"] = LID_state_dict["module.feat_fuser"] + + + joint_model["model_state_dict"] = new_joint_state_dict + joint_model["epoch"] =1 + + check_update_parameters(joint_state_dict, new_joint_state_dict) + torch.save(joint_model, output_model) + + + +copy_model_parameters(ASR_model, LID_model, joint_model, output_model) \ No newline at end of file diff --git a/egs/commonvoice/v1/run_025_train_film_asr_lid.sh b/egs/commonvoice/v1/run_025_train_film_asr_lid.sh index 8b213cfe..f5976ee1 100755 --- a/egs/commonvoice/v1/run_025_train_film_asr_lid.sh +++ b/egs/commonvoice/v1/run_025_train_film_asr_lid.sh @@ -20,7 +20,7 @@ set -e stage=1 -ngpu=4 +ngpu=2 config_file=default_config.sh interactive=false num_workers="" @@ -61,13 +61,13 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2rnn_film_transducer_languageid.py 
$nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $train_dir/langs \ @@ -90,15 +90,15 @@ if [ $stage -le 2 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - finetune_wav2vec2transducer_languageid.py $nnet_type \ + finetune_wav2vec2rnn_film_transducer_languageid.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $train_dir/langs \ @@ -123,13 +123,13 @@ if [ $stage -le 3 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s3_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - finetune_wav2vec2transducer.py $nnet_type \ + finetune_wav2vec2rnn_film_transducer_languageid.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ diff --git a/hyperion/bin/finetune_wav2vec2transducer_languageid.py b/hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py similarity index 60% rename from hyperion/bin/finetune_wav2vec2transducer_languageid.py rename to hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py index 68d8dacf..22808dbd 100755 --- a/hyperion/bin/finetune_wav2vec2transducer_languageid.py +++ b/hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py @@ -9,7 +9,7 @@ import sys import time from pathlib import Path - +import gc import k2 import numpy as np import torch @@ -21,15 +21,23 @@ from hyperion.torch.metrics import CategoricalAccuracy from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, HFWav2Vec2RNNTransducer, - HFWav2Vec2RNNTransducerResnet1D) + HFWav2Vec2RNNFiLMTransducer, + HFWav2Vec2RNNTransducerResnet1D, + HFWav2Vec2RNNFiLMTransducerResnet1D) from hyperion.torch.trainers import TransducerLanguageIDTrainer as Trainer from hyperion.torch.utils import ddp from jsonargparse import (ActionConfigFile, ActionParser, 
ArgumentParser, namespace_to_dict) from torch.nn.utils.rnn import pad_sequence + +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + model_dict = { "hf_wav2vec2rnn_transducer_resnet1d": HFWav2Vec2RNNTransducerResnet1D, + "hf_wav2vec2rnn_film_transducer_resnet1d": HFWav2Vec2RNNFiLMTransducerResnet1D, } @@ -99,94 +107,63 @@ def init_data(partition, rank, num_gpus, **kwargs): data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_language_collate) return data_loader - -def check_update_parameters(joint_state_dict, new_joint_state_dict, rank): - unchanged_parameters = [] - changed_parameters = [] - unloaded_parameters = [] - for name, param in joint_state_dict.items(): - new_param = new_joint_state_dict[name].to(param.device) - if torch.all(torch.eq(param, new_param)): - unchanged_parameters.append(name) - else: - changed_parameters.append(name) - # logging - if rank == 0: - logging.info("Unchanged parameters: {}".format(unchanged_parameters)) - logging.info("Changed parameters: {}".format(changed_parameters)) - - -def remove_module_from_state_dict(state_dict): - new_state_dict = {} - for name, param in state_dict.items(): - if name.startswith("module."): - new_state_dict[name[len("module."):]] = param - else: - new_state_dict[name] = param - return new_state_dict - - -def copy_model_parameters(joint_model, wav2transducer_state_dict, wav2lid_state_dict, rank): - joint_state_dict = joint_model.state_dict() - wav2transducer_state_dict = remove_module_from_state_dict(wav2transducer_state_dict) - wav2lid_state_dict = remove_module_from_state_dict(wav2lid_state_dict) - - - hf_feats_update_state_dict = {name: param for name, param in wav2transducer_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "hf_feats" in name} - transducer_update_state_dict = {name: param for name, param in wav2transducer_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "transducer" in name} - languageid_update_state_dict = {name: param for name, param in wav2lid_state_dict.items() if name in joint_state_dict and param.shape == joint_state_dict[name].shape and "languageid" in name} - - new_joint_state_dict = joint_state_dict.copy() - new_joint_state_dict.update(hf_feats_update_state_dict) - new_joint_state_dict.update(transducer_update_state_dict) - new_joint_state_dict.update(languageid_update_state_dict) - - new_joint_state_dict["transducer_fuser"] = wav2transducer_state_dict["feat_fuser"] - new_joint_state_dict["languageid_fuser"] = wav2lid_state_dict["feat_fuser"] - - - check_update_parameters(joint_state_dict, new_joint_state_dict, rank) - joint_model.load_state_dict(new_joint_state_dict) - -def init_model(in_model_transducer, in_model_lid, rank, model_class, **kwargs): - # load pretrained models - model_wav2transducer = torch.load(in_model_transducer) - model_wav2lid = torch.load(in_model_lid) - if rank == 0: - logging.info("init joint model") - logging.info("hf_feats network ft args={}".format(model_wav2transducer["model_cfg"]["hf_feats"])) - logging.info("transducer network ft args={}".format(model_wav2transducer["model_cfg"]["transducer"])) - logging.info("languageid network ft args={}".format(model_wav2lid["model_cfg"]["languageid"])) - logging.info("feat_fusion_start={}".format(model_wav2transducer["model_cfg"]["feat_fusion_start"])) - 
logging.info("feat_fusion_method_transducer={}".format(model_wav2transducer["model_cfg"]["feat_fusion_method"])) - logging.info("feat_fusion_method_languageid={}".format(model_wav2lid["model_cfg"]["feat_fusion_method"])) - - # init joint model - model = model_class(hf_feats=model_wav2transducer["model_cfg"]["hf_feats"], - transducer=model_wav2transducer["model_cfg"]["transducer"], - languageid=model_wav2lid["model_cfg"]["languageid"], - feat_fusion_start=model_wav2transducer["model_cfg"]["feat_fusion_start"], - feat_fusion_method_transducer=model_wav2transducer["model_cfg"]["feat_fusion_method"], - feat_fusion_method_languageid=model_wav2lid["model_cfg"]["feat_fusion_method"], - loss_weight_transducer=kwargs["model"]["loss_weight_transducer"], - loss_weight_lid=kwargs["model"]["loss_weight_lid"], - lid_length=kwargs["model"]["lid_length"], - ) - - copy_model_parameters(model, model_wav2transducer["model_state_dict"], model_wav2lid["model_state_dict"], rank) - - - # add finetune args +def init_model(num_classes, loss_class_weight, in_model_file, rank, model_class, **kwargs): model_args = model_class.filter_finetune_args(**kwargs["model"]) - # model_args = model_class.filter_args(**kwargs["model"]) if rank == 0: logging.info("model network ft args={}".format(model_args)) - model_args["languageid"]["num_classes"] = model_wav2lid["model_cfg"]["languageid"]["num_classes"] + model_args["languageid"]["num_classes"] = num_classes + # model_args["loss_class_weight"] = loss_class_weight + model = TML.load(in_model_file) + logging.info(model_args) model.change_config(**model_args) if rank == 0: logging.info("model={}".format(model)) return model +# def init_model(in_model_transducer, in_model_lid, rank, model_class, **kwargs): +# # load pretrained models +# model_wav2transducer = torch.load(in_model_transducer) +# model_wav2lid = torch.load(in_model_lid) +# if rank == 0: +# logging.info("init joint model") +# logging.info("hf_feats network ft args={}".format(model_wav2transducer["model_cfg"]["hf_feats"])) +# logging.info("transducer network ft args={}".format(model_wav2transducer["model_cfg"]["transducer"])) +# logging.info("languageid network ft args={}".format(model_wav2lid["model_cfg"]["languageid"])) +# logging.info("feat_fusion_start={}".format(model_wav2transducer["model_cfg"]["feat_fusion_start"])) +# logging.info("feat_fusion_method_transducer={}".format(model_wav2transducer["model_cfg"]["feat_fusion_method"])) +# logging.info("feat_fusion_method_languageid={}".format(model_wav2lid["model_cfg"]["feat_fusion_method"])) + +# # init joint model +# model = model_class(hf_feats=model_wav2transducer["model_cfg"]["hf_feats"], +# transducer=model_wav2transducer["model_cfg"]["transducer"], +# languageid=model_wav2lid["model_cfg"]["languageid"], +# feat_fusion_start=model_wav2transducer["model_cfg"]["feat_fusion_start"], +# feat_fusion_method_transducer=model_wav2transducer["model_cfg"]["feat_fusion_method"], +# feat_fusion_method_languageid=model_wav2lid["model_cfg"]["feat_fusion_method"], +# loss_weight_transducer=kwargs["model"]["loss_weight_transducer"], +# loss_weight_lid=kwargs["model"]["loss_weight_lid"], +# lid_length=kwargs["model"]["lid_length"], +# ) + +# copy_model_parameters(model, model_wav2transducer["model_state_dict"], model_wav2lid["model_state_dict"], rank) + + +# # add finetune args +# model_args = model_class.filter_finetune_args(**kwargs["model"]) + +# # model_args = model_class.filter_args(**kwargs["model"]) +# if rank == 0: +# logging.info("model network ft 
args={}".format(model_args)) +# model_args["languageid"]["num_classes"] = model_wav2lid["model_cfg"]["languageid"]["num_classes"] +# model.change_config(**model_args) +# if rank == 0: +# logging.info("model={}".format(model)) + +# model_wav2transducer = None +# model_wav2lid = None +# gc.collect() +# torch.cuda.empty_cache() +# return model def train_model(gpu_id, args): @@ -199,19 +176,22 @@ def train_model(gpu_id, args): torch.manual_seed(args.seed) set_float_cpu("float32") - # ddp_args = ddp.filter_ddp_args(**kwargs) - # device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - # kwargs["rank"] = rank + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank - # for Debug - rank = 0 - kwargs["rank"] = 0 - device = torch.device("cuda:0") - world_size=1 + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(**kwargs) + # model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + model = init_model(list(train_loader.dataset.num_classes.values())[0], + train_loader.batch_sampler.class_info["weights"], + **kwargs) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: @@ -280,8 +260,10 @@ def make_parser(model_class): ) - parser.add_argument("--in-model-transducer", required=True) - parser.add_argument("--in-model-lid", required=True) + # parser.add_argument("--in-model-transducer", required=True) + # parser.add_argument("--in-model-lid", required=True) + parser.add_argument("--in-model-file", required=True) + model_class.add_finetune_args(parser, prefix="model") # model_class.add_class_args(parser, prefix="model") Trainer.add_class_args( diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 2e354031..0d1cf332 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -112,7 +112,7 @@ def _load_legacy_durations(self, time_durs_file): time_durs = SegmentSet.load(time_durs_file) self.seg_set["duration"] = time_durs.loc[ self.seg_set["id"] - ].class_id.values.astype(np.float, copy=False) + ].class_id.values.astype(float, copy=False) def _load_bpe_model(self, bpe_model, is_val): if self.rank == 0: diff --git a/hyperion/torch/layer_blocks/film_blocks.py b/hyperion/torch/layer_blocks/film_blocks.py index 00ee1a10..0a1a1c00 100644 --- a/hyperion/torch/layer_blocks/film_blocks.py +++ b/hyperion/torch/layer_blocks/film_blocks.py @@ -34,7 +34,7 @@ def forward(self, x, lang_condition): class RNNWithFiLM(nn.Module): - def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm", film_type="tanh"): + def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, batch_first=True, rnn_type="lstm", film_type="tanh", film_cond_type="one-hot"): super(RNNWithFiLM, self).__init__() self.input_size = input_size self.hidden_size = hidden_size @@ -43,12 +43,18 @@ def __init__(self, input_size, hidden_size, num_layers, dropout, condition_size, self.batch_first = batch_first self.rnn_type = rnn_type self.film_type = film_type + self.film_cond_type = film_cond_type + if self.rnn_type == "lstm": self.lstms = nn.ModuleList([nn.LSTM(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) elif self.rnn_type == "gru": self.grus = 
nn.ModuleList([nn.GRU(input_size if i==0 else hidden_size, hidden_size, 1, batch_first=batch_first) for i in range(num_layers)]) - self.films = nn.ModuleList([FiLM(hidden_size, condition_size, film_type) for _ in range(num_layers)]) + if self.film_cond_type == "one-hot": + self.films = nn.ModuleList([FiLM(hidden_size, condition_size, film_type) for _ in range(num_layers)]) + else: + self.films = nn.ModuleList([FiLM(hidden_size, condition_size, film_type) for _ in range(num_layers)]) + self.lid_films = nn.ModuleList([FiLM(hidden_size, condition_size, film_type) for _ in range(num_layers)]) self.dropout_layer = nn.Dropout(dropout) @@ -59,8 +65,13 @@ def forward(self, x, states, lang_condition): rnns = self.lstms elif self.rnn_type == "gru": rnns = self.grus + + if self.film_cond_type == "one-hot": + films = self.films + else: + films = self.lid_films - for i, (rnn, film) in enumerate(zip(rnns, self.films)): + for i, (rnn, film) in enumerate(zip(rnns, films)): if states: x, (h_i, c_i) = rnn(x, (states[0][i].unsqueeze(0), states[1][i].unsqueeze(0))) else: diff --git a/hyperion/torch/layer_blocks/transducer_film_joiner.py b/hyperion/torch/layer_blocks/transducer_film_joiner.py index 02a9dfdf..2c6d8d48 100644 --- a/hyperion/torch/layer_blocks/transducer_film_joiner.py +++ b/hyperion/torch/layer_blocks/transducer_film_joiner.py @@ -21,7 +21,7 @@ class TransducerFiLMJoiner(nn.Module): vocab_size: vocabulary size """ - def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: int, condition_size: int, film_type: str = "linear"): + def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: int, condition_size: int, film_type: str = "linear", film_cond_type="one-hot"): super().__init__() self.enc_feats = enc_feats @@ -32,7 +32,18 @@ def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int, vocab_size: self.enc_proj = nn.Linear(enc_feats, hid_feats) self.pred_proj = nn.Linear(pred_feats, hid_feats) self.output = nn.Linear(hid_feats, vocab_size) - self.film = FiLM(hid_feats, condition_size, film_type) + + self.film_cond_type = film_cond_type + + + if self.film_cond_type == "one-hot": + self.film = FiLM(hid_feats, condition_size, film_type) + else: + self.film = FiLM(hid_feats, condition_size, film_type) + self.lid_film = FiLM(hid_feats, condition_size, film_type) + + # self.film = FiLM(hid_feats, condition_size, film_type) + def get_config(self): config = { @@ -69,8 +80,11 @@ def forward(self, else: x = enc_out + pred_out - x = self.film(x, lang_condition) - + if self.film_cond_type == "one-hot": + x = self.film(x, lang_condition) + else: + x = self.lid_film(x, lang_condition) + x = torch.tanh(x) logits = self.output(x) return logits diff --git a/hyperion/torch/layer_blocks/transducer_film_predictor.py b/hyperion/torch/layer_blocks/transducer_film_predictor.py index dc7a7ae4..42272051 100644 --- a/hyperion/torch/layer_blocks/transducer_film_predictor.py +++ b/hyperion/torch/layer_blocks/transducer_film_predictor.py @@ -39,6 +39,7 @@ def __init__(self, rnn_dropout_rate: float = 0.0, rnn_type: str = "lstm", film_type: str = "linear", + film_cond_type: str = "one-hot", blank_id: int = 0): super().__init__() self.embedding = nn.Embedding( @@ -56,7 +57,8 @@ def __init__(self, condition_size=condition_size, batch_first=True, rnn_type=rnn_type, - film_type=film_type + film_type=film_type, + film_cond_type=film_cond_type ) elif rnn_type in ["lstm_residual","gru_residual"]: self.rnn = RNNWithFiLMResidual( @@ -67,7 +69,8 @@ def __init__(self, 
condition_size=condition_size, batch_first=True, rnn_type=rnn_type, - film_type=film_type + film_type=film_type, + film_cond_type=film_cond_type ) else: raise Exception(f"Unknown RNN type {rnn_type}") @@ -101,6 +104,7 @@ def get_config(self): "rnn_dropout_rate": self.rnn_dropout_rate, "rnn_type": self.rnn_type, "film_type": self.film_type, + "film_cond_type": self.film_cond_type, "blank_id": self.blank_id, } return config diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py index 8e29bc84..0322543d 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py @@ -46,6 +46,7 @@ def __init__(self, loss_class_weight_exp= 1.0, loss_weight_transducer: float = 0.005, loss_weight_lid: float = 1.0, + loss_weight_embed: float = 0.005, lid_length: float = 3.0, ): @@ -95,24 +96,28 @@ def __init__(self, self.loss_weight_transducer = loss_weight_transducer self.loss_weight_lid = loss_weight_lid + self.loss_weight_embed = loss_weight_embed self.lid_length = lid_length self._hf_context = contextlib.nullcontext() - self.transducer_fuser, self.films = self._make_fuser(self.feat_fusion_method_transducer, self.feat_fusion_start_transducer) - self.languageid_fuser, _ = self._make_fuser(self.feat_fusion_method_lid, self.feat_fusion_start_lid) + self.transducer_fuser, self.film, self.lid_film = self._make_fuser(self.feat_fusion_method_transducer, self.feat_fusion_start_transducer) + self.languageid_fuser, _, _ = self._make_fuser(self.feat_fusion_method_lid, self.feat_fusion_start_lid) def _make_fuser(self, method, start): feat_fuser = None - films = None + film = None + lid_film = None if method == "last": - return feat_fuser, films + return feat_fuser, None, None num_layers = self.hf_feats.num_encoder_layers + 1 - start layer_dim = self.hf_feats.hidden_size if method == "film-weighted-avg": - films = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size) for _ in range(num_layers)]) + film = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size) for _ in range(num_layers)]) + lid_film = nn.ModuleList([FiLM(layer_dim, self.transducer.decoder.condition_size) for _ in range(num_layers)]) feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif method == "film-fused-feature": feat_fuser = nn.Parameter(torch.zeros(num_layers)) film = FiLM(layer_dim, self.transducer.decoder.condition_size) + lid_film = FiLM(layer_dim, self.transducer.decoder.condition_size) elif method == "weighted-avg": feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif method == "linear": @@ -124,9 +129,9 @@ def _make_fuser(self, method, start): layer_dim, bias=False) - return feat_fuser, films + return feat_fuser, film, lid_film - def _fuse_transducer_hid_feats(self, hid_feats, lang): + def _fuse_transducer_hid_feats(self, hid_feats, lang_condition): """Fuses the hidden features from the Wav2Vec model. 
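When conditioning on the predicted LID embedding (film_cond_type other than one-hot), the model keeps two FiLM branches at each conditioning point: film, driven by the embedding of the ground-truth language id, and lid_film, driven by the embedding predicted by the LID branch; compute_embed_loss below ties their affine parameters together. A minimal sketch of the FiLM operation and of that consistency term (an illustration only, not the hyperion FiLM class, which also takes a film_type activation):

    import torch
    import torch.nn as nn

    class FiLMSketch(nn.Module):
        def __init__(self, feat_dim, cond_dim):
            super().__init__()
            self.linear_scale = nn.Linear(cond_dim, feat_dim)
            self.linear_shift = nn.Linear(cond_dim, feat_dim)

        def forward(self, x, cond):
            # x: (batch, time, feat_dim), cond: (batch, cond_dim)
            scale = self.linear_scale(cond).unsqueeze(1)
            shift = self.linear_shift(cond).unsqueeze(1)
            return scale * x + shift

    film, lid_film = FiLMSketch(512, 128), FiLMSketch(512, 128)
    lang_cond = torch.randn(4, 128)  # embedding of the true language label
    lid_embed = torch.randn(4, 128)  # embedding predicted by the LID branch
    # L1 consistency between the two sets of generated FiLM parameters,
    # mirroring compute_embed_loss below
    loss_embed = (
        (film.linear_scale(lang_cond) - lid_film.linear_scale(lid_embed)).abs().mean()
        + (film.linear_shift(lang_cond) - lid_film.linear_shift(lid_embed)).abs().mean()
    )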
Args: @@ -141,10 +146,10 @@ def _fuse_transducer_hid_feats(self, hid_feats, lang): return hid_feats[0] if self.transducer.decoder.film_cond_type in ["one-hot", "lid_pred"]: - lang_condition = self.transducer.decoder.lang_embedding(lang) + lang_condition = self.transducer.decoder.lang_embedding(lang_condition) hid_feats = hid_feats[self.feat_fusion_start_transducer:] if self.feat_fusion_method_transducer == "film-weighted-avg": - film_hid_feats = tuple(self.films[i](hid_feats[i], lang_condition) for i in range(len(self.films))) + film_hid_feats = tuple(self.lid_film[i](hid_feats[i], lang_condition) for i in range(len(self.lid_film))) film_hid_feats = torch.stack(film_hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) feats = torch.sum(film_hid_feats * norm_weights, dim=-1) @@ -152,7 +157,7 @@ def _fuse_transducer_hid_feats(self, hid_feats, lang): hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) feats = torch.sum(hid_feats * norm_weights, dim=-1) - feats = self.films(feats, lang_condition) + feats = self.lid_film(feats, lang_condition) elif self.feat_fusion_method_transducer == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) @@ -223,6 +228,39 @@ def forward_lid_feats(self, return feats, hid_feats, feat_lengths + def compute_embed_loss(self, lang_embed, languageid): + # compute the loss for the embedding between the film and lid_film + lang_condition = self.transducer.decoder.lang_embedding(languageid) + + # for the encoder + film_scale = self.film.linear_scale(lang_condition) + lid_film_scale = self.lid_film.linear_scale(lang_embed) + film_shift = self.film.linear_shift(lang_condition) + lid_film_shift = self.lid_film.linear_shift(lang_embed) + loss_embed_encode = torch.mean(torch.abs(film_scale - lid_film_scale)) + torch.mean(torch.abs(film_shift - lid_film_shift)) + + # for the predictor + loss_embed_predictor = 0 + for i in range(2): + film_scale = self.transducer.decoder.predictor.rnn.films[i].linear_scale(lang_condition) + lid_film_scale = self.transducer.decoder.predictor.rnn.lid_films[i].linear_scale(lang_embed) + film_shift = self.transducer.decoder.predictor.rnn.films[i].linear_shift(lang_condition) + lid_film_shift = self.transducer.decoder.predictor.rnn.lid_films[i].linear_shift(lang_embed) + loss_embed_predictor += torch.mean(torch.abs(film_scale - lid_film_scale)) + torch.mean(torch.abs(film_shift - lid_film_shift)) + + + # for the joiner + film_scale = self.transducer.decoder.joiner.film.linear_scale(lang_condition) + lid_film_scale = self.transducer.decoder.joiner.lid_film.linear_scale(lang_embed) + film_shift = self.transducer.decoder.joiner.film.linear_shift(lang_condition) + lid_film_shift = self.transducer.decoder.joiner.lid_film.linear_shift(lang_embed) + loss_embed_joiner = torch.mean(torch.abs(film_scale - lid_film_scale)) + torch.mean(torch.abs(film_shift - lid_film_shift)) + + + loss_embed = loss_embed_encode + loss_embed_predictor + loss_embed_joiner + + return loss_embed + def forward( self, x, @@ -275,15 +313,19 @@ def forward( #loss_lid = self.loss_lid(lid_logits, languageid) loss_lid = self.loss_lid(output["logits"], languageid) + # import pdb; pdb.set_trace() + # logging.info(output["h_classif"]) + + loss_embed = self.compute_embed_loss(output["h_classif"][0], languageid) # feats_transducer = self._fuse_transducer_hid_feats(hid_feats, lid_logits) # (N, T, C) - feats_transducer = 
self._fuse_transducer_hid_feats(hid_feats, output["h_classif"]) # (N, T, C) + feats_transducer = self._fuse_transducer_hid_feats(hid_feats, output["h_classif"][0]) # (N, T, C) trans_output = self.transducer( feats_transducer, feat_lengths, text, - output["h_classif"] + output["h_classif"][0] # lid_logits ) @@ -292,9 +334,10 @@ def forward( f.transpose(1, 2) for i, f in enumerate(hid_feats) if i in return_feat_layers ] - output = RNNTransducerLanguageIDOutput(loss=self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid, + output = RNNTransducerLanguageIDOutput(loss=self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid + self.loss_weight_embed * loss_embed, loss_transducer=trans_output.loss, loss_lid=loss_lid, + loss_embed=loss_embed, loss_transducer_simple=trans_output.loss_simple, loss_transducer_pruned=trans_output.loss_pruned, h_feats=trans_output.h_feats, @@ -347,9 +390,9 @@ def infer(self, return text, lid - def unfreeze_film(self): + def unfreeze_lid_film(self): for name, param in self.named_parameters(): - if "film" in name: + if "lid_film" in name: logging.info(f"unfreezing {name}") param.requires_grad = True @@ -380,11 +423,13 @@ def set_train_mode(self, mode): self.freeze() elif mode in ["ft-film", "ft-film-grad"]: self.freeze() - self.unfreeze_film() + self.unfreeze_lid_film() elif mode in ["ft-transducer", "ft-transducer-nograd"]: self.unfreeze() self.freeze_hf_feats() self.freeze_feat_fuser() + self.freeze_film() + self.unfreeze_lid_film() elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: self.unfreeze() self.freeze_hf_feats() @@ -451,6 +496,7 @@ def filter_args(**kwargs): "loss_class_weight_exp", "loss_weight_transducer", "loss_weight_lid", + "loss_weight_embed", "languageid", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) @@ -476,6 +522,7 @@ def get_config(self): "loss_class_weight_exp": self.loss_class_weight_exp, "loss_weight_transducer": self.loss_weight_transducer, "loss_weight_lid": self.loss_weight_lid, + "loss_weight_embed": self.loss_weight_embed, "lid_length": self.lid_length, } @@ -565,6 +612,15 @@ def add_class_args(parser, prefix=None, skip=set()): """, ) + parser.add_argument( + "--loss-weight-embed", + default=0.005, + type=float, + help=""" + The weight of the embedding loss + """, + ) + parser.add_argument( "--lid-length", default=3.0, diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index 8c7d54d7..d8374e77 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -22,6 +22,7 @@ class RNNTransducerLanguageIDOutput(HypDataClass): loss: torch.Tensor # Total loss loss_transducer: torch.Tensor # Loss from the transducer loss_lid: torch.Tensor # Loss from the language ID + loss_embed: Optional[torch.Tensor] = None # Loss from the embedding loss_transducer_simple: Optional[torch.Tensor] = None # Simple loss from the transducer, if available loss_transducer_pruned: Optional[torch.Tensor] = None # Pruned loss from the transducer, if available h_feats: Optional[List[torch.Tensor]] = None # Hidden features, if available diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py index 
e012f17a..4215ea1d 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py @@ -47,6 +47,7 @@ def __init__( loss_class_weight_exp: float = 1.0, loss_weight_transducer: float = 0.005, loss_weight_lid: float = 1.0, + loss_weight_embed: float = 0.005, lid_length: float = 3.0, ): @@ -81,6 +82,7 @@ def __init__( loss_class_weight_exp=loss_class_weight_exp, loss_weight_transducer=loss_weight_transducer, loss_weight_lid=loss_weight_lid, + loss_weight_embed=loss_weight_embed, lid_length=lid_length) @@ -117,8 +119,11 @@ def filter_finetune_args(**kwargs): base_args = {} valid_args = ( + "loss_lid_type", + "loss_class_weight_exp", "loss_weight_transducer", "loss_weight_lid", + "loss_weight_embed", "lid_length", ) child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) @@ -134,6 +139,22 @@ def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") + parser.add_argument( + "--loss-lid-type", + default="weightedCE", + type=str, + help=""" + The type of the loss for language id + """, + ) + parser.add_argument( + "--loss-class-weight-exp", + default=1.0, + type=float, + help=""" + The exponent of the class weight for language id + """, + ) parser.add_argument( "--loss-weight-transducer", @@ -153,6 +174,15 @@ def add_finetune_args(parser, prefix=None): """, ) + parser.add_argument( + "--loss-weight-embed", + default=0.005, + type=float, + help=""" + The weight of the embedding loss + """, + ) + parser.add_argument( "--lid-length", default=3.0, diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index f2cfad35..9d030ae7 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -78,8 +78,8 @@ def __init__( pruned_warmup_steps: int = 2000, langs_size: int = 13, condition_size: int = 64, - film_cond_type: str = "one-hot", film_type: str = "linear", + film_cond_type: str = "one-hot", ): super().__init__() @@ -105,13 +105,10 @@ def __init__( self._make_predictor() self._make_joiner() # make embedding layer for language id - if self.film_cond_type == "one-hot": - self.lang_embedding = nn.Embedding(langs_size, condition_size) - elif self.film_cond_type == "lid_pred": - self.lang_embedding = nn.Linear(langs_size, condition_size) - elif self.film_cond_type == "lid_pred_embed": - # self.lang_embedding = nn.Linear(langs_size, condition_size) - pass + self.lang_embedding = nn.Embedding(langs_size, condition_size) + if self.film_cond_type == "lid_pred": + self.lid_lang_embedding = nn.Linear(langs_size, condition_size) + if self.rnnt_loss == "k2_pruned": self.simple_am_proj = nn.Linear(in_feats, vocab_size) self.simple_lm_proj = nn.Linear(self.predictor.out_feats, @@ -129,7 +126,7 @@ def _make_predictor(self): if pred_type == "rnn": pred_args = filter_func_args(RNNPredictor.__init__, self.predictor_args) - self.predictor = RNNPredictor(**pred_args) + self.predictor = RNNPredictor(**pred_args, film_type=self.film_type, film_cond_type=self.film_cond_type) # elif pred_type == "conv": # pred_args = filter_func_args(ConvPredictor.__init__, # self.predictor_args) @@ -145,7 +142,7 @@ def _make_joiner(self): pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] self.joiner = FiLMJoiner(self.in_feats, pred_feats, hid_feats, - 
self.vocab_size, self.condition_size, self.film_type) + self.vocab_size, self.condition_size, film_type=self.film_type, film_cond_type=self.film_cond_type) elif joiner_type == "original_joiner": pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] @@ -309,11 +306,13 @@ def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, return loss, loss_simple, loss_pruned def forward( - self, x: torch.Tensor, x_lengths: torch.Tensor, y: k2.RaggedTensor, lang: torch.Tensor + self, x: torch.Tensor, x_lengths: torch.Tensor, y: k2.RaggedTensor, lang_embedding: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: # embed lang - if self.film_cond_type in ["one-hot", "lid_pred"]: - lang_embedding = self.lang_embedding(lang) + if self.film_cond_type == ["one-hot"]: + lang_embedding = self.lang_embedding(lang_embedding) + elif self.film_cond_type == ["lid_pred"]: + lang_embedding = self.lid_lang_embedding(lang_embedding) # get y_lengths row_splits = y.shape.row_splits(1) y_lengths = row_splits[1:] - row_splits[:-1] @@ -346,8 +345,13 @@ def decode(self, max_sym_per_utt: int = 1000, ) -> List[int]: # embed lang - if self.film_cond_type in ["one-hot", "lid_pred"]: + # if self.film_cond_type in ["one-hot", "lid_pred"]: + # lang_embedding = self.lang_embedding(lang) + + if self.film_cond_type == ["one-hot"]: lang_embedding = self.lang_embedding(lang) + elif self.film_cond_type == ["lid_pred"]: + lang_embedding = self.lid_lang_embedding(lang) if method == "time_sync_beam_search": return self.decode_time_sync_beam_search(x, lang_embedding, @@ -730,10 +734,7 @@ def add_pred_args(parser): help= """type of recurrent network for thep predictor in [lstm, gru]""") - pred_parser.add_argument("--film-type", - default="linear", - choices=["linear", "tanh"], - help=("type of the FiLM layer")) + pred_parser.add_argument("--num-layers", From 28e61e3a8998df94e9ecc4cfcff66356f6d149c1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 2 Jul 2023 07:54:52 +0000 Subject: [PATCH 65/89] update lid-film-asr training --- ...ucer_ecapadnn512x3_1layer_stage1_v2.0.yaml | 1 + ...ucer_ecapadnn512x3_1layer_stage1_v2.2.yaml | 140 ++++++++++++++++++ ...ucer_ecapadnn512x3_1layer_stage1_v2.3.yaml | 140 ++++++++++++++++++ ...ucer_ecapadnn512x3_1layer_stage2_v2.1.yaml | 9 +- ...ucer_ecapadnn512x3_1layer_stage2_v2.2.yaml | 94 ++++++++++++ ...ucer_ecapadnn512x3_1layer_stage3_v2.1.yaml | 92 ++++++++++++ ...uned_filmed_transducer_lid_v2.1_13langs.sh | 7 +- ...uned_filmed_transducer_lid_v2.2_13langs.sh | 42 ++++++ .../models/transducer/rnn_film_transducer.py | 13 ++ .../hf_wav2rnn_film_transducer_languageid.py | 90 +++++++++-- ..._wav2vec2rnn_film_transducer_languageid.py | 32 +++- .../hf_wav2vec2rnn_transducer_languageid.py | 4 +- 12 files changed, 639 insertions(+), 25 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.3.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.1.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.2_13langs.sh diff --git 
a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml index 0931c052..6c06c29b 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.0.yaml @@ -108,6 +108,7 @@ model: loss_weight_transducer: 1.0 loss_weight_lid: 0.0 + loss_weight_embed: 0.05 lid_length: 3.0 feat_fusion_method_transducer: film-fused-feature diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2.yaml new file mode 100644 index 00000000..7347e8b4 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2.yaml @@ -0,0 +1,140 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + film_cond_type: lid_pred_embed + reduction: mean + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. 
+ dropout_rate: 0.3 + hid_act: swish + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + loss_weight_embed: 10 + lid_length: 3.0 + + feat_fusion_method_transducer: film-fused-feature + feat_fusion_method_lid: weighted-avg + feat_fusion_start_transducer: 2 + feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.3.yaml new file mode 100644 index 00000000..f7a430a7 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.3.yaml @@ -0,0 +1,140 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + film_cond_type: lid_pred_embed + reduction: mean + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + languageid: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + hid_act: swish + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: arc-softmax + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. 
+ dropout_rate: 0.3 + hid_act: swish + + loss_lid_type: weightedCE + loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + loss_weight_embed: 20 + lid_length: 3.0 + + feat_fusion_method_transducer: film-fused-feature + feat_fusion_method_lid: weighted-avg + feat_fusion_start_transducer: 2 + feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-film + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml index 377ea296..716a9d8f 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml @@ -49,18 +49,19 @@ data: model: transducer: decoder: + reduction: mean prune_range: 15 override_dropouts: false languageid: cos_scale: 32.0 - loss_lid_type: weightedCE - loss_class_weight_exp: 1.0 # 0~1 + # loss_lid_type: weightedCE + # loss_class_weight_exp: 1.0 # 0~1 loss_weight_transducer: 1.0 loss_weight_lid: 10.0 loss_weight_embed: 10 - lid_length: 3.0 + # lid_length: 3.0 # feat_fusion_method_transducer: film-fused-feature # feat_fusion_method_lid: weighted-avg @@ -87,5 +88,5 @@ trainer: epochs: 120 # eff_batch_size: 1024 eff_batch_size: 128 - train_mode: ft-film + train_mode: freeze-gt-film \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml new file mode 100644 index 00000000..2f625da0 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml @@ -0,0 +1,94 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + transducer: + decoder: + reduction: mean + prune_range: 15 + override_dropouts: false + languageid: + cos_scale: 32.0 + + # loss_lid_type: weightedCE + # loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + loss_weight_embed: 10 + loss_reg_weight_transducer: 0.5 + loss_reg_weight_lid: 0.0 + # lid_length: 3.0 + + # feat_fusion_method_transducer: film-fused-feature + # feat_fusion_method_lid: weighted-avg + # feat_fusion_start_transducer: 2 + # feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-transducer + \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.1.yaml new file mode 100644 index 00000000..a7be4925 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.1.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.05 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20. + max_audio_length: 15. 
+ min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + transducer: + decoder: + reduction: mean + prune_range: 15 + override_dropouts: false + languageid: + cos_scale: 32.0 + + # loss_lid_type: weightedCE + # loss_class_weight_exp: 1.0 # 0~1 + + loss_weight_transducer: 1.0 + loss_weight_lid: 0.0 + loss_weight_embed: 10 + # lid_length: 3.0 + + # feat_fusion_method_transducer: film-fused-feature + # feat_fusion_method_lid: weighted-avg + # feat_fusion_start_transducer: 2 + # feat_fusion_start_lid: 2 + +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 12000 + hold_steps: 10000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: ft-transducer + \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh index b0e39914..d5d72490 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.1_13langs.sh @@ -27,7 +27,7 @@ nnet_s1_args="" nnet_name=${hf_model_name}_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.1_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0009.pth +nnet_s1=$nnet_s1_dir/model_ep0015.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml nnet_s2_args="" @@ -35,9 +35,8 @@ nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name nnet_s2=$nnet_s2_dir/model_ep0020.pth -nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v2.1.yaml +nnet_s3_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.1.yaml nnet_s3_args="" nnet_s3_name=${nnet_name}.s3 -nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3_dir=exp/transducer_resnet1d_nnets/$nnet_s3_name nnet_s3=$nnet_s3_dir/model_ep0002.pth -nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.2_13langs.sh new file mode 100644 index 00000000..f4ccf18e --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v2.2_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# 
bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_film_transducer_resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v2.2_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0006.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage3_v2.2.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth diff --git a/hyperion/torch/models/transducer/rnn_film_transducer.py b/hyperion/torch/models/transducer/rnn_film_transducer.py index 68066442..6f82e101 100644 --- a/hyperion/torch/models/transducer/rnn_film_transducer.py +++ b/hyperion/torch/models/transducer/rnn_film_transducer.py @@ -193,6 +193,19 @@ def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + def get_regularization_loss(self): + reg_loss = 0.0 + total_params = 0 + + for param in self.parameters(): + reg_loss += torch.norm(param)**2 + total_params += torch.numel(param) + + reg_loss = (reg_loss) / total_params + + return reg_loss + def change_config( self, diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py index 0322543d..7daeddcb 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py @@ -47,6 +47,8 @@ def __init__(self, loss_weight_transducer: float = 0.005, loss_weight_lid: float = 1.0, loss_weight_embed: float = 0.005, + loss_reg_weight_transducer: float = 0.0, + loss_reg_weight_lid: float = 0.0, lid_length: float = 3.0, ): @@ -97,6 +99,8 @@ def __init__(self, self.loss_weight_transducer = loss_weight_transducer self.loss_weight_lid = loss_weight_lid self.loss_weight_embed = loss_weight_embed + self.loss_reg_weight_transducer = loss_reg_weight_transducer + self.loss_reg_weight_lid = loss_reg_weight_lid self.lid_length = lid_length self._hf_context = contextlib.nullcontext() self.transducer_fuser, self.film, self.lid_film = self._make_fuser(self.feat_fusion_method_transducer, self.feat_fusion_start_transducer) @@ -334,7 +338,18 @@ def forward( f.transpose(1, 2) for i, f in enumerate(hid_feats) if i in return_feat_layers ] - output = RNNTransducerLanguageIDOutput(loss=self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid + self.loss_weight_embed * loss_embed, + + loss_reg_lid = 0 + if self.loss_reg_weight_lid > 0: + loss_reg_lid = self.languageid.get_regularization_loss() + + loss_reg_transducer = 0 + if self.loss_reg_weight_transducer > 0: + loss_reg_transducer = self.transducer.get_regularization_loss() + + + + output = 
RNNTransducerLanguageIDOutput(loss=self.loss_weight_transducer * trans_output.loss + self.loss_weight_lid * loss_lid + self.loss_weight_embed * loss_embed + self.loss_reg_weight_lid * loss_reg_lid + self.loss_reg_weight_transducer * loss_reg_transducer, loss_transducer=trans_output.loss, loss_lid=loss_lid, loss_embed=loss_embed, @@ -396,16 +411,29 @@ def unfreeze_lid_film(self): logging.info(f"unfreezing {name}") param.requires_grad = True - # def freeze_feat_fuser(self): - # if self.feat_fuser is None: - # return + def freeze_lid(self): + self.languageid.freeze() - # if self.feat_fusion_method_transducer == "weighted-avg": - # self.feat_fuser.requires_grad = False - # return + def freeze_film(self): + for name, param in self.named_parameters(): + # logging.info(f"parameter {name}") + if "film" in name and "lid_film" not in name: + logging.info(f"freezing {name}") + param.requires_grad = False + if "lang_embedding" in name: + logging.info(f"freezing {name}") + param.requires_grad = False + + def freeze_lid_feat_fuser(self): + if self.languageid_fuser is None: + return - # for param in self.feat_fuser.parameters(): - # param.requires_grad = False + if self.feat_fusion_method_lid == "weighted-avg": + self.languageid_fuser.requires_grad = False + return + + for param in self.languageid_fuser.parameters(): + param.requires_grad = False def freeze_hf_feats(self): self.hf_feats.freeze() @@ -414,11 +442,16 @@ def freeze_hf_feature_encoder(self): self.hf_feats.freeze_feature_encoder() def set_train_mode(self, mode): + logging.info("setting train mode to %s", mode) + logging.info("train mode was %s", self._train_mode) if mode == self._train_mode: return if mode == "full": self.unfreeze() + if mode == "freeze-gt-film": + self.unfreeze() + self.freeze_film() elif mode == "frozen": self.freeze() elif mode in ["ft-film", "ft-film-grad"]: @@ -427,9 +460,10 @@ def set_train_mode(self, mode): elif mode in ["ft-transducer", "ft-transducer-nograd"]: self.unfreeze() self.freeze_hf_feats() - self.freeze_feat_fuser() self.freeze_film() - self.unfreeze_lid_film() + self.freeze_lid_feat_fuser() + self.freeze_lid() + # self.unfreeze_lid_film() elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: self.unfreeze() self.freeze_hf_feats() @@ -455,6 +489,7 @@ def _train(self, train_mode: str): super()._train(train_mode) elif train_mode in [ "ft-film", + "freeze-gt-film", "ft-transducer", "hf-feats-frozen", "ft-film-grad", @@ -472,6 +507,7 @@ def valid_train_modes(): return [ "full", "frozen", + "freeze-gt-film", "ft-film", "ft-embed-affine", "ft-transducer", @@ -497,6 +533,8 @@ def filter_args(**kwargs): "loss_weight_transducer", "loss_weight_lid", "loss_weight_embed", + "loss_reg_weight_transducer", + "loss_reg_weight_lid", "languageid", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) @@ -523,14 +561,24 @@ def get_config(self): "loss_weight_transducer": self.loss_weight_transducer, "loss_weight_lid": self.loss_weight_lid, "loss_weight_embed": self.loss_weight_embed, + "loss_reg_weight_transducer": self.loss_reg_weight_transducer, + "loss_reg_weight_lid": self.loss_reg_weight_lid, "lid_length": self.lid_length, } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - def change_config(self, hf_feats, transducer, languageid): - logging.info("changing hf wav2transducer config") + def change_config(self, loss_weight_transducer, loss_weight_lid, loss_weight_embed, loss_reg_weight_transducer, loss_reg_weight_lid, lid_length, hf_feats, transducer, 
languageid): + logging.info("changing hf wav2film_transducer_languageid config") + + self.loss_weight_transducer = loss_weight_transducer + self.loss_weight_lid = loss_weight_lid + self.loss_weight_embed = loss_weight_embed + self.lid_length = lid_length + self.loss_reg_weight_transducer = loss_reg_weight_transducer + self.loss_reg_weight_lid = loss_reg_weight_lid + self.hf_feats.change_config(**hf_feats) self.transducer.change_config(**transducer) self.languageid.change_config(**languageid) @@ -620,6 +668,22 @@ def add_class_args(parser, prefix=None, skip=set()): The weight of the embedding loss """, ) + parser.add_argument( + "--loss-reg-weight-transducer", + default=0.0, + type=float, + help=""" + The weight of the transducer regularization loss + """, + ) + parser.add_argument( + "--loss-reg-weight-lid", + default=0.0, + type=float, + help=""" + The weight of the lid regularization loss + """, + ) parser.add_argument( "--lid-length", diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py index 4215ea1d..cad64e99 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_film_transducer_languageid.py @@ -48,6 +48,8 @@ def __init__( loss_weight_transducer: float = 0.005, loss_weight_lid: float = 1.0, loss_weight_embed: float = 0.005, + loss_reg_weight_transducer: float = 0.0, + loss_reg_weight_lid: float = 0.0, lid_length: float = 3.0, ): @@ -82,6 +84,8 @@ def __init__( loss_class_weight_exp=loss_class_weight_exp, loss_weight_transducer=loss_weight_transducer, loss_weight_lid=loss_weight_lid, + loss_reg_weight_transducer=loss_reg_weight_transducer, + loss_reg_weight_lid=loss_reg_weight_lid, loss_weight_embed=loss_weight_embed, lid_length=lid_length) @@ -116,16 +120,20 @@ def add_class_args(parser, prefix=None): @staticmethod def filter_finetune_args(**kwargs): - base_args = {} valid_args = ( - "loss_lid_type", - "loss_class_weight_exp", + # "loss_lid_type", + # "loss_class_weight_exp", "loss_weight_transducer", "loss_weight_lid", "loss_weight_embed", + "loss_reg_weight_transducer", + "loss_reg_weight_lid", "lid_length", ) + + base_args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) base_args["hf_feats"] = child_args child_args = RNNFiLMTransducer.filter_finetune_args(**kwargs["transducer"]) @@ -183,6 +191,24 @@ def add_finetune_args(parser, prefix=None): """, ) + parser.add_argument( + "--loss-reg-weight-transducer", + default=0.0, + type=float, + help=""" + The weight of the transducer regularization loss + """, + ) + + parser.add_argument( + "--loss-reg-weight-lid", + default=0.0, + type=float, + help=""" + The weight of the lid regularization loss + """, + ) + parser.add_argument( "--lid-length", default=3.0, diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py index c8cd974b..4a8ca173 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py @@ -99,13 +99,15 @@ def add_class_args(parser, prefix=None): @staticmethod def filter_finetune_args(**kwargs): - base_args = {} valid_args 
= ( "loss_weight_transducer", "loss_weight_lid", "lid_length", ) + + base_args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) base_args["hf_feats"] = child_args child_args = RNNTransducer.filter_finetune_args(**kwargs["transducer"]) From b387dddad98594340d29b5a6db24e0a25198b617 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Sun, 2 Jul 2023 14:02:20 -0400 Subject: [PATCH 66/89] update config --- .../v1/global_conf/config_lid_v2.2_13langs.sh | 6 +-- .../global_conf/config_lid_v4.0_13langs_v3.sh | 2 +- .../v1/global_conf/config_lid_v6.0_13langs.sh | 4 +- .../v1/global_conf/config_lid_v6.2_13langs.sh | 4 +- .../v1/global_conf/config_lid_v6.3_13langs.sh | 4 +- .../v1/global_conf/config_lid_v6.4_13langs.sh | 4 +- ...uned_filmed_transducer_lid_v1.0_13langs.sh | 6 +-- ...g_pruned_filmed_transducer_v2.0_13langs.sh | 2 +- ...pruned_filmed_transducer_v4.2.1_13langs.sh | 8 ++-- ...g_pruned_filmed_transducer_v5.1_13langs.sh | 4 +- ...g_pruned_filmed_transducer_v6.0_13langs.sh | 11 +++-- ...nfig_pruned_transducer_lid_v1.0_13langs.sh | 2 +- .../config_pruned_transducer_v4.0_13langs.sh | 6 +-- .../global_conf/config_transducer_v3.3_it.sh | 2 +- egs/commonvoice/v1/run_004_compute_bpe.sh | 42 +++++++++---------- egs/commonvoice/v1/run_030_inference.sh | 2 + egs/commonvoice/v1/run_031_inference_film.sh | 2 + egs/commonvoice/v1/run_032_identificate.sh | 2 + .../identificate_wav2vec2resnet1d.sh | 5 ++- .../decode_wav2vec2rnn_film_transducer.sh | 6 ++- .../decode_wav2vec2rnn_transducer.sh | 9 ++-- .../decode_wav2vec2rnn_transducer_lid.sh | 8 ++++ .../preprocess_audios_for_nnet_train.sh | 5 +++ hyperion/bin/identificate_wav2languageid.py | 1 + hyperion/torch/layers/global_pool.py | 9 ++++ .../hf_wav2rnn_transducer_languageid.py | 23 ++++++++-- 26 files changed, 115 insertions(+), 64 deletions(-) diff --git a/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh index debd9377..ec13ae3d 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh @@ -9,8 +9,8 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" - +test_data="br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +#sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs @@ -34,7 +34,7 @@ nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml nnet_s2_args="" nnet_s2_name=${hf_model_name}_resnet1d_v2.2_13_langs.s2 nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0020.pth +nnet_s2=$nnet_s2_dir/model_ep0003.pth nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml nnet_s3_args="" diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh index 8d6cbc80..9a154499 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh +++ 
b/egs/commonvoice/v1/global_conf/config_lid_v4.0_13langs_v3.sh @@ -28,7 +28,7 @@ nnet_name=${hf_model_name}_resnet1d_v4.0_13_langs nnet_s1_name=$nnet_name.s3 nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0008.pth +nnet_s1=$nnet_s1_dir/model_ep0003.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml nnet_s2_args="" nnet_s2_name=${hf_model_name}_resnet1d_v4.0_13_langs.s4 diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh index ebbd7fd1..28404ba5 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.0_13langs.sh @@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +test_data="tt_test_proc_audio sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs @@ -32,7 +32,7 @@ nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.0.yaml nnet_s2_args="" nnet_s2_name=${hf_model_name}_resnet1d_v6.0_13_langs.s2 nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0020.pth +nnet_s2=$nnet_s2_dir/model_ep0003.pth nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml nnet_s3_args="" diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh index 57fb5d0b..f9d932e4 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.2_13langs.sh @@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +test_data="tt_test_proc_audio sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs @@ -26,7 +26,7 @@ nnet_s1_args="" nnet_name=${hf_model_name}_resnet1d_v6.2_13_langs nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0034.pth +nnet_s1=$nnet_s1_dir/model_ep0024.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.2.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh index d1847910..cedfb6e3 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.3_13langs.sh @@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training 
nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +test_data="tt_test_proc_audio sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs @@ -26,7 +26,7 @@ nnet_s1_args="" nnet_name=${hf_model_name}_resnet1d_v6.3_13_langs nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0034.pth +nnet_s1=$nnet_s1_dir/model_ep0033.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.3.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh index 88190921..5124da23 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v6.4_13langs.sh @@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +test_data="tt_test_proc_audio sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs @@ -26,7 +26,7 @@ nnet_s1_args="" nnet_name=${hf_model_name}_resnet1d_v6.4_13_langs nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0034.pth +nnet_s1=$nnet_s1_dir/model_ep0035.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v6.4.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh index 8d9e95d3..69dcb809 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_lid_v1.0_13langs.sh @@ -9,8 +9,8 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" - +test_data=" ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +#sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs_weighted @@ -27,7 +27,7 @@ nnet_s1_args="" 
nnet_name=${hf_model_name}_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage1_v1.0_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0007.pth +nnet_s1=$nnet_s1_dir/model_ep0016.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage2_v1.0.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh index 0f3845d7..b3a07306 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v2.0_13langs.sh @@ -40,7 +40,7 @@ nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v2.0.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0003.pth +nnet_s2=$nnet_s2_dir/model_ep0047.pth nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml nnet_s3_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh index d209d421..f7480a61 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v4.2.1_13langs.sh @@ -9,8 +9,8 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" - +test_data="kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio" #ca_test_proc_audio +#sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs_weighted @@ -29,13 +29,13 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v4.2.1_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s3 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0012.pth +nnet_s1=$nnet_s1_dir/model_ep0025.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s4 nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0003.pth +nnet_s2=$nnet_s2_dir/model_ep0001.pth nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.2.1.yaml nnet_s3_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh index ab3d1ec8..09a139ab 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh @@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio 
fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs_weighted @@ -29,7 +29,7 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v5.1_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s3 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0008.pth +nnet_s1=$nnet_s1_dir/model_ep0042.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v5.1.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh index 71d38168..28f381ea 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh @@ -9,8 +9,8 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" - +test_data="en_test_proc_audio ca_test_proc_audio" +#ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs_weighted @@ -29,17 +29,16 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v6.0_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s3 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0003.pth +nnet_s1=$nnet_s1_dir/model_ep0005.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v6.0.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s4 nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0003.pth +nnet_s2=$nnet_s2_dir/model_ep0005.pth nnet_s3_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage5_v6.0.yaml nnet_s3_args="" nnet_s3_name=${nnet_name}.s5 nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name -nnet_s3=$nnet_s3_dir/model_ep0002.pth -nnet_s3=$nnet_s3_dir/model_ep0005.pth +nnet_s3=$nnet_s3_dir/model_ep0011.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh index aaafecc1..ffa2a057 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_lid_v1.0_13langs.sh @@ -38,7 +38,7 @@ nnet_s1_args="" nnet_name=${hf_model_name}_rnnt_k2_pruned_transducer_ecapadnn1024x3.v1.0_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0007.pth +nnet_s1=$nnet_s1_dir/model_ep0003.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_transducer_ecapadnn1024x3_stage2_v1.0.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh 
b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh index 29a762fa..a809e51d 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh @@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs_weighted @@ -28,13 +28,13 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0007.pth +nnet_s1=$nnet_s1_dir/model_ep0015.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0020.pth +nnet_s2=$nnet_s2_dir/model_ep0015.pth nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml nnet_s3_args="" diff --git a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh index c0fbe9dc..b3648580 100644 --- a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh +++ b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh @@ -25,7 +25,7 @@ nnet_name=${hf_model_name}_transducer_v3.3_it nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0015.pth +nnet_s1=$nnet_s1_dir/model_ep0114.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/run_004_compute_bpe.sh b/egs/commonvoice/v1/run_004_compute_bpe.sh index 617f03ae..ee14ca2b 100755 --- a/egs/commonvoice/v1/run_004_compute_bpe.sh +++ b/egs/commonvoice/v1/run_004_compute_bpe.sh @@ -6,10 +6,8 @@ set -e vocab_sizes=( - # 5000 - 2000 - 1000 - 500 + 8000 + 16000 ) dl_dir=$PWD/download @@ -23,14 +21,14 @@ config_file=default_config.sh . 
$config_file -if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then - echo "Stage 1: Dump transcripts for LM training" - mkdir -p data/lm - gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ - | jq '.text' \ - | sed 's:"::g' \ - > data/lm/${language}_transcript_words.txt -fi +# if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then +# echo "Stage 1: Dump transcripts for LM training" +# mkdir -p data/lm +# gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ +# | jq '.text' \ +# | sed 's:"::g' \ +# > data/lm/${language}_transcript_words.txt +# fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then echo "Stage 2: Prepare BPE based lang" @@ -44,16 +42,16 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then echo "!SIL 1" >> $lang_dir/words.txt echo " 2" >> $lang_dir/words.txt - # Add regular words to words.txt - gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ - | jq '.text' \ - | sed 's:"::g' \ - | sed 's: :\n:g' \ - | sort \ - | uniq \ - | sed '/^$/d' \ - | awk '{print $0,NR+2}' \ - >> $lang_dir/words.txt + # # Add regular words to words.txt + # gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ + # | jq '.text' \ + # | sed 's:"::g' \ + # | sed 's: :\n:g' \ + # | sort \ + # | uniq \ + # | sed '/^$/d' \ + # | awk '{print $0,NR+2}' \ + # >> $lang_dir/words.txt # Add remaining special word symbols expected by LM scripts. num_words=$(cat $lang_dir/words.txt | wc -l) diff --git a/egs/commonvoice/v1/run_030_inference.sh b/egs/commonvoice/v1/run_030_inference.sh index ec5b140b..72da282e 100755 --- a/egs/commonvoice/v1/run_030_inference.sh +++ b/egs/commonvoice/v1/run_030_inference.sh @@ -34,6 +34,8 @@ fi transducer_dir=exp/transducer/$nnet_name +rm -f $transducer_dir/overall_wer_char.txt + # test_data=test_clean diff --git a/egs/commonvoice/v1/run_031_inference_film.sh b/egs/commonvoice/v1/run_031_inference_film.sh index 7b796107..d5749eb4 100755 --- a/egs/commonvoice/v1/run_031_inference_film.sh +++ b/egs/commonvoice/v1/run_031_inference_film.sh @@ -34,6 +34,8 @@ fi transducer_dir=exp/transducer/$nnet_name +rm -f $transducer_dir/overall_wer_char.txt + # test_data=test_clean diff --git a/egs/commonvoice/v1/run_032_identificate.sh b/egs/commonvoice/v1/run_032_identificate.sh index a9a8cee5..76b98c34 100755 --- a/egs/commonvoice/v1/run_032_identificate.sh +++ b/egs/commonvoice/v1/run_032_identificate.sh @@ -34,6 +34,8 @@ fi lid_dir=exp/resnet1d/$nnet_name +rm -f $lid_dir/overall_lid_score.txt + # Extracts x-vectors for evaluation for name in $test_data # $dev_data $test_data do diff --git a/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh b/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh index 5a2bbc27..5a9a30c8 100755 --- a/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh +++ b/egs/commonvoice/v1/steps_lid/identificate_wav2vec2resnet1d.sh @@ -77,8 +77,11 @@ if [ $stage -le 1 ];then echo "compute error rate" cat $output_dir/languageid.* > $output_dir/langs + python steps_lid/cal_lid_score.py $output_dir/langs > $output_dir/lid_score - python steps_lid/lid_score.py $output_dir/langs >> $output_dir/scores + echo $(basename "$output_dir") >> $output_dir/../overall_lid_score.txt + cat $output_dir/lid_score >> $output_dir/../overall_lid_score.txt + echo " " >> $output_dir/../overall_lid_score.txt # python steps_transducer/word2char.py $data_dir/text $data_dir/text_char # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text diff --git 
a/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh b/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh index ebd6398d..17378c29 100755 --- a/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh +++ b/hyp_utils/steps_transducer/decode_wav2vec2rnn_film_transducer.sh @@ -76,7 +76,11 @@ if [ $stage -le 1 ];then python steps_transducer/word2bpe.py $data_dir/text $data_dir/text_bpe $bpe_model # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text > $output_dir/wer - # compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char + compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char # compute-wer --text --mode=present ark:$data_dir/text_bpe ark:$output_dir/transducer_bpe.text > $output_dir/wer_bpe + echo $(basename "$output_dir") >> $output_dir/../overall_wer_char.txt + cat $output_dir/wer_char >> $output_dir/../overall_wer_char.txt + echo " " >> $output_dir/../overall_wer_char.txt + fi diff --git a/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh index 986c8190..18d6ad4c 100755 --- a/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh +++ b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer.sh @@ -69,11 +69,14 @@ if [ $stage -le 1 ];then python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text python steps_transducer/word2char.py $data_dir/text $data_dir/text_char - python steps_transducer/word2bpe.py $output_dir/transducer.text $output_dir/transducer_bpe.text $bpe_model - python steps_transducer/word2bpe.py $data_dir/text $data_dir/text_bpe $bpe_model + # python steps_transducer/word2bpe.py $output_dir/transducer.text $output_dir/transducer_bpe.text $bpe_model + # python steps_transducer/word2bpe.py $data_dir/text $data_dir/text_bpe $bpe_model # compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text > $output_dir/wer - # compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char + compute-wer --text --mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char # compute-wer --text --mode=present ark:$data_dir/text_bpe ark:$output_dir/transducer_bpe.text > $output_dir/wer_bpe + echo $(basename "$output_dir") >> $output_dir/../overall_wer_char.txt + cat $output_dir/wer_char >> $output_dir/../overall_wer_char.txt + echo " " >> $output_dir/../overall_wer_char.txt fi diff --git a/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh index 3bf84cbd..0363eaf1 100755 --- a/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh +++ b/hyp_utils/steps_transducer/decode_wav2vec2rnn_transducer_lid.sh @@ -69,6 +69,7 @@ if [ $stage -le 1 ];then echo "compute wer" cat $output_dir/transducer.*.text > $output_dir/transducer.text cat $output_dir/languageid.* > $output_dir/langs + python steps_lid/cal_lid_score.py $output_dir/langs > $output_dir/lid_score python steps_transducer/word2char.py $output_dir/transducer.text $output_dir/transducer_char.text python steps_transducer/word2char.py $data_dir/text $data_dir/text_char @@ -79,5 +80,12 @@ if [ $stage -le 1 ];then compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text > $output_dir/wer compute-wer --text 
--mode=present ark:$data_dir/text_char ark:$output_dir/transducer_char.text > $output_dir/wer_char # compute-wer --text --mode=present ark:$data_dir/text_bpe ark:$output_dir/transducer_bpe.text > $output_dir/wer_bpe + echo $(basename "$output_dir") >> $output_dir/../overall_lid_score.txt + cat $output_dir/lid_score >> $output_dir/../overall_lid_score.txt + echo " " >> $output_dir/../overall_lid_score.txt + echo $(basename "$output_dir") >> $output_dir/../overall_wer_char.txt + cat $output_dir/wer_char >> $output_dir/../overall_wer_char.txt + echo " " >> $output_dir/../overall_wer_char.txt + fi diff --git a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh index 8321169f..c6c3ea9f 100755 --- a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh +++ b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh @@ -11,6 +11,7 @@ nodes=b1 storage_name=$(date +'%m_%d_%H_%M') proc_opts="--remove-dc-offset" use_bin_vad=false +osr=0 echo "$0 $@" # Print the command line for logging @@ -90,6 +91,10 @@ else fi fi +if [ "$osr" != 0 ];then + args="${args} --output-sampling-rate ${osr}" +fi + $cmd JOB=1:$nj $dir/log/preproc_audios_${name}.JOB.log \ hyp_utils/conda_env.sh \ preprocess_audio_files.py ${args} --output-audio-format $file_format $args $proc_opts \ diff --git a/hyperion/bin/identificate_wav2languageid.py b/hyperion/bin/identificate_wav2languageid.py index 8b01ac25..37cf22e4 100755 --- a/hyperion/bin/identificate_wav2languageid.py +++ b/hyperion/bin/identificate_wav2languageid.py @@ -120,6 +120,7 @@ def decode_languageid(input_spec, output_spec, scp_sep, model_path, lang_file, device = init_device(use_gpu) model = load_model(model_path, device) + logging.info(nn.functional.softmax(model.feat_fuser, dim=-1)) # load language dict form langfile by row number lang_dict = {} with open(lang_file, "r") as f: diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index 8fe67792..d97b8c9e 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -800,6 +800,15 @@ def forward(self, x, x_lengths=None, weights=None): else: min_value = -1e20 mask = weights.eq(0) + # #logging mask type, shape + # logging.info('mask type={}, shape={}'.format(mask.dtype, mask.shape)) + # #logging attn type, min_value type + # logging.info('attn type={}'.format(attn.dtype)) + # logging.info('attn={}'.format(attn)) + # logging.info('min_value={}'.format(min_value)) + + + attn = attn.masked_fill(mask, min_value) attn = nnf.softmax(attn, dim=-1) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index d8374e77..60920a36 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -296,17 +296,32 @@ def infer(self, List of list of integer indexes of the recognizer's symbols. 
""" - feats_transducer, _, _, feat_lengths = self.forward_feats(x, x_lengths) + feats_transducer, feats_languageid, _, feat_lengths = self.forward_feats(x, x_lengths) + # logging.info(f"feat_lengths: {feat_lengths}") + # logging.info(f"feats_transducer.shape: {feats_transducer.shape}") + # logging.info(f"feats_languageid.shape: {feats_languageid.shape}") + # logging.info(f"feats_transducer: {feats_transducer}") + # logging.info(f"feats_languageid: {feats_languageid}") + lid = self.languageid( + feats_languageid.float(), + feat_lengths, + None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, + ) - feats = feats_transducer.permute(0, 2, 1) # (N, C, T) ->(N, T, C) - y = self.transducer.infer(feats, + feats_transducer = feats_transducer.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + text = self.transducer.infer(feats_transducer, feat_lengths, decoding_method=decoding_method, beam_width=beam_width, max_sym_per_frame=max_sym_per_frame, max_sym_per_utt=max_sym_per_utt) - return y + + return text, lid # def freeze_feat_fuser(self): # if self.feat_fuser is None: From acbfc06941ce066c99cd1a9c3de3674a0a200f39 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Tue, 4 Jul 2023 17:34:05 -0400 Subject: [PATCH 67/89] update joint training for ASR-LID --- egs/commonvoice/v1/conf/clsp.conf | 13 ++- egs/commonvoice/v1/conf/slurm.conf | 4 +- ...2base_rnnt_film_k2_pruned_stage2_v1.0.yaml | 17 ++-- ...2base_rnnt_film_k2_pruned_stage3_v6.0.yaml | 4 +- ...v2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml | 4 +- ...v2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml | 4 +- ...v2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml | 3 +- ...c2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml | 2 +- ...c2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml | 8 +- ...v2vec2xlsr300m_transducer_stage1_v3.3.yaml | 4 +- ...v2vec2xlsr300m_transducer_stage2_v3.2.yaml | 5 +- egs/commonvoice/v1/datapath.sh | 2 +- .../v1/global_conf/config_lid_v2.1_13langs.sh | 2 +- .../v1/global_conf/config_lid_v2.2_13langs.sh | 2 +- ...g_pruned_filmed_transducer_v1.0_13langs.sh | 2 +- ...g_pruned_filmed_transducer_v6.0_13langs.sh | 2 +- .../config_pruned_transducer_v1.3_13langs.sh | 2 +- .../config_pruned_transducer_v4.0_13langs.sh | 2 +- .../global_conf/config_transducer_v3.3_it.sh | 3 +- egs/commonvoice/v1/run_004_compute_bpe.sh | 42 ++++----- egs/commonvoice/v1/run_011_train_asr.sh | 4 +- egs/commonvoice/v1/run_015_train_film_asr.sh | 13 ++- egs/commonvoice/v1/run_020_train_asr_lid.sh | 2 +- .../preprocess_audios_for_nnet_train.sh | 2 +- ..._wav2vec2rnn_film_transducer_languageid.py | 2 +- hyperion/bin/train_wav2vec2rnn_transducer.py | 4 +- ...train_wav2vec2rnn_transducer_languageid.py | 4 +- hyperion/bin/train_wav2vec2transducer.py | 7 ++ hyperion/torch/layers/global_pool.py | 5 + .../hf_wav2rnn_transducer_languageid.py | 91 ++++++++++++++++--- .../hf_wav2vec2rnn_transducer_languageid.py | 35 ++++++- 31 files changed, 203 insertions(+), 93 deletions(-) diff --git a/egs/commonvoice/v1/conf/clsp.conf b/egs/commonvoice/v1/conf/clsp.conf index 959c62a7..1c75f327 100644 --- a/egs/commonvoice/v1/conf/clsp.conf +++ b/egs/commonvoice/v1/conf/clsp.conf @@ -1,11 +1,16 @@ # Default configuration -command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V -option mem=* -l mem_free=$0,ram_free=$0 +command sbatch --export=PATH +#command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* --mem-per-cpu $0 +# option mem=* -l mem_free=$0,ram_free=$0 option mem=0 # Do not add anything to qsub_opts option num_threads=* -pe smp $0 option num_threads=1 # Do not add 
anything to qsub_opts option max_jobs_run=* -tc $0 default gpu=0 -option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[1345679]*|c2[12357]*' -option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' +option gpu=0 +option gpu=* -p GPU-shared --gres=gpu:$0 -c $0 # Recommend allocating more CPU than, or equal to the number of GPU +#option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[1345679]*|c2[12357]*' +#option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' + diff --git a/egs/commonvoice/v1/conf/slurm.conf b/egs/commonvoice/v1/conf/slurm.conf index 262344ea..423d9133 100644 --- a/egs/commonvoice/v1/conf/slurm.conf +++ b/egs/commonvoice/v1/conf/slurm.conf @@ -1,7 +1,7 @@ # Default configuration command sbatch --export=PATH option name=* --job-name $0 -default time=48:00:00 +default time=24:00:00 option time=* --time $0 option mem=* --mem-per-cpu $0 option mem=0 @@ -10,6 +10,6 @@ option num_threads=1 --cpus-per-task 1 option num_nodes=* --nodes $0 default gpu=0 option gpu=0 -option gpu=* -p a100 -A jvillal7_gpu --ntasks-per-node 1 --gres=gpu:$0 -c $0 # Recommend allocating more CPU than, or equal to the number of GPU +option gpu=* -p a100 -A jvillal7_gpu --ntasks-per-node 4 --gres=gpu:$0 -c $0 # Recommend allocating more CPU than, or equal to the number of GPU # note: the --max-jobs-run option is supported as a special case # by slurm.pl and you don't have to handle it in the config file. diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml index a867f12a..5a1555dd 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml @@ -10,8 +10,8 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 15. - max_audio_length: 15. + max_batch_length: 40. + max_audio_length: 20. min_batch_size: 1 drop_last: false # for class_weighted_random_bucketing_seg_sampler @@ -19,7 +19,7 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.1 + num_chunks_per_seg_epoch: 0.3 data_loader: num_workers: 1 @@ -34,8 +34,8 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 15. - max_audio_length: 15. + max_batch_length: 40. + max_audio_length: 20. 
min_batch_size: 1 drop_last: true # for class_weighted_random_bucketing_seg_sampler @@ -43,13 +43,14 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.5 + num_chunks_per_seg_epoch: 1.0 data_loader: num_workers: 1 model: transducer: decoder: prune_range: 15 + override_dropouts: false trainer: optim: opt_type: sgd @@ -59,8 +60,8 @@ trainer: lrsched: lrsch_type: exp_lr decay_rate: 0.5 - decay_steps: 45000 - hold_steps: 30000 + decay_steps: 180000 + hold_steps: 60000 min_lr: 4e-5 warmup_steps: 6000 update_lr_on_opt_step: true diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml index 4a72296d..d2f01bd9 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v6.0.yaml @@ -22,7 +22,7 @@ data: num_chunks_per_seg_epoch: 0.1 data_loader: - num_workers: 8 + num_workers: 4 val: dataset: aug_cfgs: @@ -45,7 +45,7 @@ data: weight_exponent: 0.3 num_chunks_per_seg_epoch: 1.0 data_loader: - num_workers: 8 + num_workers: 4 model: hf_feats: pretrained_model_path: facebook/wav2vec2-xls-r-300m diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml index 3712babc..39c61fa7 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml @@ -9,7 +9,7 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 70. + max_batch_length: 50. min_batch_size: 1 drop_last: false # for class_weighted_random_bucketing_seg_sampler @@ -30,7 +30,7 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 70. + max_batch_length: 50. 
min_batch_size: 1 drop_last: true # for class_weighted_random_bucketing_seg_sampler diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml index f41f8dad..7e059b3b 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v4.0.yaml @@ -70,8 +70,8 @@ trainer: lrsched: lrsch_type: exp_lr decay_rate: 0.5 - decay_steps: 180000 - hold_steps: 60000 + decay_steps: 45000 + hold_steps: 30000 min_lr: 4e-5 warmup_steps: 6000 update_lr_on_opt_step: true diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml index 9db63d77..e5ae33a4 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml @@ -49,10 +49,11 @@ model: decoder: prune_range: 15 override_dropouts: false + reduction: mean trainer: optim: opt_type: sgd - lr: 0.005 + lr: 0.0002 momentum: 0.9 weight_decay: 4e-4 lrsched: diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml index 77cd2d26..8c62ac1b 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.2.yaml @@ -47,7 +47,7 @@ model: wav2vec2xlsr300m_ecapatdnn512x3_do0.2.yaml trainer: optim: opt_type: sgd - lr: 0.003 + lr: 0.001 momentum: 0.9 weight_decay: 4e-4 lrsched: diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml index c73c7130..a40db186 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml @@ -49,16 +49,16 @@ model: trainer: optim: opt_type: sgd - lr: 0.0005 + lr: 0.0001 momentum: 0.9 weight_decay: 4e-4 lrsched: lrsch_type: exp_lr decay_rate: 0.5 - decay_steps: 420000 - hold_steps: 300000 + decay_steps: 60000 + hold_steps: 30000 min_lr: 4e-5 - warmup_steps: 15000 + warmup_steps: 5000 update_lr_on_opt_step: true grad_clip: 100 use_amp: true diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml index 96e0c4aa..e9fe0b05 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml @@ -13,7 +13,7 @@ data: min_batch_size: 1 drop_last: false data_loader: - num_workers: 1 + num_workers: 2 val: dataset: aug_cfgs: @@ -28,7 +28,7 @@ data: min_batch_size: 1 drop_last: true data_loader: - num_workers: 1 + num_workers: 2 model: wav2vec2xlsr300m_transducer_do0.4.yaml trainer: optim: diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml index 69c489b0..2e5a9ea5 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml @@ -32,10 +32,7 @@ data: model: transducer: decoder: - 
override_dropouts: true - embedding_dropout_rate: 0.3 - rnn_dropout_rate: 0.3 - + override_dropouts: false trainer: optim: opt_type: sgd diff --git a/egs/commonvoice/v1/datapath.sh b/egs/commonvoice/v1/datapath.sh index e844d6cd..56b242ed 100644 --- a/egs/commonvoice/v1/datapath.sh +++ b/egs/commonvoice/v1/datapath.sh @@ -5,7 +5,7 @@ if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then - commonvoice_root= + commonvoice_root=/scratch4/jvillal7/ylu125/corpora/commonvoice musan_root=/export/corpora5/JHU/musan echo "Put your database paths here" exit 1 diff --git a/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh index c5febd98..9d35d162 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v2.1_13langs.sh @@ -28,7 +28,7 @@ nnet_name=${hf_model_name}_resnet1d_v2.1_13_langs nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0022.pth +nnet_s1=$nnet_s1_dir/model_ep0002.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.1.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh index debd9377..1db9b7a6 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v2.2_13langs.sh @@ -28,7 +28,7 @@ nnet_name=${hf_model_name}_resnet1d_v2.2_13_langs nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0014.pth +nnet_s1=$nnet_s1_dir/model_ep0013.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v2.2.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh index 1fc49fdd..b0ed4451 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh @@ -34,7 +34,7 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v1.0_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0016.pth +nnet_s1=$nnet_s1_dir/model_ep0007.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh index 71d38168..ea68b945 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v6.0_13langs.sh @@ -29,7 +29,7 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v6.0_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s3 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0003.pth +nnet_s1=$nnet_s1_dir/model_ep0005.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v6.0.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh index 575a8436..fb6709db 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v1.3_13langs.sh @@ -28,7 
+28,7 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned.v1.3_13_langs_16000_bpe nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0019.pth +nnet_s1=$nnet_s1_dir/model_ep0002.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh index 29a762fa..f43b323f 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh @@ -28,7 +28,7 @@ nnet_name=${hf_model_name}_rnnt_k2_pruned.v4.0_13_langs_weighted_8000_bpe nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0007.pth +nnet_s1=$nnet_s1_dir/model_ep0016.pth nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml nnet_s2_args="" diff --git a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh index c0fbe9dc..d62fcef4 100644 --- a/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh +++ b/egs/commonvoice/v1/global_conf/config_transducer_v3.3_it.sh @@ -25,7 +25,7 @@ nnet_name=${hf_model_name}_transducer_v3.3_it nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0015.pth +nnet_s1=$nnet_s1_dir/model_ep0042.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.3.yaml nnet_s2_args="" @@ -39,3 +39,4 @@ nnet_s3_name=${nnet_name}.s3 nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name nnet_s3=$nnet_s3_dir/model_ep0002.pth nnet_s3=$nnet_s3_dir/model_ep0005.pth + diff --git a/egs/commonvoice/v1/run_004_compute_bpe.sh b/egs/commonvoice/v1/run_004_compute_bpe.sh index 617f03ae..ee14ca2b 100755 --- a/egs/commonvoice/v1/run_004_compute_bpe.sh +++ b/egs/commonvoice/v1/run_004_compute_bpe.sh @@ -6,10 +6,8 @@ set -e vocab_sizes=( - # 5000 - 2000 - 1000 - 500 + 8000 + 16000 ) dl_dir=$PWD/download @@ -23,14 +21,14 @@ config_file=default_config.sh . 
$config_file -if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then - echo "Stage 1: Dump transcripts for LM training" - mkdir -p data/lm - gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ - | jq '.text' \ - | sed 's:"::g' \ - > data/lm/${language}_transcript_words.txt -fi +# if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then +# echo "Stage 1: Dump transcripts for LM training" +# mkdir -p data/lm +# gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ +# | jq '.text' \ +# | sed 's:"::g' \ +# > data/lm/${language}_transcript_words.txt +# fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then echo "Stage 2: Prepare BPE based lang" @@ -44,16 +42,16 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then echo "!SIL 1" >> $lang_dir/words.txt echo " 2" >> $lang_dir/words.txt - # Add regular words to words.txt - gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ - | jq '.text' \ - | sed 's:"::g' \ - | sed 's: :\n:g' \ - | sort \ - | uniq \ - | sed '/^$/d' \ - | awk '{print $0,NR+2}' \ - >> $lang_dir/words.txt + # # Add regular words to words.txt + # gunzip -c data/${language}/cv-${language}_supervisions_train.jsonl.gz \ + # | jq '.text' \ + # | sed 's:"::g' \ + # | sed 's: :\n:g' \ + # | sort \ + # | uniq \ + # | sed '/^$/d' \ + # | awk '{print $0,NR+2}' \ + # >> $lang_dir/words.txt # Add remaining special word symbols expected by LM scripts. num_words=$(cat $lang_dir/words.txt | wc -l) diff --git a/egs/commonvoice/v1/run_011_train_asr.sh b/egs/commonvoice/v1/run_011_train_asr.sh index 284a68f5..55cb04a3 100755 --- a/egs/commonvoice/v1/run_011_train_asr.sh +++ b/egs/commonvoice/v1/run_011_train_asr.sh @@ -18,7 +18,7 @@ set -e #export CONV_RSH=ssh #export LD_LIBRARY_PATH=/scratch4/jvillal7/ylu125/miniconda3/envs/gsp_hyp/lib/:$LD_LIBRARY_PATH -export CUDA_VISIBLE_DEVICES=0,1 +# export CUDA_VISIBLE_DEVICES=0,1 stage=1 ngpu=2 config_file=default_config.sh @@ -106,8 +106,8 @@ if [ $stage -le 2 ]; then --in-model-file $nnet_s1 \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ - --master-port 1236 \ --num-gpus $ngpu + # --master-port 1236 \ fi diff --git a/egs/commonvoice/v1/run_015_train_film_asr.sh b/egs/commonvoice/v1/run_015_train_film_asr.sh index fbf30558..638384bb 100755 --- a/egs/commonvoice/v1/run_015_train_film_asr.sh +++ b/egs/commonvoice/v1/run_015_train_film_asr.sh @@ -74,7 +74,6 @@ if [ $stage -le 1 ]; then --trainer.exp-path $nnet_s1_dir $args \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ - --in-model-file $nnet_rnn_transducer \ --master-port 1237 \ --num-gpus $ngpu @@ -117,26 +116,30 @@ if [ $stage -le 3 ]; then if [ "$use_wandb" == "true" ];then extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" fi - mkdir -p $nnet_s3_dir/log $cuda_cmd \ --gpu $ngpu $nnet_s3_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - - .py $nnet_type \ + finetune_wav2vec2rnn_film_transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ + --data.train.dataset.class-names "language" \ + --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ + 
--data.val.dataset.class-names "language" \ + --data.val.dataset.class-files $train_dir/langs \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ - --in-model-file $nnet_s2 \ + --in-model-file $nnet_s1 \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1237 \ --num-gpus $ngpu + fi diff --git a/egs/commonvoice/v1/run_020_train_asr_lid.sh b/egs/commonvoice/v1/run_020_train_asr_lid.sh index 4b312e76..6a4b3252 100755 --- a/egs/commonvoice/v1/run_020_train_asr_lid.sh +++ b/egs/commonvoice/v1/run_020_train_asr_lid.sh @@ -20,7 +20,7 @@ set -e stage=1 -ngpu=4 +ngpu=2 config_file=default_config.sh interactive=false num_workers="" diff --git a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh index 8321169f..0678b63c 100755 --- a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh +++ b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh @@ -89,7 +89,7 @@ else cp $data_in/$f $data_out/$f fi fi - +echo $cmd $cmd JOB=1:$nj $dir/log/preproc_audios_${name}.JOB.log \ hyp_utils/conda_env.sh \ preprocess_audio_files.py ${args} --output-audio-format $file_format $args $proc_opts \ diff --git a/hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py b/hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py index 22808dbd..514fe4d1 100755 --- a/hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py +++ b/hyperion/bin/finetune_wav2vec2rnn_film_transducer_languageid.py @@ -196,7 +196,7 @@ def train_model(gpu_id, args): trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {"acc": CategoricalAccuracy()} + metrics = {} #{"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, diff --git a/hyperion/bin/train_wav2vec2rnn_transducer.py b/hyperion/bin/train_wav2vec2rnn_transducer.py index 7018c406..6965f9f9 100755 --- a/hyperion/bin/train_wav2vec2rnn_transducer.py +++ b/hyperion/bin/train_wav2vec2rnn_transducer.py @@ -123,7 +123,7 @@ def train_model(gpu_id, args): set_float_cpu("float32") #torch.backends.cudnn.deterministic = True #torch.backends.cudnn.benchmark = False - torch.backends.cudnn.enabled = False + # torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) @@ -252,5 +252,5 @@ def make_parser(model_class): args_sc.model_class = model_dict[model_type] # torch docs recommend using forkserver - # multiprocessing.set_start_method("forkserver") + multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) diff --git a/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py b/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py index 85689ac3..bafe8f66 100755 --- a/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py +++ b/hyperion/bin/train_wav2vec2rnn_transducer_languageid.py @@ -105,7 +105,7 @@ def init_data(partition, rank, num_gpus, **kwargs): collate_fn=transducer_language_collate) return data_loader -def init_model(blank_id, vocab_size, num_classes, rank, model_class, **kwargs): +def init_model(blank_id, vocab_size, num_classes, loss_class_weight, rank, model_class, **kwargs): model_args = model_class.filter_args(**kwargs["model"]) if rank == 0: logging.info("model network args={}".format(model_args)) @@ -113,6 +113,7 @@ def init_model(blank_id, vocab_size, num_classes, rank, model_class, **kwargs): 
model_args["transducer"]["decoder"]["blank_id"] = blank_id model_args["transducer"]["decoder"]["vocab_size"] = vocab_size model_args["languageid"]["num_classes"] = num_classes + model_args["loss_class_weight"] = loss_class_weight model = model_class(**model_args) if rank == 0: logging.info("model={}".format(model)) @@ -149,6 +150,7 @@ def train_model(gpu_id, args): model = init_model(train_loader.dataset.sp.piece_to_id(""), train_loader.dataset.sp.get_piece_size(), list(train_loader.dataset.num_classes.values())[0], + train_loader.batch_sampler.class_info["weights"], **kwargs) trn_args = Trainer.filter_args(**kwargs["trainer"]) diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index 2368f1c2..c0264299 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -27,6 +27,7 @@ model_dict = { "hf_wav2vec2transducer": HFWav2Vec2Transducer, + "hf_wav2vec2rnn_transducer": HFWav2Vec2Transducer, } @@ -51,9 +52,12 @@ def transducer_collate(batch): def init_data(partition, rank, num_gpus, **kwargs): + logging.getLogger().setLevel(logging.INFO) data_kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**data_kwargs["dataset"]) sampler_args = data_kwargs["sampler"] + logging.info("rank={}".format(rank)) + logging.info("{} audio dataset args={}".format(partition, ad_args)) if rank == 0: logging.info("{} audio dataset args={}".format(partition, ad_args)) logging.info("{} sampler args={}".format(partition, sampler_args)) @@ -85,6 +89,7 @@ def init_data(partition, rank, num_gpus, **kwargs): def init_model(blank_id, vocab_size, rank, model_class, **kwargs): + logging.getLogger().setLevel(logging.INFO) model_args = model_class.filter_args(**kwargs["model"]) if rank == 0: logging.info("model network args={}".format(model_args)) @@ -123,6 +128,8 @@ def train_model(gpu_id, args): train_loader.dataset.sp.get_piece_size(), **kwargs) trn_args = Trainer.filter_args(**kwargs["trainer"]) + logging.info("trainer args={}".format(trn_args)) + logging.info("rank={}".format(rank)) if rank == 0: logging.info("trainer args={}".format(trn_args)) metrics = {} #{"acc": CategoricalAccuracy()} diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index 8fe67792..4967a2c5 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -781,9 +781,13 @@ def forward(self, x, x_lengths=None, weights=None): x = x.transpose(1, self.dim) # x = (batch, feat_dim, time) + # logging.info("x_lengths",x_lengths) + # logging.info("weights_bef",weights) weights = self._standardize_weights(x, x_lengths, weights) # (batch, 1, time) x_inner = self.conv1(x) # (batch, inner_dim, time) + # logging.info("weights_aft",weights) # logging.info('x_inner1={} {}'.format(torch.sum(torch.isnan(x_inner)), torch.sum(torch.isinf(x_inner)))) + # logging.info('weights shape={} {}'.format(weights.shape, weights.dtype)) if self.use_global_context: global_mus = self.stats_pool(x, weights=weights) x_inner = x_inner + self.lin_global(global_mus).unsqueeze(-1) @@ -800,6 +804,7 @@ def forward(self, x, x_lengths=None, weights=None): else: min_value = -1e20 mask = weights.eq(0) + # logging.info("attn", attn.shape, mask.shape) attn = attn.masked_fill(mask, min_value) attn = nnf.softmax(attn, dim=-1) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index d8374e77..fe6cee1d 100644 
--- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -46,9 +46,13 @@ def __init__(self, hf_feats: TorchModel, transducer: Union[Dict, TorchModel], languageid: Union[Dict, TorchModel], - feat_fusion_start: int = 0, + feat_fusion_start_transducer: int = 0, + feat_fusion_start_lid: int = 0, feat_fusion_method_transducer: str = "weighted-avg", feat_fusion_method_lid: str = "weighted-avg", + loss_lid_type: str = "weightedCE", + loss_class_weight: Optional[torch.Tensor] = None, + loss_class_weight_exp= 1.0, loss_weight_transducer: float = 0.005, loss_weight_lid: float = 1.0, lid_length: float = 3.0, @@ -82,22 +86,34 @@ def __init__(self, self.transducer = transducer self.languageid = languageid - - self.feat_fusion_start = feat_fusion_start + self.feat_fusion_start_transducer = feat_fusion_start_transducer + self.feat_fusion_start_lid = feat_fusion_start_lid self.feat_fusion_method_transducer = feat_fusion_method_transducer self.feat_fusion_method_lid = feat_fusion_method_lid + self.loss_lid_type = loss_lid_type + self.loss_class_weight = loss_class_weight + self.loss_class_weight_exp = loss_class_weight_exp + + if loss_lid_type == "CE" or loss_lid_type is None: + self.loss_lid = nn.CrossEntropyLoss() + elif loss_lid_type == "weightedCE": + self.loss_lid = nn.CrossEntropyLoss(weight=torch.tensor(loss_class_weight.values, dtype=torch.float)**(-loss_class_weight_exp)) + logging.info(torch.tensor(loss_class_weight.values)**(-loss_class_weight_exp)) + elif loss_lid_type == "focal_loss": + self.loss_lid = FocalLoss(alpha=torch.tensor(loss_class_weight.values)**(-loss_class_weight_exp), gamma=2, size_average=True) + self.loss_weight_transducer = loss_weight_transducer self.loss_weight_lid = loss_weight_lid self.lid_length = lid_length self._hf_context = contextlib.nullcontext() - self.transducer_fuser = self._make_fuser(self.feat_fusion_method_transducer) - self.languageid_fuser = self._make_fuser(self.feat_fusion_method_lid) + self.transducer_fuser = self._make_fuser(self.feat_fusion_method_transducer, self.feat_fusion_start_transducer) + self.languageid_fuser = self._make_fuser(self.feat_fusion_method_lid, self.feat_fusion_start_lid) - def _make_fuser(self, method): + def _make_fuser(self, method, start): if method == "last": feat_fuser = None return feat_fuser - num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + num_layers = self.hf_feats.num_encoder_layers + 1 - start layer_dim = self.hf_feats.hidden_size if method == "weighted-avg": feat_fuser = nn.Parameter(torch.zeros(num_layers)) @@ -126,7 +142,7 @@ def _fuse_hid_feats(self, hid_feats): # There is only one layer of features return hid_feats[0] - hid_feats = hid_feats[self.feat_fusion_start:] + hid_feats = hid_feats[self.feat_fusion_start_transducer:] if self.feat_fusion_method_transducer == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) norm_transducer_weights = nn.functional.softmax(self.transducer_fuser, dim=-1) @@ -257,7 +273,8 @@ def forward( return_logits=return_logits, ) - loss_lid = nn.CrossEntropyLoss()(logits, languageid) + # loss_lid = nn.CrossEntropyLoss()(logits, languageid) + loss_lid = self.loss_lid(logits, languageid) trans_output = self.transducer( feats_transducer, @@ -390,9 +407,13 @@ def filter_args(**kwargs): valid_args = ( "hf_feats", "transducer", - "feat_fusion_start", + "feat_fusion_start_transducer", + "feat_fusion_start_lid", 
"feat_fusion_method_transducer", "feat_fusion_method_lid", + "loss_lid_type", + "loss_class_weight", + "loss_class_weight_exp", "loss_weight_transducer", "loss_weight_lid", "languageid", @@ -411,9 +432,13 @@ def get_config(self): "hf_feats": hf_cfg, "transducer": tran_cfg, "languageid": lid_cfg, - "feat_fusion_start": self.feat_fusion_start, + "feat_fusion_start_transducer": self.feat_fusion_start_transducer, + "feat_fusion_start_lid": self.feat_fusion_start_lid, "feat_fusion_method_transducer": self.feat_fusion_method_transducer, "feat_fusion_method_lid": self.feat_fusion_method_lid, + "loss_lid_type": self.loss_lid_type, + "loss_class_weight": self.loss_class_weight, + "loss_class_weight_exp": self.loss_class_weight_exp, "loss_weight_transducer": self.loss_weight_transducer, "loss_weight_lid": self.loss_weight_lid, "lid_length": self.lid_length, @@ -422,8 +447,16 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - def change_config(self, hf_feats, transducer, languageid): + # def change_config(self, hf_feats, transducer, languageid): + def change_config(self, loss_weight_transducer, loss_weight_lid, lid_length, hf_feats, transducer, languageid): logging.info("changing hf wav2transducer config") + + self.loss_weight_transducer = loss_weight_transducer + self.loss_weight_lid = loss_weight_lid + self.lid_length = lid_length + self.loss_reg_weight_transducer = loss_reg_weight_transducer + self.loss_reg_weight_lid = loss_reg_weight_lid + self.hf_feats.change_config(**hf_feats) self.transducer.change_config(**transducer) self.languageid.change_config(**languageid) @@ -436,14 +469,24 @@ def add_class_args(parser, prefix=None, skip=set()): parser = ArgumentParser(prog="") parser.add_argument( - "--feat-fusion-start", + "--feat-fusion-start-transducer", + default=0, + type=int, + help=""" + the input to transducer model will fuse the wav2vec + layers from feat_fusion_start_transducer to + the wav2vec num_layers""", + ) + parser.add_argument( + "--feat-fusion-start-lid", default=0, type=int, help=""" - the input to x-vector model will fuse the wav2vec - layers from feat_fusion_start to + the input to lid model will fuse the wav2vec + layers from feat_fusion_start_lid to the wav2vec num_layers""", ) + parser.add_argument( "--feat-fusion-method-transducer", default="weighted-avg", @@ -459,6 +502,24 @@ def add_class_args(parser, prefix=None, skip=set()): "in [weighted-avg, linear, cat, last]"), ) + parser.add_argument( + "--loss-lid-type", + default="weightedCE", + choices=["CE", "weightedCE", "focal_loss"], + help=("loss type for language identification"), + ) + parser.add_argument( + "--loss-class-weight", + default=None, + type=str, + help=("class weight for language identification"), + ) + parser.add_argument( + "--loss-class-weight-exp", + default=1.0, + type=float, + help=("class weight exponent for language identification"), + ) parser.add_argument( "--loss-weight-transducer", default=0.005, diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py index 4a8ca173..28d51679 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2vec2rnn_transducer_languageid.py @@ -38,9 +38,13 @@ def __init__( hf_feats: Union[Dict, HFWav2Vec2], transducer: Union[Dict, RNNTransducer], languageid: 
Union[Dict, ResNet1dLanguageID], - feat_fusion_start: int = 0, + feat_fusion_start_transducer: int = 0, + feat_fusion_start_lid: int = 0, feat_fusion_method_transducer: str = "weighted-avg", feat_fusion_method_lid: str = "weighted-avg", + loss_lid_type: str = "weightedCE", + loss_class_weight: Optional[torch.Tensor] = None, + loss_class_weight_exp: float = 1.0, loss_weight_transducer: float = 0.005, loss_weight_lid: float = 1.0, lid_length: float = 3.0, @@ -67,8 +71,17 @@ def __init__( # languageid = wav2languageid.languageid - super().__init__(hf_feats, transducer, languageid, feat_fusion_start, - feat_fusion_method_transducer, feat_fusion_method_lid, loss_weight_transducer, loss_weight_lid, lid_length) + super().__init__(hf_feats, transducer, languageid, + feat_fusion_start_transducer=feat_fusion_start_transducer, + feat_fusion_start_lid=feat_fusion_start_lid, + feat_fusion_method_transducer=feat_fusion_method_transducer, + feat_fusion_method_lid=feat_fusion_method_lid, + loss_lid_type=loss_lid_type, + loss_class_weight=loss_class_weight, + loss_class_weight_exp=loss_class_weight_exp, + loss_weight_transducer=loss_weight_transducer, + loss_weight_lid=loss_weight_lid, + lid_length=lid_length) @staticmethod def filter_args(**kwargs): @@ -121,6 +134,22 @@ def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") + parser.add_argument( + "--loss-lid-type", + default="weightedCE", + type=str, + help=""" + The type of the loss for language id + """, + ) + parser.add_argument( + "--loss-class-weight-exp", + default=1.0, + type=float, + help=""" + The exponent of the class weight for language id + """, + ) parser.add_argument( "--loss-weight-transducer", From 47fae72f38076d7278ba83ef1651f2c446e7d2af Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 5 Jul 2023 22:00:52 +0000 Subject: [PATCH 68/89] merge commit --- egs/commonvoice/v1/cmd.sh | 7 +++++-- ...ec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml | 5 +++-- ...ec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml | 7 ++++--- ...ec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml | 6 +++--- ...ec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml | 2 +- ...2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml | 2 +- ...wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml | 12 ++++++------ ...vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml | 6 +++--- ...sducer_ecapadnn512x3_1layer_stage2_v2.1.yaml | 6 ++++-- ...sducer_ecapadnn512x3_1layer_stage2_v2.2.yaml | 4 ++-- ...2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml | 1 + ...lsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml | 2 +- .../conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml | 1 + .../v1/global_conf/config_lid_v4.1_13langs.sh | 2 +- .../v1/global_conf/config_lid_v4.2_13langs.sh | 7 +++---- ...fig_pruned_filmed_transducer_v1.0_13langs.sh | 2 +- ...fig_pruned_filmed_transducer_v5.1_13langs.sh | 2 +- .../config_pruned_transducer_v4.0_13langs.sh | 4 ++-- .../v1/local/initailize_lid_model.py | 4 +++- egs/commonvoice/v1/run_011_train_asr.sh | 1 + egs/commonvoice/v1/run_012_train_lid.sh | 17 ++++++++++------- egs/commonvoice/v1/run_015_train_film_asr.sh | 12 ++++++------ egs/commonvoice/v1/run_020_train_asr_lid.sh | 12 ++++++------ .../v1/run_025_train_film_asr_lid.sh | 3 +-- egs/commonvoice/v1/run_030_inference.sh | 2 +- egs/commonvoice/v1/run_031_inference_film.sh | 2 +- hyperion/bin/finetune_wav2vec2rnn_transducer.py | 5 ++++- hyperion/bin/train_wav2vec2xvector.py | 4 ++++ .../hf_wav2rnn_film_transducer.py | 3 +++ .../hf_wav2rnn_film_transducer_languageid.py | 14 ++++++++------ 
.../hf_wav2rnn_transducer_languageid.py | 2 ++ hyperion/torch/models/xvectors/xvector.py | 15 +++++++++++++++ .../torch/narchs/rnn_film_transducer_decoder.py | 12 ++++++++---- 33 files changed, 116 insertions(+), 70 deletions(-) diff --git a/egs/commonvoice/v1/cmd.sh b/egs/commonvoice/v1/cmd.sh index 6606a180..cedd70f9 100755 --- a/egs/commonvoice/v1/cmd.sh +++ b/egs/commonvoice/v1/cmd.sh @@ -23,8 +23,11 @@ elif [ "$(hostname -d)" == "rockfish.cluster" ];then export cuda_cmd="slurm.pl --config conf/slurm.conf --mem 20G" export cuda_eval_cmd="$train_cmd" else - export train_cmd="queue.pl --config conf/clsp.conf --mem 4G " - export cuda_cmd="queue.pl --config conf/clsp.conf --mem 20G" + export train_cmd="run.pl" + export cuda_cmd="run.pl" export cuda_eval_cmd="$train_cmd" + #export train_cmd="queue.pl --config conf/clsp.conf --mem 4G " + #export cuda_cmd="queue.pl --config conf/clsp.conf --mem 20G" + #export cuda_eval_cmd="$train_cmd" fi diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml index 7d3d133e..15e06f93 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage1_v1.0.yaml @@ -10,7 +10,7 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 50 + max_batch_length: 40 max_audio_length: 15. min_batch_size: 1 drop_last: false @@ -34,7 +34,7 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 50 + max_batch_length: 40 max_audio_length: 15. min_batch_size: 1 drop_last: true @@ -62,6 +62,7 @@ model: rnn_dropout_rate: 0.4 rnn_type: lstm joiner: + joiner_type: film_joiner hid_feats: 512 feat_fusion_method: film-weighted-avg feat_fusion_start: 2 diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml index 5a1555dd..b391f50c 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml @@ -22,7 +22,7 @@ data: num_chunks_per_seg_epoch: 0.3 data_loader: - num_workers: 1 + num_workers: 8 val: dataset: aug_cfgs: @@ -45,12 +45,13 @@ data: weight_exponent: 0.3 num_chunks_per_seg_epoch: 1.0 data_loader: - num_workers: 1 + num_workers: 8 model: transducer: decoder: prune_range: 15 - override_dropouts: false + joiner: + joiner_type: film_joiner trainer: optim: opt_type: sgd diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml index 72a4c6a6..208a094c 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v4.4.yaml @@ -19,10 +19,10 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.1 + num_chunks_per_seg_epoch: 0.05 data_loader: - num_workers: 8 + num_workers: 4 val: dataset: aug_cfgs: @@ -45,7 +45,7 @@ data: weight_exponent: 0.3 num_chunks_per_seg_epoch: 1.0 data_loader: - num_workers: 8 + num_workers: 4 model: hf_feats: pretrained_model_path: facebook/wav2vec2-xls-r-300m diff --git 
a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml index 8947cfd0..6d7317f7 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v5.1.yaml @@ -77,7 +77,7 @@ trainer: lrsch_type: exp_lr decay_rate: 0.8 decay_steps: 45000 - hold_steps: 90000 + hold_steps: 40000 min_lr: 4e-5 warmup_steps: 3000 update_lr_on_opt_step: true diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml index aaf5dedb..7a5b5dd1 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v4.2.1.yaml @@ -19,7 +19,7 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.1 + num_chunks_per_seg_epoch: 0.05 data_loader: num_workers: 8 diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml index e5ae33a4..2833099f 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml @@ -9,8 +9,8 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 40. - max_audio_length: 20. + max_batch_length: 15. + max_audio_length: 15. min_batch_size: 1 drop_last: false # for class_weighted_random_bucketing_seg_sampler @@ -18,7 +18,7 @@ data: weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.3 + num_chunks_per_seg_epoch: 0.05 data_loader: num_workers: 1 @@ -32,8 +32,8 @@ data: sampler: #sampler_type: 'bucketing_seg_sampler' sampler_type: 'class_weighted_random_bucketing_seg_sampler' - max_batch_length: 40. - max_audio_length: 20. + max_batch_length: 15. + max_audio_length: 15. 
min_batch_size: 1 drop_last: true # for class_weighted_random_bucketing_seg_sampler @@ -71,4 +71,4 @@ trainer: # eff_batch_size: 1024 eff_batch_size: 128 train_mode: full - \ No newline at end of file + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml index d270d62c..221698d0 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.0.yaml @@ -9,14 +9,14 @@ data: wav_scale: 1 sampler: sampler_type: 'class_weighted_random_seg_chunk_sampler' - min_batch_size: 32 + min_batch_size: 24 max_chunk_length: 3.0 min_chunk_length: 3.0 # weighted weight_mode: "data-prior" class_name: "language" weight_exponent: 0.3 - num_chunks_per_seg_epoch: 0.1 + num_chunks_per_seg_epoch: 0.2 data_loader: num_workers: 8 val: @@ -29,7 +29,7 @@ data: wav_scale: 1 sampler: sampler_type: 'class_weighted_random_seg_chunk_sampler' - min_batch_size: 32 + min_batch_size: 24 max_chunk_length: 3.0 min_chunk_length: 3.0 # weighted diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml index 716a9d8f..4d6b8bed 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.1.yaml @@ -59,8 +59,10 @@ model: # loss_class_weight_exp: 1.0 # 0~1 loss_weight_transducer: 1.0 - loss_weight_lid: 10.0 - loss_weight_embed: 10 + loss_weight_lid: 20.0 + loss_weight_embed: 20 + loss_reg_weight_transducer: 0.0 + loss_reg_weight_lid: 10.0 # lid_length: 3.0 # feat_fusion_method_transducer: film-fused-feature diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml index 2f625da0..4197c653 100644 --- a/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr_rnnt_k2_pruned_film_transducer_ecapadnn512x3_1layer_stage2_v2.2.yaml @@ -61,8 +61,8 @@ model: loss_weight_transducer: 1.0 loss_weight_lid: 0.0 loss_weight_embed: 10 - loss_reg_weight_transducer: 0.5 - loss_reg_weight_lid: 0.0 + loss_reg_weight_transducer: 0.0 + loss_reg_weight_lid: 1.0 # lid_length: 3.0 # feat_fusion_method_transducer: film-fused-feature diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml index 27132c2d..a647c80b 100644 --- a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml @@ -28,6 +28,7 @@ languageid: multilayer_concat: true endpoint_channels: 3072 hid_act: swish + dropout_rate: 0.1 pool_net: pool_type: ch-wise-att-mean+stddev inner_feats: 128 diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml index 63c914e3..803dc396 100644 --- 
a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter_do0.2.yaml @@ -32,7 +32,7 @@ languageid: pool_type: ch-wise-att-mean+stddev inner_feats: 128 embed_dim: 128 - loss_type: subcenter-arc-softmax + loss_type: arc-softmax num_subcenters: 2 cos_scale: 32.0 margin: 0. diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml index 5ca98bd9..86d1e7c0 100644 --- a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml @@ -27,6 +27,7 @@ languageid: multilayer: true multilayer_concat: true endpoint_channels: 1536 + dropout_rate: 0.1 pool_net: pool_type: ch-wise-att-mean+stddev inner_feats: 128 diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh index 7d0ed120..9b398388 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.1_13langs.sh @@ -29,7 +29,7 @@ nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name nnet_s1=$nnet_s1_dir/model_ep0014.pth - + nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v4.1.yaml nnet_s2_args="" nnet_s2_name=${hf_model_name}_resnet1d_v4.1_13_langs.s2 diff --git a/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh index 49721635..1989a904 100644 --- a/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_lid_v4.2_13langs.sh @@ -23,15 +23,14 @@ nnet_type=hf_wav2vec2resnet1d nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage1_v4.2.yaml nnet_s1_args="" - nnet_name=${hf_model_name}_resnet1d_v4.2_13_langs nnet_s1_name=$nnet_name.s1 - nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0008.pth +nnet_s1=$nnet_s1_dir/model_ep0003.pth + nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapadnn1024x3_stage2_v4.2.yaml nnet_s2_args="" -nnet_s2_name=${hf_model_name}_resnet1d_v2_13_langs.s2 +nnet_s2_name=${hf_model_name}_resnet1d_v4.2_13_langs.s2 nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh index b0ed4451..aca7859c 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v1.0_13langs.sh @@ -40,7 +40,7 @@ nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage2_v1.0.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0020.pth +nnet_s2=$nnet_s2_dir/model_ep0003.pth nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml nnet_s3_args="" diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh index ab3d1ec8..951be9e0 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v5.1_13langs.sh @@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio 
dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs_weighted diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh index f43b323f..43bc7282 100644 --- a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_13langs.sh @@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=13_langs_train_proc_audio dev_data=13_langs_dev_proc_audio -test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio de_test_proc_audio fr_test_proc_audio en_test_proc_audio kab_test_proc_audio it_test_proc_audio" lans="sl ga-IE cv br tr cy tt ca kab de fr it en" language=13_langs_weighted @@ -34,7 +34,7 @@ nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0020.pth +nnet_s2=$nnet_s2_dir/model_ep0014.pth nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml nnet_s3_args="" diff --git a/egs/commonvoice/v1/local/initailize_lid_model.py b/egs/commonvoice/v1/local/initailize_lid_model.py index 22e32bed..1862333c 100644 --- a/egs/commonvoice/v1/local/initailize_lid_model.py +++ b/egs/commonvoice/v1/local/initailize_lid_model.py @@ -17,7 +17,9 @@ def copy_model_parameters(ASR_model, LID_model): ASR_state_dict = ASR_model["model_state_dict"] LID_state_dict = LID_model["model_state_dict"] - #ASR_state_dict = {name.replace("module.", ""): param for name, param in ASR_state_dict.items()} + # LID_state_dict = {name.replace("module.", ""): param for name, param in LID_state_dict.items()} + + # ASR_state_dict = {name.replace("module.", ""): param for name, param in ASR_state_dict.items()} update_state_dict = {name: param for name, param in ASR_state_dict.items() if name in LID_state_dict and param.shape == LID_state_dict[name].shape and "hf_feats" in name} # remove feature fuser diff --git a/egs/commonvoice/v1/run_011_train_asr.sh b/egs/commonvoice/v1/run_011_train_asr.sh index 55cb04a3..b6a50e7f 100755 --- a/egs/commonvoice/v1/run_011_train_asr.sh +++ b/egs/commonvoice/v1/run_011_train_asr.sh @@ -75,6 +75,7 @@ if [ $stage -le 1 ]; then --trainer.exp-path $nnet_s1_dir $args \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1236 \ --num-gpus $ngpu fi diff --git a/egs/commonvoice/v1/run_012_train_lid.sh b/egs/commonvoice/v1/run_012_train_lid.sh index 
3b250e16..bf14500e 100755 --- a/egs/commonvoice/v1/run_012_train_lid.sh +++ b/egs/commonvoice/v1/run_012_train_lid.sh @@ -7,8 +7,10 @@ . ./path.sh set -e +# export CUDA_VISIBLE_DEVICES=3 + stage=1 -ngpu=2 +ngpu=4 config_file=default_config.sh interactive=false num_workers="" @@ -48,19 +50,20 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2languageid.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s1_dir $args \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --master-port 1234 \ --num-gpus $ngpu fi @@ -77,13 +80,13 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2languageid.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.text-file $val_dir/text \ @@ -108,13 +111,13 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2languageid.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $val_dir/langs \ diff --git a/egs/commonvoice/v1/run_015_train_film_asr.sh b/egs/commonvoice/v1/run_015_train_film_asr.sh index 638384bb..e86cf62d 100755 --- a/egs/commonvoice/v1/run_015_train_film_asr.sh +++ b/egs/commonvoice/v1/run_015_train_film_asr.sh @@ -60,13 +60,13 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2rnn_film_transducer.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ 
--data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $train_dir/langs \ @@ -91,13 +91,13 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2rnn_film_transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $train_dir/langs \ @@ -123,13 +123,13 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2rnn_film_transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $train_dir/langs \ diff --git a/egs/commonvoice/v1/run_020_train_asr_lid.sh b/egs/commonvoice/v1/run_020_train_asr_lid.sh index 6a4b3252..a2422eb5 100755 --- a/egs/commonvoice/v1/run_020_train_asr_lid.sh +++ b/egs/commonvoice/v1/run_020_train_asr_lid.sh @@ -61,13 +61,13 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2rnn_transducer_languageid.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $train_dir/langs \ @@ -92,13 +92,13 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer_languageid.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ 
+ --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.class-names "language" \ --data.train.dataset.class-files $train_dir/langs \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.class-names "language" \ --data.val.dataset.class-files $train_dir/langs \ @@ -125,11 +125,11 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2seg.csv \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2seg.csv \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ diff --git a/egs/commonvoice/v1/run_025_train_film_asr_lid.sh b/egs/commonvoice/v1/run_025_train_film_asr_lid.sh index f5976ee1..9058ee5a 100755 --- a/egs/commonvoice/v1/run_025_train_film_asr_lid.sh +++ b/egs/commonvoice/v1/run_025_train_film_asr_lid.sh @@ -104,8 +104,7 @@ if [ $stage -le 2 ]; then --data.val.dataset.class-files $train_dir/langs \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ - --in-model-transducer $nnet_transducer \ - --in-model-lid $nnet_lid \ + --in-model-file $nnet_s1 \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ --num-gpus $ngpu diff --git a/egs/commonvoice/v1/run_030_inference.sh b/egs/commonvoice/v1/run_030_inference.sh index ec5b140b..9c5eaaa4 100755 --- a/egs/commonvoice/v1/run_030_inference.sh +++ b/egs/commonvoice/v1/run_030_inference.sh @@ -40,7 +40,7 @@ transducer_dir=exp/transducer/$nnet_name # Extracts x-vectors for evaluation for name in $test_data do - nj=40 + nj=20 steps_transducer/decode_wav2vec2rnn_transducer.sh \ --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ $nnet data/$name \ diff --git a/egs/commonvoice/v1/run_031_inference_film.sh b/egs/commonvoice/v1/run_031_inference_film.sh index 7b796107..d8af0e1b 100755 --- a/egs/commonvoice/v1/run_031_inference_film.sh +++ b/egs/commonvoice/v1/run_031_inference_film.sh @@ -40,7 +40,7 @@ transducer_dir=exp/transducer/$nnet_name # Extracts x-vectors for evaluation for name in $test_data do - nj=40 + nj=16 steps_transducer/decode_wav2vec2rnn_film_transducer.sh \ --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ $nnet data/$name \ diff --git a/hyperion/bin/finetune_wav2vec2rnn_transducer.py b/hyperion/bin/finetune_wav2vec2rnn_transducer.py index 4092ecd7..64d352e0 100755 --- a/hyperion/bin/finetune_wav2vec2rnn_transducer.py +++ b/hyperion/bin/finetune_wav2vec2rnn_transducer.py @@ -18,7 +18,6 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory -from hyperion.torch.metrics import CategoricalAccuracy from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, HFWav2Vec2RNNTransducer) from 
hyperion.torch.trainers import TransducerTrainer as Trainer @@ -27,6 +26,10 @@ namespace_to_dict) from torch.nn.utils.rnn import pad_sequence +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + model_dict = { "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index 8e1653b1..1be2b456 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -25,6 +25,10 @@ from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, namespace_to_dict) +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, "hf_hubert2resnet1d": HFHubert2ResNet1dXVector, diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py index 84f2239c..77579c94 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -246,6 +246,9 @@ def unfreeze_film(self): if "film" in name: logging.info(f"unfreezing {name}") param.requires_grad = True + if "lang_embedding" in name: + logging.info(f"unfreezing {name}") + param.requires_grad = True def freeze_feat_fuser(self): if self.feat_fuser is None: diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py index 7daeddcb..7ee44b01 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py @@ -339,13 +339,13 @@ def forward( if i in return_feat_layers ] - loss_reg_lid = 0 - if self.loss_reg_weight_lid > 0: - loss_reg_lid = self.languageid.get_regularization_loss() + # loss_reg_lid = 0 + # if self.loss_reg_weight_lid > 0: + loss_reg_lid = self.languageid.get_regularization_loss() - loss_reg_transducer = 0 - if self.loss_reg_weight_transducer > 0: - loss_reg_transducer = self.transducer.get_regularization_loss() + # loss_reg_transducer = 0 + # if self.loss_reg_weight_transducer > 0: + loss_reg_transducer = self.transducer.get_regularization_loss() @@ -353,6 +353,8 @@ def forward( loss_transducer=trans_output.loss, loss_lid=loss_lid, loss_embed=loss_embed, + loss_reg_lid=loss_reg_lid, + loss_reg_transducer=loss_reg_transducer, loss_transducer_simple=trans_output.loss_simple, loss_transducer_pruned=trans_output.loss_pruned, h_feats=trans_output.h_feats, diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index fe6cee1d..6b608368 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -23,6 +23,8 @@ class RNNTransducerLanguageIDOutput(HypDataClass): loss_transducer: torch.Tensor # Loss from the transducer loss_lid: torch.Tensor # Loss from the language ID loss_embed: Optional[torch.Tensor] = None # Loss from the embedding + loss_reg_lid: Optional[torch.Tensor] = None # Regularization 
loss from the language ID + loss_reg_transducer: Optional[torch.Tensor] = None # Regularization loss from the transducer loss_transducer_simple: Optional[torch.Tensor] = None # Simple loss from the transducer, if available loss_transducer_pruned: Optional[torch.Tensor] = None # Pruned loss from the transducer, if available h_feats: Optional[List[torch.Tensor]] = None # Hidden features, if available diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index d67785d2..04895971 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -558,6 +558,21 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) + + def get_regularization_loss(self): + reg_loss = 0.0 + total_params = 0 + + for param in self.parameters(): + reg_loss += torch.norm(param)**2 + total_params += torch.numel(param) + + reg_loss = (reg_loss) / total_params + + return reg_loss + + + @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 9d030ae7..25f0c5f6 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -309,10 +309,14 @@ def forward( self, x: torch.Tensor, x_lengths: torch.Tensor, y: k2.RaggedTensor, lang_embedding: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: # embed lang - if self.film_cond_type == ["one-hot"]: + # logging.info(f"lang_embedding.shape: {lang_embedding.shape}") + # import pdb; pdb.set_trace() + if self.film_cond_type == "one-hot": lang_embedding = self.lang_embedding(lang_embedding) - elif self.film_cond_type == ["lid_pred"]: + elif self.film_cond_type == "lid_pred": lang_embedding = self.lid_lang_embedding(lang_embedding) + # logging.info(f"lang_embedding.shape: {lang_embedding.shape}") + # logging.info(f"film_cond_type: {self.film_cond_type}") # get y_lengths row_splits = y.shape.row_splits(1) y_lengths = row_splits[1:] - row_splits[:-1] @@ -348,9 +352,9 @@ def decode(self, # if self.film_cond_type in ["one-hot", "lid_pred"]: # lang_embedding = self.lang_embedding(lang) - if self.film_cond_type == ["one-hot"]: + if self.film_cond_type == "one-hot": lang_embedding = self.lang_embedding(lang) - elif self.film_cond_type == ["lid_pred"]: + elif self.film_cond_type == "lid_pred": lang_embedding = self.lid_lang_embedding(lang) if method == "time_sync_beam_search": return self.decode_time_sync_beam_search(x, From 562498f69dca3cfab24a8ee452a1e86c58ee85c0 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Wed, 5 Jul 2023 18:02:26 -0400 Subject: [PATCH 69/89] update decode code --- .../hf_wav2rnn_film_transducer_languageid.py | 27 ++++++++++++++----- .../hf_wav2rnn_transducer_languageid.py | 2 +- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py index 7daeddcb..9697d32c 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_film_transducer_languageid.py @@ -381,29 +381,42 @@ def infer(self, feats_languageid, hid_feats, feat_lengths = 
self.forward_lid_feats( - x, x_lengths, return_feat_layers) + x, x_lengths, None) + # logging.info(f"feat_lengths: {feat_lengths}") + # logging.info(f"feats_languageid.shape: {feats_languageid.shape}") + # logging.info(f"feats_languageid: {feats_languageid}") - lid = self.languageid( - feats_languageid.float(), - feat_lengths, + output = self.languageid( + feats_languageid, + None, None, return_enc_layers=None, - return_classif_layers=None, + return_classif_layers=[0], return_logits=True, ) + + # output = self.languageid( + # feats_languageid, + # feat_lengths, + # None, + # return_enc_layers=None, + # return_classif_layers=[0], + # return_logits=True, + # ) - feats_transducer = self._fuse_transducer_hid_feats(hid_feats, lid) # (N, T, C) + feats_transducer = self._fuse_transducer_hid_feats(hid_feats, output["h_classif"][0]) # (N, T, C) text = self.transducer.infer(feats_transducer, feat_lengths, + lang=output["h_classif"][0], decoding_method=decoding_method, beam_width=beam_width, max_sym_per_frame=max_sym_per_frame, max_sym_per_utt=max_sym_per_utt) - return text, lid + return text, output["logits"] def unfreeze_lid_film(self): for name, param in self.named_parameters(): diff --git a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py index 952cbb65..278b09ad 100644 --- a/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py +++ b/hyperion/torch/models/wav2transducer_languageid/hf_wav2rnn_transducer_languageid.py @@ -321,7 +321,7 @@ def infer(self, # logging.info(f"feats_languageid: {feats_languageid}") lid = self.languageid( feats_languageid.float(), - feat_lengths, + None, None, return_enc_layers=None, return_classif_layers=None, From 458e65ed9918fe082aabf0f9fb59a29d394addb6 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 7 Jul 2023 03:14:36 +0000 Subject: [PATCH 70/89] add rnn_original for film-rnn --- .../narchs/rnn_film_transducer_decoder.py | 58 ++++++++++++++----- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/hyperion/torch/narchs/rnn_film_transducer_decoder.py b/hyperion/torch/narchs/rnn_film_transducer_decoder.py index 94fe4b17..17bbe515 100644 --- a/hyperion/torch/narchs/rnn_film_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_film_transducer_decoder.py @@ -22,7 +22,8 @@ from ...utils.text import add_sos from ..layer_blocks import TransducerFiLMJoiner as FiLMJoiner from ..layer_blocks import TransducerJoiner as Joiner -from ..layer_blocks import TransducerRNNFiLMPredictor as RNNPredictor +from ..layer_blocks import TransducerRNNFiLMPredictor as FiLMRNNPredictor +from ..layer_blocks import TransducerRNNPredictor as RNNPredictor from .net_arch import NetArch @@ -125,9 +126,13 @@ def _make_predictor(self): self.predictor_args["condition_size"] = self.condition_size # Add FiLM args to the predictor args if pred_type == "rnn": + pred_args = filter_func_args(FiLMRNNPredictor.__init__, + self.predictor_args) + self.predictor = FiLMRNNPredictor(**pred_args, film_type=self.film_type, film_cond_type=self.film_cond_type) + elif pred_type == "rnn_original": pred_args = filter_func_args(RNNPredictor.__init__, self.predictor_args) - self.predictor = RNNPredictor(**pred_args, film_type=self.film_type, film_cond_type=self.film_cond_type) + self.predictor = RNNPredictor(**pred_args) # elif pred_type == "conv": # pred_args = filter_func_args(ConvPredictor.__init__, # self.predictor_args) @@ -326,7 +331,10 @@ def forward( 
sos_y_padded = sos_y.pad(mode="constant", padding_value=self.blank_id) sos_y_padded = sos_y_padded.to(torch.int64) # apply predictor and joiner - pred_out, _ = self.predictor(sos_y_padded, lang_embedding) + if self.predictor_args["pred_type"] == "rnn": + pred_out, _ = self.predictor(sos_y_padded, lang_embedding) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, _ = self.predictor(sos_y_padded) loss_simple = loss_pruned = None if self.rnnt_loss == "k2_pruned": loss, loss_simple, loss_pruned = self._rnnt_loss_k2_pruned( @@ -399,7 +407,10 @@ def decode_greedy(self, sos = torch.tensor([blank_id], device=device, dtype=torch.int64).reshape(1, 1) - pred_out, (h, c) = self.predictor(sos, lang_embedding) + if self.predictor_args["pred_type"] == "rnn": + pred_out, (h, c) = self.predictor(sos, lang_embedding) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, (h, c) = self.predictor(sos) T = x.size(1) t = 0 hyp = [] @@ -422,8 +433,11 @@ def decode_greedy(self, if y != blank_id: hyp.append(y.item()) y = y.reshape(1, 1) - pred_out, (h, c) = self.predictor(y, lang_embedding, (h, c)) - + if self.predictor_args["pred_type"] == "rnn": + pred_out, (h, c) = self.predictor(y, lang_embedding, (h, c)) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, (h, c) = self.predictor(y, (h, c)) + sym_per_utt += 1 sym_per_frame += 1 @@ -445,7 +459,10 @@ def decode_time_sync_beam_search(self, device = x.device sos = torch.tensor([blank_id], device=device).reshape(1, 1) - pred_out, (h, c) = self.predictor(sos, lang_embedding) + if self.predictor_args["pred_type"] == "rnn": + pred_out, (h, c) = self.predictor(sos, lang_embedding) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, state = self.predictor(sos) T = x.size(1) t = 0 B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] @@ -472,11 +489,20 @@ def decode_time_sync_beam_search(self, pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) - pred_out, pred_state = self.predictor( + # pred_out, pred_state = self.predictor( + # pred_in, + # lang_embedding, + # y_star.pred_state, + # ) + if self.predictor_args["pred_type"] == "rnn": + pred_out, pred_state = self.predictor( pred_in, lang_embedding, y_star.pred_state, - ) + ) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, pred_state = self.predictor(pred_in, y_star.pred_state,) + cache[cached_key] = (pred_out, pred_state) else: pred_out, pred_state = cache[cached_key] @@ -572,7 +598,11 @@ def decode_align_length_sync_beam_search( device = x.device sos = torch.tensor([blank_id], device=device).reshape(1, 1) - pred_out, (h, c) = self.predictor(sos, lang_embedding) + if self.predictor_args["pred_type"] == "rnn": + pred_out, (h, c) = self.predictor(sos, lang_embedding) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, state = self.predictor(sos) + T = x.size(1) #t = 0 B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] @@ -602,12 +632,14 @@ def decode_align_length_sync_beam_search( if cached_key not in cache: pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) - - pred_out, pred_state = self.predictor( + if self.predictor_args["pred_type"] == "rnn": + pred_out, pred_state = self.predictor( pred_in, lang_embedding, y_star.pred_state, - ) + ) + elif self.predictor_args["pred_type"] == "rnn_original": + pred_out, pred_state = self.predictor(pred_in, y_star.pred_state,) cache[cached_key] = (pred_out, pred_state) else: pred_out, pred_state = cache[cached_key] 
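Note on the patch above: it threads a `pred_type` check through every predictor call so the decoder can run either the FiLM-conditioned predictor ("rnn") or the unconditioned one ("rnn_original"). A minimal sketch of how that dispatch could be factored into a single helper, assuming the predictors keep the signatures used in the hunks above (the FiLM predictor takes `(y, lang_embedding, states)`, the plain one `(y, states)`, with `states` defaulting to None); the helper name `_run_predictor` is hypothetical and not part of the patch:

    def _run_predictor(self, y, lang_embedding, states=None):
        # Dispatch on the configured predictor type (mirrors the if/elif
        # branches added throughout forward()/decode_*() in this patch).
        if self.predictor_args["pred_type"] == "rnn":
            # FiLM-conditioned RNN predictor: consumes the language embedding.
            return self.predictor(y, lang_embedding, states)
        elif self.predictor_args["pred_type"] == "rnn_original":
            # Plain RNN predictor: ignores the language conditioning.
            return self.predictor(y, states)
        raise ValueError(f"unknown pred_type: {self.predictor_args['pred_type']}")

Each call site would then reduce to `pred_out, state = self._run_predictor(y, lang_embedding, state)`.
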
From c1d193abd8161a35017d316382b6025ef2c22db0 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 19 Jul 2023 16:36:06 -0400 Subject: [PATCH 71/89] finished experiments of models 2.0 in voxceleb/v2 --- egs/voxceleb/v1.2/run_001_prepare_data.sh | 18 +---- egs/voxceleb/v2/README.md | 60 +++++++++++++++ ...lsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++++++++++++++ ...c2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++++++++++++++ ...baseplus9l_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++++++++++++++ ...baseplus_ecapatdnn512x3_stage1_v2.0_0.yaml | 59 +++++++++++++++ ...lmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml | 2 +- ...lmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml | 2 +- ...lmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++++++++++++++ ...lmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml | 63 ++++++++++++++++ ...lmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml | 73 +++++++++++++++++++ ...wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++++++++++++++ ...wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml | 63 ++++++++++++++++ ...wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml | 73 +++++++++++++++++++ ...v2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml | 45 ++++++++++++ .../wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml | 44 +++++++++++ .../wavlmlarge12l_ecapatdnn512x3_v2.0.yaml | 45 ++++++++++++ .../conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml | 44 +++++++++++ ...wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | 54 ++++++++++++++ ...ig_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | 54 ++++++++++++++ ...fig_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | 54 ++++++++++++++ ...onfig_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | 54 ++++++++++++++ .../config_wavlmlarge_ecapatdnn512x3_v2.0.sh | 54 ++++++++++++++ hyp_utils/create_data_split_dirs.sh | 4 +- hyp_utils/create_data_split_links.sh | 6 +- hyp_utils/feats/make_evad.sh | 2 +- hyperion/bin/compute_energy_vad.py | 37 ++++++++-- hyperion/io/ark_data_writer.py | 20 +++-- hyperion/io/audio_reader.py | 12 ++- hyperion/io/data_rw_factory.py | 8 +- hyperion/io/data_writer.py | 36 ++++++++- hyperion/io/h5_data_writer.py | 19 +++-- hyperion/io/rw_specifiers.py | 47 ++++++++---- hyperion/utils/__init__.py | 1 + 34 files changed, 1281 insertions(+), 67 deletions(-) create mode 100644 egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh create mode 100644 
egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh diff --git a/egs/voxceleb/v1.2/run_001_prepare_data.sh b/egs/voxceleb/v1.2/run_001_prepare_data.sh index c151e270..aef70e96 100755 --- a/egs/voxceleb/v1.2/run_001_prepare_data.sh +++ b/egs/voxceleb/v1.2/run_001_prepare_data.sh @@ -23,34 +23,24 @@ fi if [ $stage -le 2 ];then # prepare voxceleb1 for test - # hyp_utils/conda_env.sh prepare_data.py voxceleb1 --task test --corpus-dir $voxceleb1_root \ --use-kaldi-ids \ --output-dir data/voxceleb1_test - #local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data fi if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then prepare_data.py voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ --vox1-corpus-dir $voxceleb1_root \ --output-dir data/voxsrc22_dev - # local/prepare_voxsrc22_dev.py \ - # --vox1-corpus-dir $voxceleb1_root \ - # --voxsrc22-corpus-dir $voxsrc22_root \ - # --output-dir data/voxsrc22_dev - prepare_data.py voxsrc22 --subset test --corpus-dir $voxsrc22_root \ - --vox1-corpus-dir $voxceleb1_root \ - --output-dir data/voxsrc22_test fi # if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then -# local/prepare_voxsrc22_test.py \ -# --corpus-dir $voxsrc22_root \ -# --output-dir data/voxsrc22_test +# prepare_data.py voxsrc22 --subset test --corpus-dir $voxsrc22_root \ +# --vox1-corpus-dir $voxceleb1_root \ +# --output-dir data/voxsrc22_test # fi if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then - # # split vox2 into 2 parts, for cohort and qmf training + # split vox2 into 2 parts, for cohort and qmf training split_dataset_into_trials_and_cohort.py --data-dir data/voxceleb2cat_train - #local/make_vox2_trials.py --data-dir data/voxceleb2cat_train fi diff --git a/egs/voxceleb/v2/README.md b/egs/voxceleb/v2/README.md index c64a4b41..a005b6e8 100644 --- a/egs/voxceleb/v2/README.md +++ b/egs/voxceleb/v2/README.md @@ -78,6 +78,21 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.060 | 0.116 | | | | | Cosine + AS-Norm | 0.81 | 0.058 | 0.108 | | | | | Cosine + QMF | 0.75 | 0.054 | 0.086 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.89 | 0.069 | 0.108 | +| | | | Cosine + AS-Norm | 0.86 | 0.067 | 0.108 | +| | | | Cosine + QMF | 0.77 | 0.066 | 0.105 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.74 | 0.057 | 0.085 | +| | | | Cosine + AS-Norm | 0.73 | 0.055 | 0.093 | +| | | | Cosine + QMF | 0.66 | 0.051 | 0.094 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.74 | 0.053 | 0.080 | +| | | | Cosine + AS-Norm | 0.71 | 0.050 | 0.087 | +| | | | Cosine + QMF | 0.64 | 0.045 | 0.087 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.063 | 0.111 | +| | | | Cosine + AS-Norm | 0.68 | 0.053 | 0.090 | +| | | | Cosine + QMF | 0.63 | 0.048 | 0.071 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | 
Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.14 | 0.074 | 0.107 | +| | | | Cosine + AS-Norm | 0.94 | 0.060 | 0.089 | +| | | | Cosine + QMF | 0.89 | 0.054 | 0.076 | ### VoxCeleb 1 Entire-Clean trial list @@ -86,6 +101,21 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.81 | 0.051 | 0.087 | | | | | Cosine + AS-Norm | 0.78 | 0.047 | 0.083 | | | | | Cosine + QMF | 0.75 | 0.046 | 0.076 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.89 | 0.056 | 0.099 | +| | | | Cosine + AS-Norm | 0.86 | 0.053 | 0.090 | +| | | | Cosine + QMF | 0.82 | 0.050 | 0.085 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.80 | 0.049 | 0.088 | +| | | | Cosine + AS-Norm | 0.76 | 0.045 | 0.080 | +| | | | Cosine + QMF | 0.73 | 0.043 | 0.078 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.91 | 0.056 | 0.094 | +| | | | Cosine + AS-Norm | 0.87 | 0.053 | 0.090 | +| | | | Cosine + QMF | 0.83 | 0.050 | 0.086 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.80 | 0.050 | 0.086 | +| | | | Cosine + AS-Norm | 0.73 | 0.045 | 0.074 | +| | | | Cosine + QMF | 0.69 | 0.042 | 0.069 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.99 | 0.058 | 0.103 | +| | | | Cosine + AS-Norm | 0.87 | 0.052 | 0.090 | +| | | | Cosine + QMF | 0.83 | 0.050 | 0.085 | ### VoxCeleb 1 Hard-Clean trial list @@ -94,6 +124,21 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.73 | 0.113 | 0.182 | | | | | Cosine + AS-Norm | 1.63 | 0.100 | 0.160 | | | | | Cosine + QMF | 1.56 | 0.096 | 0.155 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.88 | 0.122 | 0.200 | +| | | | Cosine + AS-Norm | 1.77 | 0.110 | 0.175 | +| | | | Cosine + QMF | 1.66 | 0.104 | 0.168 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.67 | 0.103 | 0.165 | +| | | | Cosine + AS-Norm | 1.54 | 0.093 | 0.152 | +| | | | Cosine + QMF | 1.45 | 0.089 | 0.145 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.78 | 0.106 | 0.174 | +| | | | Cosine + AS-Norm | 1.70 | 0.099 | 0.162 | +| | | | Cosine + QMF | 1.61 | 0.094 | 0.153 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.49 | 0.087 | 0.137 | +| | | | Cosine + AS-Norm | 1.29 | 0.074 | 0.117 | +| | | | Cosine + QMF | 1.22 | 0.069 | 0.111 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.84 | 0.107 | 0.172 | +| | | | Cosine + AS-Norm | 1.47 | 0.083 | 0.128 | +| | | | Cosine + QMF | 
1.39 | 0.079 | 0.123 | ### VoxSRC2022 dev @@ -102,3 +147,18 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.60 | 0.163 | 0.257 | | | | | Cosine + AS-Norm | 2.43 | 0.150 | 0.244 | | | | | Cosine + QMF | 2.31 | 0.143 | 0.232 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.82 | 0.183 | 0.286 | +| | | | Cosine + AS-Norm | 2.69 | 0.168 | 0.265 | +| | | | Cosine + QMF | 2.52 | 0.158 | 0.252 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.65 | 0.176 | 0.289 | +| | | | Cosine + AS-Norm | 2.55 | 0.171 | 0.292 | +| | | | Cosine + QMF | 2.38 | 0.159 | 0.266 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.62 | 0.153 | 0.251 | +| | | | Cosine + AS-Norm | 2.53 | 0.149 | 0.247 | +| | | | Cosine + QMF | 0.242 | 0.144 | 0.231 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.225 | +| | | | Cosine + AS-Norm | 2.01 | 0.125 | 0.209 | +| | | | Cosine + QMF | 1.92 | 0.117 | 0.200 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.83 | 0.175 | 0.276 | +| | | | Cosine + AS-Norm | 2.31 | 0.149 | 0.244 | +| | | | Cosine + QMF | 2.22 | 0.137 | 0.229 | diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..ad991124 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..254ff796 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - 
conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..52be6db5 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml new file mode 100644 index 00000000..ebeedde6 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: 
wavlmbaseplus_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4850 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-4 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml index eed0ad1f..69a8322b 100644 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -58,6 +58,6 @@ trainer: update_lr_on_opt_step: true use_amp: true log_interval: 1000 - epochs: 30 + epochs: 8 eff_batch_size: 512 train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml index d66d6877..3443591a 100644 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -68,6 +68,6 @@ trainer: update_lr_on_opt_step: true use_amp: true log_interval: 1000 - epochs: 8 + epochs: 4 eff_batch_size: 256 train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..abe5da6e --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmlarge12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..7287188c --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + 
dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..3443591a --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..2addaa1e --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: 
wavlmlarge_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..69a8322b --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..5e1260ad --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 
1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..c3466259 --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..dc3737e3 --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..5025f047 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..0a6303f5 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ 
+hf_feats: + pretrained_model_path: microsoft/wavlm-large +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..67a4665e --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params layers 2-12 + +# hugging face model +hf_model_name=wav2vec2xlsr300m12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..80ee785b --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name 
+nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..c2b30f68 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus9l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..530096cc --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 
+plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..1b276bcd --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/hyp_utils/create_data_split_dirs.sh b/hyp_utils/create_data_split_dirs.sh index 06c30779..b8aad6c8 100755 --- a/hyp_utils/create_data_split_dirs.sh +++ b/hyp_utils/create_data_split_dirs.sh @@ -6,7 +6,7 @@ storage_name=$(date +'%m_%d_%H_%M') -echo "$0 $@" # Print the command line for logging + if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; @@ -15,6 +15,7 @@ if [ $# -ne 3 ]; then echo "Usage: $0 " echo "$0 exp/vad_dir $USER/hyp-data/voxceleb/v1/vad/storage b0" fi + output_dir=$1 storage_dir=$2 nodes=$3 @@ -22,6 +23,7 @@ nodes=$3 link_dir=$output_dir/storage if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $linkdir ]; then + echo "$0 $@" # Print the command line for logging echo "Prepare to distribute data over multiple $nodes nodes" dir_name=$storage_dir/$storage_name/storage if [ "$nodes" == "b0" ];then diff --git a/hyp_utils/create_data_split_links.sh b/hyp_utils/create_data_split_links.sh index fb5b8ca0..8416742e 100755 --- a/hyp_utils/create_data_split_links.sh +++ b/hyp_utils/create_data_split_links.sh @@ -6,11 +6,11 @@ storage_name=$(date +'%m_%d_%H_%M') -echo "$0 $@" # Print the command line for logging -if [ $# -ne 3 ]; then - echo "Usage: $0 < " +if [ $# -ne 2 ]; then + echo "Usage: $0 " echo "$0 exp/vad_dir/vad.JOB.ark 40" fi +echo "$0 $@" # Print the command line for logging output_file_pattern=$1 nj=$2 diff --git a/hyp_utils/feats/make_evad.sh b/hyp_utils/feats/make_evad.sh index 373fc4a6..16ddbf74 100755 --- a/hyp_utils/feats/make_evad.sh +++ b/hyp_utils/feats/make_evad.sh @@ -87,7 +87,7 @@ fi $cmd JOB=1:$nj $logdir/make_vad_${name}.JOB.log \ hyp_utils/conda_env.sh \ compute_energy_vad.py --cfg $vad_config $opt_args \ - --input $scp --output ark,scp:$vaddir/vad_$name.JOB.ark,$vaddir/vad_$name.JOB.scp \ + --recordings-file $scp --output-spec ark,scp:$vaddir/vad_$name.JOB.ark,$vaddir/vad_$name.JOB.scp \ --part-idx JOB --num-parts $nj || exit 1 # concatenate the .scp files together. diff --git a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py index e9773fff..9d50388c 100755 --- a/hyperion/bin/compute_energy_vad.py +++ b/hyperion/bin/compute_energy_vad.py @@ -13,19 +13,31 @@ from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.np.feats import EnergyVAD -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) -def compute_vad(input_path, output_path, write_num_frames, **kwargs): +def compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): vad_args = EnergyVAD.filter_args(**kwargs) vad = EnergyVAD(**vad_args) input_args = AR.filter_args(**kwargs) - reader = AR(input_path, **input_args) + reader = AR(recordings_file, **input_args) - writer = DWF.create(output_path) + metadata_columns = [ + "frame_shift", + "frame_length", + "num_frames", + "num_speech_frames", + "prob_speech", + ] + + writer = DWF.create(output_spec, metadata_columns=metadata_columns) if write_num_frames is not None: f_num_frames = open(write_num_frames, "w") @@ -39,6 +51,7 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): rtf = vad.frame_shift * y.shape[0] / dt num_speech_frames = np.sum(y) prob_speech = num_speech_frames / y.shape[0] * 100 + logging.info( "Extracted VAD for %s detected %d/%d (%f %%) speech frames, elapsed-time=%.2f ms. 
real-time-factor=%.2f", key, @@ -48,7 +61,14 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): dt, rtf, ) - writer.write([key], [y]) + metadata = { + "frame_shift": vad.frame_shift, + "frame_length": vad.frame_length, + "num_frames": y.shape[0], + "num_speech_frames": num_speech_frames, + "prob_speech": prob_speech, + } + writer.write([key], [y], metadata) if write_num_frames is not None: f_num_frames.write("%s %d\n" % (key, y.shape[0])) @@ -63,9 +83,10 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): parser = ArgumentParser(description="Compute Kaldi Energy VAD") parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_path", required=True) - parser.add_argument("--output", dest="output_path", required=True) + parser.add_argument("--recordings-file", required=True) + parser.add_argument("--output-spec", required=True) parser.add_argument("--write-num-frames", default=None) + parser.add_argument("--write-stats", default=None) AR.add_class_args(parser) EnergyVAD.add_class_args(parser) diff --git a/hyperion/io/ark_data_writer.py b/hyperion/io/ark_data_writer.py index 6adf78b2..26f77112 100644 --- a/hyperion/io/ark_data_writer.py +++ b/hyperion/io/ark_data_writer.py @@ -3,10 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from typing import Union, Optional, List +from typing import Union, Optional, List, Dict import numpy as np - +import pandas as pd from ..hyp_defs import float_save from ..utils.kaldi_io_funcs import init_kaldi_output_stream, is_token, write_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix @@ -46,7 +46,10 @@ def __init__( self.f = open(archive_path, "w") if script_path is not None and not self.script_is_scp: - row = self.script_sep.join(["id", "storage_path", "storage_byte"]) + columns = ["id", "storage_path", "storage_byte"] + if self.metadata_columns is not None: + columns += self.metadata_columns + row = self.script_sep.join(columns) self.f_script.write(f"{row}\n") def __exit__(self, exc_type, exc_value, traceback): @@ -97,6 +100,7 @@ def write( self, keys: Union[str, List[str], np.array], data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, ): """Writes data to file. @@ -107,9 +111,7 @@ def write( it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. 
""" - if isinstance(keys, str): - keys = [keys] - data = [data] + keys, data, metadata = self.standardize_write_args(keys, data, metadata) for i, key_i in enumerate(keys): assert is_token(key_i), "Token %s not valid" % key_i @@ -125,7 +127,11 @@ def write( if self.script_is_scp: self.f_script.write(f"{key_i} {self.archive_path}:{pos}\n") else: - row = self.script_sep.join([key_i, self.archive_path, str(pos)]) + columns = [key_i, str(self.archive_path), str(pos)] + if metadata is not None: + metadata_i = [str(m[i]) for m in metadata] + columns += metadata_i + row = self.script_sep.join(columns) self.f_script.write(f"{row}\n") if self._flush: diff --git a/hyperion/io/audio_reader.py b/hyperion/io/audio_reader.py index 1052ce8c..6c152cc5 100644 --- a/hyperion/io/audio_reader.py +++ b/hyperion/io/audio_reader.py @@ -346,7 +346,9 @@ def read(self, num_records: int = 0, time_offset: float = 0, time_durs: float = key = segment["id"] x_i, fs_i = self._read_segment(segment, offset_i, dur_i) else: - key, file_path = self.recordings.iloc[self.cur_item] + segment = self.recordings.iloc[self.cur_item] + key = segment["id"] + file_path = segment["storage_path"] x_i, fs_i = self.read_wavspecifier( file_path, self.wav_scale, offset_i, dur_i ) @@ -397,7 +399,8 @@ def add_class_args(parser, prefix: Optional[str] = None): if prefix is not None: outer_parser.add_argument( - "--" + prefix, action=ActionParser(parser=parser), + "--" + prefix, + action=ActionParser(parser=parser), ) add_argparse_args = add_class_args @@ -423,7 +426,7 @@ def read( Args: keys: List of recording/segment_ids names. time_offset: float or float list with time-offsets - time_durs: float or float list with durations + time_durs: float or float list with durations Returns: data: List of waveforms @@ -527,7 +530,8 @@ def add_class_args(parser, prefix: Optional[str] = None): ) if prefix is not None: outer_parser.add_argument( - "--" + prefix, action=ActionParser(parser=parser), + "--" + prefix, + action=ActionParser(parser=parser), ) add_argparse_args = add_class_args diff --git a/hyperion/io/data_rw_factory.py b/hyperion/io/data_rw_factory.py index b56e8c27..092f5549 100644 --- a/hyperion/io/data_rw_factory.py +++ b/hyperion/io/data_rw_factory.py @@ -30,7 +30,10 @@ class DataWriterFactory(object): @staticmethod def create( - wspecifier: PathLike, compress: bool = False, compression_method: str = "auto" + wspecifier: PathLike, + compress: bool = False, + compression_method: str = "auto", + metadata_columns: Optional[List[str]] = None, ): if isinstance(wspecifier, str): wspecifier = WSpecifier.create(wspecifier) @@ -47,6 +50,7 @@ def create( flush=wspecifier.flush, compress=compress, compression_method=compression_method, + metadata_columns=metadata_columns, ) else: return ADW( @@ -56,6 +60,7 @@ def create( flush=wspecifier.flush, compress=compress, compression_method=compression_method, + metadata_columns=metadata_columns, ) @staticmethod @@ -76,7 +81,6 @@ def add_class_args(parser, prefix: Optional[PathLike] = None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='data writer options') class SequentialDataReaderFactory(object): diff --git a/hyperion/io/data_writer.py b/hyperion/io/data_writer.py index 8adbf87a..ff35ef2a 100644 --- a/hyperion/io/data_writer.py +++ b/hyperion/io/data_writer.py @@ -5,9 +5,10 @@ import os from abc import ABCMeta, abstractmethod -from typing import Union, Optional, List +from typing import Union, Optional, List, Dict from pathlib import Path import 
numpy as np +import pandas as pd from ..utils import PathLike @@ -34,12 +35,14 @@ def __init__( flush: bool = False, compress: bool = False, compression_method: str = "auto", + metadata_columns: Optional[List[str]] = None, ): self.archive_path = Path(archive_path) self.script_path = Path(script_path) if script_path is not None else None self._flush = flush self.compress = compress self.compression_method = compression_method + self.metadata_columns = metadata_columns archive_dir = self.archive_path.parent archive_dir.mkdir(exist_ok=True, parents=True) @@ -56,9 +59,7 @@ def __init__( self.f_script = open(self.script_path, "w") else: self.script_sep = "," if script_ext == ".csv" else "\t" - self.f_script = open(self.script_path, "w", "utf-8") - row = self.script_sep.join(["id", "storage_path"]) - self.f_script.write(f"{row}\n") + self.f_script = open(self.script_path, "w", encoding="utf-8") def __enter__(self): """Function required when entering contructions of type @@ -87,11 +88,37 @@ def flush(self): """Flushes the file""" pass + def standardize_write_args( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, + ): + if isinstance(keys, str): + keys = [keys] + data = [data] + + if metadata is not None: + if isinstance(metadata, pd.DataFrame): + metadata = metadata.to_dict() + + metadata_list = [] + for c in self.metadata_columns: + m_c = metadata[c] + if not isinstance(m_c, (list, np.ndarray)): + m_c = [m_c] + metadata_list.append(m_c) + + metadata = metadata_list + + return keys, data, metadata + @abstractmethod def write( self, keys: Union[str, List[str], np.array], data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, ): """Writes data to file. @@ -101,5 +128,6 @@ def write( If all the matrices have the same dimension it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. + metadata: dictionary/DataFrame with metadata """ pass diff --git a/hyperion/io/h5_data_writer.py b/hyperion/io/h5_data_writer.py index c34aa0ca..4d05f963 100644 --- a/hyperion/io/h5_data_writer.py +++ b/hyperion/io/h5_data_writer.py @@ -3,10 +3,11 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from typing import Union, Optional, List +from typing import Union, Optional, List, Dict import h5py import numpy as np +import pandas as pd from ..hyp_defs import float_save from ..utils.kaldi_io_funcs import is_token @@ -37,7 +38,10 @@ def __init__( self.f = h5py.File(archive_path, "w") if script_path is not None and not self.script_is_scp: - row = self.script_sep.join(["id", "storage_path"]) + columns = ["id", "storage_path"] + if self.metadata_columns is not None: + columns += self.metadata_columns + row = self.script_sep.join(columns) self.f_script.write(f"{row}\n") def __exit__(self, exc_type, exc_value, traceback): @@ -89,6 +93,7 @@ def write( self, keys: Union[str, List[str], np.array], data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, ): """Writes data to file. @@ -99,9 +104,7 @@ def write( it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. 
""" - if isinstance(keys, str): - keys = [keys] - data = [data] + keys, data, metadata = self.standardize_write_args(keys, data, metadata) for i, key_i in enumerate(keys): assert is_token(key_i), "Token %s not valid" % key_i @@ -115,7 +118,11 @@ def write( if self.script_is_scp: self.f_script.write(f"{key_i} {self.archive_path}\n") else: - row = self.script_sep.join([key_i, self.archive_path]) + columns = [key_i, str(self.archive_path)] + if metadata is not None: + metadata_i = [str(m[i]) for m in metadata] + columns += metadata_i + row = self.script_sep.join(columns) self.f_script.write(f"{row}\n") if self._flush: diff --git a/hyperion/io/rw_specifiers.py b/hyperion/io/rw_specifiers.py index 37f579b4..93123247 100644 --- a/hyperion/io/rw_specifiers.py +++ b/hyperion/io/rw_specifiers.py @@ -7,6 +7,8 @@ import re from enum import Enum +from pathlib import Path +import pandas as pd class ArchiveType(Enum): @@ -174,6 +176,11 @@ def create(cls, wspecifier): archive_type = ArchiveType.AUDIO archive = archives[cur_archive] cur_archive += 1 + elif option == "csv": + assert script is None, "Repeated csv in wspecifier %s" % script + assert len(archives) > cur_archive + script = archives[cur_archive] + cur_archive += 1 elif option == "scp": assert script is None, "Repeated scp in wspecifier %s" % script assert len(archives) > cur_archive @@ -332,7 +339,7 @@ def create(cls, rspecifier): assert len(archives) == 1 spec_type = None - archive = archives[0] + archive = Path(archives[0]) archive_type = None once = False is_sorted = False @@ -361,6 +368,9 @@ def create(cls, rspecifier): assert spec_type is None spec_type = RSpecType.ARCHIVE archive_type = ArchiveType.RTTM + elif option == "csv": + assert spec_type is None + spec_type = RSpecType.SCRIPT elif option == "scp": assert spec_type is None spec_type = RSpecType.SCRIPT @@ -374,24 +384,31 @@ def create(cls, rspecifier): assert spec_type is not None, "Wrong wspecifier options %s" % fields[0] if spec_type == RSpecType.SCRIPT: - with open(archive, "r") as f: - scp_f2 = f.readline().strip().split(" ")[1] - if re.match(r".*\.h5(?:.[0-9]+:[0-9]+.)?$", scp_f2) is not None: + if archive.suffix == ".csv": + df = pd.read_csv(archive, nrows=2) + storage_path = df["storage_path"].values[0] + if re.match(r".*\.h5$", scp_f2) is not None: archive_type = ArchiveType.H5 - elif re.match(r".*\.ark:.*$", scp_f2) is not None: + elif re.match(r".*\.ark$", scp_f2) is not None: archive_type = ArchiveType.ARK - elif ( - re.match(r".*[cvg]:[0-9]+.[0-9]+:[0-9]+.$", scp_f2) is not None - ): + elif re.match(r".*[cvg]$", scp_f2) is not None: archive_type = ArchiveType.AUDIO else: - archive_type = ArchiveType.ARK - - # .split('[')[0].split(':') - # if len(scp) == 1: - # archive_type = ArchiveType.H5 - # else: - # archive_type = ArchiveType.ARK + raise ValueError(f"Unknown format for {storage_path}") + else: + with open(archive, "r") as f: + scp_f2 = f.readline().strip().split(" ")[1] + if re.match(r".*\.h5(?:.[0-9]+:[0-9]+.)?$", scp_f2) is not None: + archive_type = ArchiveType.H5 + elif re.match(r".*\.ark:.*$", scp_f2) is not None: + archive_type = ArchiveType.ARK + elif ( + re.match(r".*[cvg]:[0-9]+.[0-9]+:[0-9]+.$", scp_f2) + is not None + ): + archive_type = ArchiveType.AUDIO + else: + archive_type = ArchiveType.ARK if archive_type == ArchiveType.ARK: for option in options: diff --git a/hyperion/utils/__init__.py b/hyperion/utils/__init__.py index 51b476aa..e8ad5056 100644 --- a/hyperion/utils/__init__.py +++ b/hyperion/utils/__init__.py @@ -3,6 +3,7 @@ Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) """ +from .info_table import InfoTable from .class_info import ClassInfo from .dataset import Dataset from .enrollment_map import EnrollmentMap From 26eca97bdab59182bc00f29a5a55294988f46d04 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 7 Aug 2023 18:10:21 +0000 Subject: [PATCH 72/89] add configs for commonvoice speaker verification --- ..._speaker_ecapatdnn512x3_stage1_v1.3.1.yaml | 70 +++++++++++++++++++ ...0m_speaker_ecapatdnn512x3_stage1_v1.3.yaml | 70 +++++++++++++++++++ ...0m_speaker_ecapatdnn512x3_stage2_v1.3.yaml | 70 +++++++++++++++++++ ...2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml | 44 ++++++++++++ .../global_conf/config_spk_v1.3.1_13langs.sh | 42 +++++++++++ .../v1/global_conf/config_spk_v1.3_13langs.sh | 42 +++++++++++ hyperion/bin/finetune_wav2vec2xvector.py | 5 ++ 7 files changed, 343 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.1.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.yaml create mode 100644 egs/commonvoice/v1/conf/wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_spk_v1.3.1_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_spk_v1.3_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.1.yaml new file mode 100644 index 00000000..b03a0282 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.1.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'seg_chunk_sampler' + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # # weighted + # weight_mode: "data-prior" + # class_name: "language" + # weight_exponent: 1.0 + # num_chunks_per_seg_epoch: 0.3 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'seg_chunk_sampler' + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # # weighted + # weight_mode: "data-prior" + # class_name: "language" + # weight_exponent: 1.0 + # num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.yaml new file mode 100644 index 00000000..523bf6fd --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + aug_cfgs: + - 
conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'seg_chunk_sampler' + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # # weighted + # weight_mode: "data-prior" + # class_name: "language" + # weight_exponent: 1.0 + # num_chunks_per_seg_epoch: 0.3 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'seg_chunk_sampler' + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # # weighted + # weight_mode: "data-prior" + # class_name: "language" + # weight_exponent: 1.0 + # num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 28000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.yaml new file mode 100644 index 00000000..39b94671 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 0.3 + class_name: language + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - speaker + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1.0 + class_name: language + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 4 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 28000 + hold_steps: 20000 + min_lr: 4e-4 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 40 + eff_batch_size: 512 + train_mode: full \ No newline at end of file diff --git a/egs/commonvoice/v1/conf/wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml new file mode 100644 index 00000000..1abfea29 --- /dev/null +++ b/egs/commonvoice/v1/conf/wav2vec2xlsr300m_speaker_ecapatdnn512x3_v1.3.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + 
- 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_spk_v1.3.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_spk_v1.3.1_13langs.sh new file mode 100644 index 00000000..d820ac2d --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_spk_v1.3.1_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio_overlap_spk +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.1.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v1.3.1_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/speaker_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0040.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.1.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v1.3.1_13_langs.s2 +nnet_s2_dir=exp/speaker_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage3_v1.3.1.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/speaker_resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_spk_v1.3_13langs.sh b/egs/commonvoice/v1/global_conf/config_spk_v1.3_13langs.sh new file mode 100644 index 00000000..2e583f03 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_spk_v1.3_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio_overlap_spk +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + 
+nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage1_v1.3.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v1.3_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/speaker_resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0040.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage2_v1.3.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v1.3_13_langs.s2 +nnet_s2_dir=exp/speaker_resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_speaker_ecapatdnn512x3_stage3_v1.3.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/speaker_resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py index fc3c7084..3bc2fae4 100755 --- a/hyperion/bin/finetune_wav2vec2xvector.py +++ b/hyperion/bin/finetune_wav2vec2xvector.py @@ -26,6 +26,10 @@ from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, namespace_to_dict) +import warnings + +warnings.filterwarnings('ignore', category=UserWarning, module='torch.distributed.distributed_c10d') + model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, "hf_hubert2resnet1d": HFHubert2ResNet1dXVector, @@ -126,6 +130,7 @@ def train_model(gpu_id, args): device=device, metrics=metrics, ddp=world_size > 1, + # loss_weight=train_loader.batch_sampler.class_info["weights"], **trn_args, ) trainer.load_last_checkpoint() From 89efce43a3c25b1fc3284afb84823af803d92add Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 4 Sep 2023 18:59:26 -0400 Subject: [PATCH 73/89] voxceleb v1.2 works up to snorm backend --- egs/sre19-av-v/v0.1/steps_be/face_be_utils.py | 9 +- .../v1/steps_be/eval-tel-be-snorm-v2.py | 2 +- egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py | 2 +- .../v1/steps_be/train-tel-be-knn-v1.py | 2 +- .../v1/steps_be/train-tel-be-knn-v3.py | 2 +- .../v1/steps_be/train-tel-be-knn-v4.py | 2 +- .../adv.v2/steps_backend/eval-be-cos-Nvs1.py | 2 +- .../adv.v2/steps_backend/eval-be-cos.py | 2 +- egs/voxceleb/v1.1/conf/vad_16k.yaml | 1 + ...rain_ecapatdnn2048x4_xvec_stage1_v3.0.yaml | 84 +-- ...rain_ecapatdnn2048x4_xvec_stage2_v3.0.yaml | 16 +- ...train_ecapatdnn512x3_xvec_stage1_v3.0.yaml | 89 +-- ...train_ecapatdnn512x3_xvec_stage2_v3.0.yaml | 30 +- egs/voxceleb/v1.2/conf/vad_16k.yaml | 3 +- egs/voxceleb/v1.2/run_002_compute_evad.sh | 66 +++ .../v1.2/run_003_prepare_noises_rirs.sh | 102 ++++ .../v1.2/run_004_prepare_xvec_train_data.sh | 76 +++ egs/voxceleb/v1.2/run_005_train_xvector.sh | 78 +++ egs/voxceleb/v1.2/run_006_extract_xvectors.sh | 103 ++++ egs/voxceleb/v1/steps_be/eval_be_cos.py | 2 +- egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py | 2 +- egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py | 2 +- hyp_utils/create_audios_split_links.sh | 27 + hyp_utils/create_data_split_links.sh | 2 - .../xvectors/extract_wav2vec2xvectors.sh | 8 +- .../xvectors/extract_xvectors_from_wav.sh | 10 +- .../make_babble_noise_for_nnet_train.sh | 22 +- .../xvectors/pack_rirs_for_nnet_train.sh | 9 - .../preprocess_audios_for_nnet_train.sh | 8 +- hyperion/bin/eval_cosine_scoring_backend.py | 200 +++++++ .../eval_cosine_scoring_backend_with_qmf.py | 472 +++++++++++++++ hyperion/bin/eval_verification_metrics.py | 96 +++ hyperion/bin/eval_xvec_logits_from_wav.py | 20 +- hyperion/bin/extract_wav2vec2xvectors.py | 41 +- hyperion/bin/extract_wav2xvectors.py | 333 +++++++++++ 
hyperion/bin/extract_xvectors_from_feats.py | 20 +- hyperion/bin/extract_xvectors_from_wav.py | 26 +- .../extract_xvectors_slidwin_from_feats.py | 10 +- .../bin/extract_xvectors_slidwin_from_wav.py | 10 +- hyperion/bin/finetune_wav2xvector.py | 228 ++++++++ .../generate_adv_attacks_xvector_classif.py | 8 +- hyperion/bin/hyperion_dataset.py | 406 ++++++++++++- hyperion/bin/hyperion_tables.py | 33 +- hyperion/bin/make_babble_noise_audio_files.py | 102 ++-- hyperion/bin/make_wav2xvector.py | 91 +++ hyperion/bin/merge_scores.py | 99 ++++ hyperion/bin/pack_wav_rirs.py | 17 +- hyperion/bin/plot_embedding_tsne_per_class.py | 11 +- hyperion/bin/prepare_data.py | 9 +- hyperion/bin/preprocess_audio_files.py | 163 +++--- hyperion/bin/train_wav2vec2xvector.py | 19 +- hyperion/bin/train_wav2xvector.py | 196 +++++++ hyperion/data_prep/__init__.py | 2 + hyperion/data_prep/data_prep.py | 3 +- hyperion/data_prep/musan.py | 107 ++++ hyperion/data_prep/rirs.py | 103 ++++ hyperion/data_prep/voxceleb1.py | 18 +- hyperion/data_prep/voxceleb2.py | 23 +- hyperion/data_prep/voxsrc22.py | 49 +- hyperion/helpers/trial_data_reader.py | 2 +- hyperion/helpers/vector_class_reader.py | 2 +- hyperion/io/ark_data_reader.py | 6 +- hyperion/io/audio_reader.py | 18 +- hyperion/io/audio_writer.py | 71 ++- hyperion/io/hyp_data_reader.py | 5 +- hyperion/io/packed_audio_reader.py | 6 +- hyperion/io/rw_specifiers.py | 6 +- hyperion/np/augment/noise_augment.py | 26 +- hyperion/np/augment/reverb_augment.py | 15 +- hyperion/np/augment/speech_augment.py | 2 +- hyperion/np/augment/speed_augment.py | 22 +- .../classifiers/binary_logistic_regression.py | 2 +- hyperion/np/classifiers/greedy_fusion.py | 4 +- hyperion/np/classifiers/linear_gbe.py | 8 +- hyperion/np/classifiers/linear_gbe_up.py | 9 +- hyperion/np/classifiers/linear_svmc.py | 8 +- .../np/classifiers/logistic_regression.py | 6 +- hyperion/np/classifiers/q_scoring_homo_gbe.py | 2 +- hyperion/np/classifiers/svmc.py | 4 +- hyperion/np/feats/energy_vad.py | 40 +- hyperion/np/feats/mfcc.py | 57 +- hyperion/np/metrics/__init__.py | 7 +- hyperion/np/metrics/cllr.py | 2 +- hyperion/np/metrics/utils.py | 2 +- hyperion/np/metrics/verification_evaluator.py | 78 ++- hyperion/np/pdfs/core/normal.py | 21 +- hyperion/np/pdfs/core/normal_diag_cov.py | 11 +- hyperion/np/pdfs/hmm/hmm.py | 4 +- hyperion/np/pdfs/jfa/jfa_total.py | 9 +- .../np/pdfs/mixtures/exp_family_mixture.py | 2 +- hyperion/np/pdfs/mixtures/gmm.py | 24 +- hyperion/np/pdfs/mixtures/gmm_diag_cov.py | 13 +- .../np/pdfs/mixtures/gmm_tied_diag_cov.py | 13 +- hyperion/np/pdfs/plda/frplda.py | 4 +- hyperion/np/pdfs/plda/plda.py | 4 +- hyperion/np/pdfs/plda/splda.py | 4 +- hyperion/np/transforms/skl_tsne.py | 4 +- hyperion/torch/data/audio_dataset.py | 12 + hyperion/torch/layers/audio_feats_factory.py | 2 +- hyperion/torch/models/__init__.py | 18 +- hyperion/torch/models/plda/splda.py | 2 +- .../models/wav2xvectors/hf_wav2xvector.py | 4 +- .../wav2xvectors/wav2resnet1d_xvector.py | 18 + .../models/wav2xvectors/wav2resnet_xvector.py | 18 + .../torch/models/wav2xvectors/wav2xvector.py | 113 +++- hyperion/torch/narchs/audio_feats_mvn.py | 4 + hyperion/torch/torch_model.py | 19 +- hyperion/utils/class_info.py | 16 + hyperion/utils/dataset.py | 552 +++++++++++++----- hyperion/utils/fold_list.py | 2 +- hyperion/utils/info_table.py | 72 ++- hyperion/utils/{math.py => math_funcs.py} | 22 +- hyperion/utils/plotting.py | 3 +- hyperion/utils/scp_list.py | 2 +- hyperion/utils/segment_set.py | 42 +- hyperion/utils/sparse_trial_key.py | 18 +- 
hyperion/utils/sparse_trial_scores.py | 124 +++- hyperion/utils/train_val_eval_list.py | 2 +- hyperion/utils/trial_key.py | 16 +- hyperion/utils/trial_ndx.py | 84 ++- hyperion/utils/trial_scores.py | 86 ++- hyperion/utils/utt2info.py | 2 +- 122 files changed, 4509 insertions(+), 945 deletions(-) create mode 100755 egs/voxceleb/v1.2/run_002_compute_evad.sh create mode 100755 egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh create mode 100755 egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh create mode 100755 egs/voxceleb/v1.2/run_005_train_xvector.sh create mode 100755 egs/voxceleb/v1.2/run_006_extract_xvectors.sh create mode 100755 hyp_utils/create_audios_split_links.sh create mode 100755 hyperion/bin/eval_cosine_scoring_backend.py create mode 100755 hyperion/bin/eval_cosine_scoring_backend_with_qmf.py create mode 100755 hyperion/bin/eval_verification_metrics.py create mode 100755 hyperion/bin/extract_wav2xvectors.py create mode 100755 hyperion/bin/finetune_wav2xvector.py mode change 100644 => 100755 hyperion/bin/hyperion_dataset.py create mode 100755 hyperion/bin/make_wav2xvector.py create mode 100755 hyperion/bin/merge_scores.py create mode 100755 hyperion/bin/train_wav2xvector.py create mode 100644 hyperion/data_prep/musan.py create mode 100644 hyperion/data_prep/rirs.py rename hyperion/utils/{math.py => math_funcs.py} (93%) diff --git a/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py b/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py index 14e3fc20..b6252df7 100644 --- a/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py +++ b/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py @@ -2,15 +2,11 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - import logging import numpy as np from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import softmax +from hyperion.utils.math_funcs import softmax from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.transforms import LNorm from hyperion.np.clustering import AHC @@ -23,9 +19,6 @@ def lnorm(x): def cosine_scr(x1, x2): - # t = LNorm() - # x1 = t.predict(x1) - # x2 = t.predict(x2) x1 = lnorm(x1) x2 = lnorm(x2) return np.dot(x1, x2.T) diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py index 907509fd..c9657a66 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py @@ -16,7 +16,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores from hyperion.helpers import TrialDataReader as TDR -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.np.pdfs import PLDA from hyperion.np.transforms import TransformList from hyperion.np.score_norm import AdaptSNorm as SNorm diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py index b661cbde..24ef731b 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py @@ -15,7 +15,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.np.pdfs import PLDA from hyperion.helpers import TrialDataReader as TDR 
from hyperion.helpers import PLDAFactory as F diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py index 8e7715e0..bdef3fc3 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py @@ -17,7 +17,7 @@ from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from numpy.linalg import matrix_rank diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py index 12f1725b..51795676 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py @@ -17,7 +17,7 @@ from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from numpy.linalg import matrix_rank, svd diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py index 234f966c..79c1cd6f 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py @@ -17,7 +17,7 @@ from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from numpy.linalg import matrix_rank, svd diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py index 85e82149..48094d0f 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py @@ -15,7 +15,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.np.pdfs import PLDA from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py index d5cd6a55..49720cb5 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py @@ -19,7 +19,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F from hyperion.np.transforms import TransformList diff --git a/egs/voxceleb/v1.1/conf/vad_16k.yaml b/egs/voxceleb/v1.1/conf/vad_16k.yaml index 5fb0111c..a8d7b4d4 100644 --- a/egs/voxceleb/v1.1/conf/vad_16k.yaml +++ b/egs/voxceleb/v1.1/conf/vad_16k.yaml @@ -6,3 +6,4 @@ vad_energy_threshold: 5.5 vad_energy_mean_scale: 0.5 vad_proportion_threshold: 0.12 vad_frames_context: 2 +wav_scale: 32767 diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml index 1633f4a2..2cf31713 100644 --- 
a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml @@ -29,48 +29,50 @@ data: min_chunk_length: 2.0 data_loader: num_workers: 8 -feats: fbank80_specaug1_stmn_16k.yaml -model: - resnet_enc: - in_feats: 80 - in_conv_channels: 2048 - in_kernel_size: 5 - in_stride: 1 - resb_type: seres2bn - resb_repeats: - - 1 - - 1 - - 1 - - 1 - resb_channels: - - 2048 - resb_kernel_sizes: - - 3 - resb_dilations: - - 2 - - 3 - - 4 - - 5 - resb_strides: - - 1 - res2net_width_factor: 1 - res2net_scale: 8 - se_r: 4 - multilayer: true - multilayer_concat: true - endpoint_channels: 4096 - norm_before: false + +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + dropout_rate: 0.2 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 dropout_rate: 0.2 - hid_act: swish - pool_net: - pool_type: ch-wise-att-mean+stddev - inner_feats: 128 - embed_dim: 192 - cos_scale: 30.0 - margin: 0.2 - margin_warmup_epochs: 5.0 - dropout_rate: 0.2 - norm_before: false + norm_before: false trainer: optim: opt_type: adam diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml index 877736b3..21f0db8b 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml @@ -37,15 +37,15 @@ data: num_hard_prototypes: 8 data_loader: num_workers: 8 -feats: fbank80_stmn_16k.yaml model: - cos_scale: 30.0 - margin: 0.3 - margin_warmup_epochs: 0 - intertop_margin: 0.1 - resnet_enc: - override_dropouts: true - dropout_rate: 0.25 + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0.25 trainer: optim: opt_type: sgd diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml index f15d453d..03a7f736 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -17,11 +17,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -29,47 +29,48 @@ data: min_chunk_length: 2.0 data_loader: num_workers: 8 -feats: fbank80_specaug1_stmn_16k.yaml -model: - resnet_enc: - in_feats: 80 - in_conv_channels: 512 - in_kernel_size: 5 - in_stride: 1 - resb_type: seres2bn - resb_repeats: - - 1 - - 1 - - 1 - resb_channels: - - 512 - resb_kernel_sizes: - - 3 - resb_dilations: - - 2 - - 3 - - 4 - resb_strides: - - 1 - res2net_width_factor: 1 - 
res2net_scale: 8 - se_r: 4 - multilayer: true - multilayer_concat: true - endpoint_channels: 1536 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + dropout_rate: 0.002 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 norm_before: false - dropout_rate: 0.002 hid_act: swish - pool_net: - pool_type: ch-wise-att-mean+stddev - inner_feats: 128 - embed_dim: 192 - cos_scale: 30.0 - margin: 0.2 - margin_warmup_epochs: 5.0 - dropout_rate: 0.0 - norm_before: false - hid_act: swish trainer: optim: opt_type: adam @@ -91,3 +92,5 @@ trainer: log_interval: 1000 epochs: 40 eff_batch_size: 256 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml index 45e55d97..9788bb7c 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml @@ -2,18 +2,18 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -21,31 +21,31 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: num_workers: 8 -feats: fbank80_stmn_16k.yaml model: - cos_scale: 30.0 - margin: 0.3 - margin_warmup_epochs: 0 - intertop_margin: 0.1 - resnet_enc: - override_dropouts: true - dropout_rate: 0. + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0. 
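+      # with override_dropouts true, encoder dropout is switched off (0.) for
+      # this second-stage large-margin fine-tuning config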
trainer: optim: opt_type: sgd @@ -67,3 +67,5 @@ trainer: swa_start: 31 swa_lr: 1e-4 swa_anneal_epochs: 2 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v1.2/conf/vad_16k.yaml b/egs/voxceleb/v1.2/conf/vad_16k.yaml index 5fb0111c..e5a6bb82 100644 --- a/egs/voxceleb/v1.2/conf/vad_16k.yaml +++ b/egs/voxceleb/v1.2/conf/vad_16k.yaml @@ -2,7 +2,8 @@ sample_frequency: 16000 frame_shift: 10 frame_length: 25 snip_edges: false -vad_energy_threshold: 5.5 +vad_energy_threshold: -4.89 vad_energy_mean_scale: 0.5 vad_proportion_threshold: 0.12 vad_frames_context: 2 +wav_scale: 1 diff --git a/egs/voxceleb/v1.2/run_002_compute_evad.sh b/egs/voxceleb/v1.2/run_002_compute_evad.sh new file mode 100755 index 00000000..e7593df2 --- /dev/null +++ b/egs/voxceleb/v1.2/run_002_compute_evad.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=fs01 +vad_dir=`pwd`/exp/vad_e +vad_config=conf/vad_16k.yaml +nj=40 + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file + +if [ -z "$vad_config" ];then + echo "We are not using VAD in this configuration" + exit 0 +fi + +if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" +fi + + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + hyp_utils/create_data_split_dirs.sh \ + $vad_dir/$name \ + $USER/hyp-data/voxceleb/v1.2/vad $nodes + done +fi + +#Train datasets +if [ $stage -le 2 ];then + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_data_split_links.sh $vad_dir/$name/vad.JOB.ark $nj + echo "compute vad for $name" + $train_cmd JOB=1:$nj $vad_dir/$name/log/vad.JOB.log \ + hyp_utils/conda_env.sh \ + compute_energy_vad.py --cfg $vad_config \ + --recordings-file data/$name/recordings.csv \ + --output-spec ark,csv:$vad_dir/$name/vad.JOB.ark,$vad_dir/$name/vad.JOB.csv \ + --part-idx JOB --num-parts $nj || exit 1 + + hyperion_tables.py cat \ + --table-type features \ + --output-file $vad_dir/$name/vad.csv --num-tables $nj + hyperion_dataset.py add_features \ + --dataset data/$name \ + --features-name vad \ + --features-file $vad_dir/$name/vad.csv + done +fi + + diff --git a/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..aed1dae4 --- /dev/null +++ b/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nj=10 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +# We prepare the noise files and RIR for online speech augmentation +if [ $stage -le 1 ]; then + for name in noise music speech + do + prepare_data.py musan \ + --corpus-dir $musan_root \ + --subset $name \ + --output-dir data/musan_$name + done +fi + +if [ $stage -le 2 ]; then + # # Prepare to distribute data over multiple machines + # # This only does something at CLSP grid + # hyp_utils/create_data_split_dirs.sh $vad_dir $USER/hyp-data/voxceleb/v1.2/vad $nodes + + for name in musan_noise musan_music + do + input_data_dir=data/$name + output_data_dir=data/${name}_proc_audio + output_dir=exp/proc_audio/$name + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \ + hyp_utils/conda_env.sh \ + preprocess_audio_files.py \ + --audio-format flac \ + --part-idx JOB --num-parts $nj \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion_tables.py cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + hyperion_dataset.py set_recordings \ + --dataset $input_data_dir \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset $output_data_dir + + + done +fi + +if [ $stage -le 3 ]; then + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + input_data_dir=data/$name + output_data_dir=data/${name}_babble + output_dir=exp/proc_audio/${name}_babble + $train_cmd $output_dir/log/make_babble_noise_${name}.log \ + hyp_utils/conda_env.sh \ + make_babble_noise_audio_files.py \ + --audio-format flac \ + --min-spks 3 --max-spks 10 --num-reuses 5 \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_data_dir/recordings.csv + hyperion_dataset.py make_from_recordings \ + --dataset $output_data_dir \ + --recordings-file $output_data_dir/recordings.csv + done +fi + +if [ $stage -le 4 ]; then + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + prepare_data.py rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom + prepare_data.py rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom + prepare_data.py rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + output_dir=exp/rirs/$rirs + data_dir=data/$rirs + $train_cmd $output_dir/log/pack_rirs_${name}.log \ + hyp_utils/conda_env.sh \ + pack_wav_rirs.py ${args} --input $data_dir/recordings.csv \ + --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1; + hyperion_dataset.py add_features --dataset $data_dir \ + --features-name rirs --features-file $output_dir/rirs.csv + + done +fi + diff --git a/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh b/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh new file mode 100755 index 00000000..7649ff22 --- /dev/null +++ b/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +nodes=b1 +nj=40 +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + hyp_utils/create_data_split_dirs.sh \ + exp/xvector_audios/$nnet_data \ + $USER/hyp-data/voxceleb/v1.2/xvector_audios/$nnet_data $nodes +fi + +if [ $stage -le 2 ];then + output_dir=exp/proc_audio/$nnet_data + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_audios_split_links.sh $output_dir data/$nnet_data/recordings.csv flac + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$nnet_data/vad.csv" + update_durs="--update-seg-durs" + fi + + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${nnet_data}.JOB.log \ + hyp_utils/conda_env.sh \ + preprocess_audio_files.py \ + --audio-format flac --remove-dc-offset $vad_args \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$nnet_data/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion_tables.py cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + + hyperion_dataset.py set_recordings $update_durs \ + --dataset data/$nnet_data \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset data/${nnet_data}_proc_audio \ + --remove-features vad +fi + +if [ $stage -le 3 ];then + hyperion_dataset.py remove_short_segments \ + --dataset data/${nnet_data}_proc_audio \ + --output-dataset data/${nnet_data}_filtered \ + --length-name duration --min-length 2.0 + + hyperion_dataset.py remove_classes_few_segments \ + --dataset data/${nnet_data}_filtered \ + --class-name speaker --min-segs 4 +fi + +if [ $stage -le 4 ];then + hyperion_dataset.py split_train_val \ + --dataset data/${nnet_data}_filtered \ + --val-prob 0.03 \ + --joint-classes speaker --min-train-samples 1 \ + --seed 1123581321 \ + --train-dataset data/${nnet_data}_xvector_train \ + --val-dataset data/${nnet_data}_xvector_val +fi + diff --git a/egs/voxceleb/v1.2/run_005_train_xvector.sh b/egs/voxceleb/v1.2/run_005_train_xvector.sh new file mode 100755 index 00000000..d2f31ea1 --- /dev/null +++ b/egs/voxceleb/v1.2/run_005_train_xvector.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_data_dir=data/${nnet_data}_xvector_train +val_data_dir=data/${nnet_data}_xvector_val + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_wav2xvector.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu \ + +fi + + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2xvector.py $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir \ + --num-gpus $ngpu \ + +fi diff --git a/egs/voxceleb/v1.2/run_006_extract_xvectors.sh b/egs/voxceleb/v1.2/run_006_extract_xvectors.sh new file mode 100755 index 00000000..09b8c8e9 --- /dev/null +++ b/egs/voxceleb/v1.2/run_006_extract_xvectors.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh +use_gpu=false +xvec_chunk_length=120.0 +. parse_options.sh || exit 1; +. 
$config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" + num_gpus=1 +else + xvec_cmd="$train_cmd --mem 12G" + num_gpus=0 +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +xvector_dir=exp/xvectors/$nnet_name + +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then + # Extract xvectors for training LDA/PLDA + nj=100 + for name in voxceleb2cat_train + do + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + extract_wav2xvectors.py ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --random-utt-length --min-utt-length 2 --max-utt-length 30 \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion_tables.py cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + +if [ $stage -le 2 ]; then + # Extracts x-vectors for evaluation + nj=100 + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data + do + num_segs=$(wc -l data/$name/segments.csv | awk '{ print $1-1}') + nj=$(($num_segs < 100 ? 
$num_segs:100)) + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + extract_wav2xvectors.py ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion_tables.py cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + + diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos.py b/egs/voxceleb/v1/steps_be/eval_be_cos.py index 1f9978ee..a9bc03d1 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_cos.py +++ b/egs/voxceleb/v1/steps_be/eval_be_cos.py @@ -20,7 +20,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.list_utils import ismember from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F from hyperion.np.transforms import TransformList diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py index 7034126a..bf66d72b 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py +++ b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py @@ -19,7 +19,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores, Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.np.pdfs import PLDA from hyperion.utils.list_utils import ismember from hyperion.helpers import TrialDataReader as TDR diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py index dad89ced..0eca769d 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py +++ b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py @@ -20,7 +20,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.list_utils import ismember from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F from hyperion.np.transforms import TransformList diff --git a/hyp_utils/create_audios_split_links.sh b/hyp_utils/create_audios_split_links.sh new file mode 100755 index 00000000..7125a2c4 --- /dev/null +++ b/hyp_utils/create_audios_split_links.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright +# 2023 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# Creates links to distrubute data into multiple nodes in clsp grid + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo "$0 exp/xvector_audios/voxceleb data/voxceleb/recordings.csv flac" +fi +echo "$0 $@" # Print the command line for logging +output_dir=$1 +rec_file=$2 +file_format=$3 + +if [[ $(hostname -f) != *.clsp.jhu.edu ]]; then + exit 0 +fi + +for f in $(awk -F "," '$1!="id" { print $1}' $rec_file); do + # the next command does nothing unless $output_dir/storage/ exists, see + # utils/create_data_link.pl for more info. 
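+  # (when storage/ does exist, the command turns $output_dir/$f.$file_format into
+  #  a symlink pointing into one of the storage/ sub-directories, so the processed
+  #  audio gets spread over the CLSP nodes; elsewhere it is a no-op)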
+ hyp_utils/create_data_link.pl $output_dir/$f.$file_format +done + + + diff --git a/hyp_utils/create_data_split_links.sh b/hyp_utils/create_data_split_links.sh index 8416742e..c7cfa3eb 100755 --- a/hyp_utils/create_data_split_links.sh +++ b/hyp_utils/create_data_split_links.sh @@ -4,8 +4,6 @@ # Apache 2.0. # Creates links to distrubute data into multiple nodes in clsp grid -storage_name=$(date +'%m_%d_%H_%M') - if [ $# -ne 2 ]; then echo "Usage: $0 " echo "$0 exp/vad_dir/vad.JOB.ark 40" diff --git a/hyp_utils/xvectors/extract_wav2vec2xvectors.sh b/hyp_utils/xvectors/extract_wav2vec2xvectors.sh index 6c6f0fdf..d8ae2e55 100755 --- a/hyp_utils/xvectors/extract_wav2vec2xvectors.sh +++ b/hyp_utils/xvectors/extract_wav2vec2xvectors.sh @@ -87,9 +87,9 @@ if [ $stage -le 0 ];then extract_wav2vec2xvectors.py \ ${args} $write_speech_dur_opt \ --part-idx JOB --num-parts $nj \ - --input $data_dir/wav.scp \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length \ - --output ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp + --output-spec ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp set -e fi @@ -109,9 +109,9 @@ if [ $stage -le 1 ];then extract_wav2vec2xvectors.py \ ${args} $write_speech_dur_opt \ --part-idx $i --num-parts $nj \ - --input $data_dir/wav.scp \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length \ - --output ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & + --output-spec ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & fi done wait diff --git a/hyp_utils/xvectors/extract_xvectors_from_wav.sh b/hyp_utils/xvectors/extract_xvectors_from_wav.sh index 0b5227cc..b763a25c 100755 --- a/hyp_utils/xvectors/extract_xvectors_from_wav.sh +++ b/hyp_utils/xvectors/extract_xvectors_from_wav.sh @@ -87,10 +87,10 @@ if [ $stage -le 0 ];then hyp_utils/conda_env.sh --num-gpus $num_gpus \ extract_xvectors_from_wav.py \ --feats $feat_config ${args} $write_num_frames_opt \ - --part-idx JOB --num-parts $nj \ - --input $data_dir/wav.scp \ + --part-idx JOB --num-parts $nj \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --chunk-length $chunk_length \ - --output ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp + --output-spec ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp set -e fi @@ -110,9 +110,9 @@ if [ $stage -le 1 ];then extract_xvectors_from_wav.py \ --feats $feat_config ${args} $write_num_frames_opt \ --part-idx $i --num-parts $nj \ - --input $data_dir/wav.scp \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --chunk-length $chunk_length \ - --output ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & + --output-spec ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & fi done wait diff --git a/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh b/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh index 27c77454..4530ad3b 100755 --- a/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh +++ b/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh @@ -8,9 +8,7 @@ nj=1 cmd="run.pl" stage=0 file_format=flac -nodes=b1 storage_name=$(date +'%m_%d_%H_%M') -#proc_opts="--remove-dc-offset" min_spks=3 max_spks=10 num_reuses=5 @@ -23,10 +21,8 @@ if [ $# != 3 ]; then echo "Usage: $0 " echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" echo "Options: " - #echo " --nj # number 
of parallel jobs" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." echo " --file-format # Output file_format supported by soundfile (flac,ogg,wav,...)" - #echo " --proc-opts # Extra arguments for proc-audio-files.py" echo " --min-spks # max number of spks per utterance" echo " --max-spks # max number of spks per utterance" echo " --num-reuses # number of times a signal is reused to create babble" @@ -51,22 +47,12 @@ output_dir=$(utils/make_absolute.sh $dir) args="" $cmd $dir/log/make_babble_noise_${name}.log \ hyp_utils/conda_env.sh \ - make_babble_noise_audio_files.py ${args} \ - --output-audio-format $file_format $args $proc_opts \ + make_babble_noise_audio_files.py \ + --audio-format $file_format $args $proc_opts \ --min-spks $min_spks --max-spks $max_spks --num-reuses $num_reuses \ --write-time-durs $data_out/utt2dur \ - --input $data_in/wav.scp \ + --recordings-file $data_in/wav.scp \ --output-path $output_dir \ - --output-script $data_out/wav.scp - - - -# for n in $(seq $nj); do -# cat $output_dir/wav.${name}.$n.scp || exit 1; -# done > ${data_out}/wav.scp || exit 1 - -# for n in $(seq $nj); do -# cat $output_dir/utt2dur.${name}.$n || exit 1; -# done > ${data_out}/utt2dur || exit 1 + --output-recordings-file $data_out/wav.scp echo "$0: Succeeded making babble noise for $name" diff --git a/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh b/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh index c6634135..437cd208 100755 --- a/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh +++ b/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh @@ -66,13 +66,4 @@ $cmd $dir/log/pack_rirs_${name}.log \ pack_wav_rirs.py ${args} --input $data_in/wav.scp \ --output ${file_format},scp:$output_dir/rirs_${name}.${file_format},$data_out/rirs.scp || exit 1; - -# for n in $(seq $nj); do -# cat $output_dir/wav.${name}.$n.scp || exit 1; -# done > ${data_out}/wav.scp || exit 1 - -# for n in $(seq $nj); do -# cat $output_dir/utt2dur.${name}.$n || exit 1; -# done > ${data_out}/utt2dur || exit 1 - echo "$0: Succeeded packing RIRs for $name" diff --git a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh index 8321169f..aed40672 100755 --- a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh +++ b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh @@ -92,12 +92,14 @@ fi $cmd JOB=1:$nj $dir/log/preproc_audios_${name}.JOB.log \ hyp_utils/conda_env.sh \ - preprocess_audio_files.py ${args} --output-audio-format $file_format $args $proc_opts \ + preprocess_audio_files.py ${args} --audio-format $file_format $args $proc_opts \ --write-time-durs $output_dir/utt2dur.${name}.JOB \ --part-idx JOB --num-parts $nj \ - --input $data_in/wav.scp \ + # --input $data_in/wav.scp \ + --recordings-file $data_in/wav.scp \ --output-path $output_dir \ - --output-script $output_dir/wav.${name}.JOB.scp + --output-recordings-file $output_dir/wav.${name}.JOB.scp + #--output-script $output_dir/wav.${name}.JOB.scp for n in $(seq $nj); do cat $output_dir/wav.${name}.$n.scp || exit 1; diff --git a/hyperion/bin/eval_cosine_scoring_backend.py b/hyperion/bin/eval_cosine_scoring_backend.py new file mode 100755 index 00000000..1a740024 --- /dev/null +++ b/hyperion/bin/eval_cosine_scoring_backend.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) 
+import time +import logging +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import TrialNdx, TrialKey, TrialScores, EnrollmentMap, SegmentSet +from hyperion.utils.math_funcs import cosine_scoring +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm + + +def load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + test_feats_reader = DRF.create(feats_file) + if enroll_feats_file is not None and enroll_feats_file != feats_file: + enroll_feats_reader = DRF.create(enroll_feats_file) + else: + enroll_feats_reader = test_feats_reader + + enroll_map = EnrollmentMap.load(enroll_map_file) + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() + + if num_enroll_parts > 1 or num_test_parts > 1: + ndx = ndx.split( + enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts + ) + + enroll_map = enroll_map.filter(items=ndx.model_set) + x_e = enroll_feats_reader.read(enroll_map["segmentid"], squeeze=True) + x_t = test_feats_reader.read(ndx.seg_set, squeeze=True) + return enroll_map, ndx, x_e, x_t + + +def load_cohort_data(segments_file, feats_file): + + segments = SegmentSet.load(segments_file) + feats_reader = DRF.create(feats_file) + x = feats_reader.read(segments["id"], squeeze=True) + return segments, x + + +def eval_backend( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + preproc_file, + score_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + cohort_segments_file, + cohort_feats_file, + cohort_nbest, + avg_cohort_by, +): + + logging.info("loading data") + enroll_map, ndx, x_e, x_t = load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + enroll_set, enroll_ids = np.unique(enroll_map["id"], return_inverse=True) + + t1 = time.time() + logging.info("computing score") + if preproc_file is not None: + preprocessor = TransformList.load(preproc_file) + x_e = preprocessor(x_e) + x_t = preprocessor(x_t) + + scores = cosine_scoring(x_e, x_t, ids1=enroll_ids) + dt = time.time() - t1 + num_trials = scores.shape[0] * scores.shape[1] + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + if cohort_segments_file is not None: + t1 = time.time() + cohort_segments, x_coh = load_cohort_data( + cohort_segments_file, cohort_feats_file + ) + if preproc_file is not None: + x_coh = preprocessor(x_coh) + + if avg_cohort_by is not None: + cohort_class = cohort_segments[avg_cohort_by] + _, cohort_ids = np.unique(cohort_class, return_inverse=True) + else: + cohort_ids = None + + logging.info("computing enroll vs cohort") + scores_enr_coh = cosine_scoring(x_e, x_coh, ids2=cohort_ids) + logging.info("computing cohort vs test") + scores_coh_test = cosine_scoring(x_coh, x_t, ids1=cohort_ids) + snorm = AdaptSNorm(cohort_nbest) + scores = snorm(scores, scores_coh_test, scores_enr_coh) + dt = time.time() - t1 + logging.info( + "s-norm elapsed time: %.2f s. 
elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + if num_enroll_parts > 1 or num_test_parts > 1: + score_file = Path(score_file) + new_suffix = f".{enroll_part_idx}.{test_part_idx}{score_file.suffix}" + score_file = score_file.with_suffix(new_suffix) + + logging.info("saving scores to %s", score_file) + # sort scores rows to match the ndx model_set order + sort_idx = [np.nonzero(enroll_set == e)[0][0] for e in ndx.model_set] + scores = scores[sort_idx] + scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) + scores.save(score_file) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Eval cosine-scoring with optional AS-Norm") + + parser.add_argument("--enroll-feats-file", default=None) + parser.add_argument("--feats-file", required=True) + parser.add_argument("--ndx-file", required=True) + parser.add_argument("--enroll-map-file", required=True) + parser.add_argument("--preproc-file", default=None) + parser.add_argument("--cohort-segments-file", default=None) + parser.add_argument("--cohort-feats-file", default=None) + parser.add_argument("--cohort-nbest", type=int, default=1000) + parser.add_argument( + "--avg-cohort-by", + default=None, + help="segments file column to average vectors from same class class", + ) + parser.add_argument("--score-file", required=True) + parser.add_argument( + "--enroll-part-idx", default=1, type=int, help="enroll part index" + ) + parser.add_argument( + "--num-enroll-parts", + default=1, + type=int, + help="""number of parts in which we divide the enroll + list to run evaluation in parallel""", + ) + parser.add_argument("--test-part-idx", default=1, type=int, help="test part index") + parser.add_argument( + "--num-test-parts", + default=1, + type=int, + help="""number of parts in which we divide the test list + to run evaluation in parallel""", + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_backend(**namespace_to_dict(args)) diff --git a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py new file mode 100755 index 00000000..f567dd81 --- /dev/null +++ b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py @@ -0,0 +1,472 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging +from pathlib import Path + +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger +from hyperion.utils import ( + TrialNdx, + TrialKey, + TrialScores, + EnrollmentMap, + SegmentSet, + InfoTable, +) +from hyperion.utils.math_funcs import cosine_scoring, average_vectors +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm + + +def get_precomp_qm_names(quality_measures): + # snorm qm will be calculated later + return [q for q in quality_measures if q not in ["snorm-mu", "snorm-mu/s"]] + + +def normalize_duration(q, min_dur, max_dur, frame_rate): + + q = q / frame_rate + q = np.log(np.clip(q / frame_rate, a_min=min_dur, a_max=max_dur)) + log_min_dur = np.log(min_dur) + log_max_dur = np.log(max_dur) + q = (q - log_min_dur) / 
(log_max_dur - log_min_dur) + return q + + +def load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_segments_file, + segments_file, + quality_measures, + min_dur, + max_dur, + frame_rate, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + test_feats_reader = DRF.create(feats_file) + if enroll_feats_file is not None and enroll_feats_file != feats_file: + enroll_feats_reader = DRF.create(enroll_feats_file) + else: + enroll_feats_reader = test_feats_reader + + enroll_map = EnrollmentMap.load(enroll_map_file) + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() + + if num_enroll_parts > 1 or num_test_parts > 1: + ndx = ndx.split( + enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts + ) + + enroll_map = enroll_map.filter(items=ndx.model_set) + x_e = enroll_feats_reader.read(enroll_map["segmentid"], squeeze=True) + x_t = test_feats_reader.read(ndx.seg_set, squeeze=True) + + # quality measures may be in segments file or/and feature_set file + # so we combine both if both are given + if segments_file is not None: + test_segments = SegmentSet.load(segments_file) + if enroll_segments_file is not None and segments_file != enroll_segments_file: + enroll_segments = SegmentSet.load(enroll_segments_file) + else: + enroll_segments = test_segments + + test_feats_set = test_feats_reader.feature_set + enroll_feats_set = enroll_feats_reader.feature_set + if segments_file: + test_segments.add_columns(test_feats_set) + if enroll_feats_set != test_feats_set or enroll_segments != test_segments: + enroll_segments.add_columns(enroll_feats_set) + + # now we retrive the quality measures + q_e = [] + q_t = [] + # snorm qm will be calculated later + retrieve_qm = get_precomp_qm_names(quality_measures) + q_e = enroll_segments.loc[enroll_map["segmentid"], retrieve_qm] + q_t = test_segments.loc[ndx.seg_set, retrieve_qm] + + # normalize durations + if "speech_duration" in retrieve_qm: + q_e["speech_duration"] = normalize_duration( + q_e["speech_duration"], min_dur, max_dur, 1 + ) + q_t["speech_duration"] = normalize_duration( + q_t["speech_duration"], min_dur, max_dur, 1 + ) + + if "num_speech_frames" in retrieve_qm: + q_e["num_speech_frames"] = normalize_duration( + q_e["num_speech_frames"], min_dur, max_dur, frame_rate + ) + q_t["num_speech_frames"] = normalize_duration( + q_t["num_speech_frames"], min_dur, max_dur, frame_rate + ) + + # q_e = np.asarray(q_e) + # q_t = np.asarray(q_t) + + return enroll_map, ndx, x_e, x_t, q_e, q_t + + +def load_cohort_data(segments_file, feats_file): + + segments = SegmentSet.load(segments_file) + feats_reader = DRF.create(feats_file) + x = feats_reader.read(segments["id"], squeeze=True) + + # segments.add_columns(feats_reader.feature_set) + + # retrieve_qm = get_precomp_qm_names(quality_measures) + # q = np.asarray(segments[retrieve_qm]) + return segments, x # , q + + +def average_qm(q, model_set, ids): + q_avg = average_vectors(q.values, ids) + q_avg = pd.DataFrame(q, columns=q.columns) + q_avg["id"] = model_set + q_avg.set_index("id", drop=False, inplace=True) + return q_avg + + +def get_score_filepath( + score_file, + score_name, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + + score_file = Path(score_file) + new_suffix = "" + if score_name is not None: + new_suffix = f".{score_name}" + + if num_enroll_parts > 1 or num_test_parts > 1: + new_suffix = ( + f"{new_suffix}.{enroll_part_idx}.{test_part_idx}{score_file.suffix}" + ) + + if 
new_suffix: + new_suffix = f"{new_suffix}{score_file.suffix}" + score_file = score_file.with_suffix(new_suffix) + + return score_file + +def save_scores(ndx, scores, score_file, score_name, enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts): + +def save_empty_scores(ndx, score_file, score_name, enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts): + scores = np.zeros(ndx.trial_mask.shape, dtype="float32") + score_file = get_score_filepath(score_file, score_name,enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts) + + scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) + scores.save(score_file) + + + + +def segment_to_trial_qm(q_e, q_t): + q_trial = {} + for q_name in ["speech_duration", "num_speech_frames"]: + if q_name in q_e: + q_trial_name = f"max_{q_name}" + q_trial[q_trial_name] = np.maximum( + q_e[q_name].values[:, None], q_t[q_name].values[None, :] + ) + q_trial_name = f"min_{q_name}" + q_trial[q_trial_name] = np.minimum( + q_e[q_name].values[:, None], q_t[q_name].values[None, :] + ) + + return q_trial + + +def align_scores_to_ndx(enroll_set, ndx, scores, scores_norm, q_trial): + # sort scores rows to match the ndx model_set order + sort_idx = [np.nonzero(enroll_set == e)[0][0] for e in ndx.model_set] + scores = scores[sort_idx] + if scores_norm is not None: + scores_norm = scores_norm[sort_idx] + for qm in q_trial: + q_trial[qm] = q_trial[qm][sort_idx] + + return scores, scores_norm, q_trial + + +def make_qm_table(ndx, scores, scores_norm, q_trial): + if scores_norm is None: + scores = scores[ndx.trial_mask] + else: + scores = scores_norm[ndx.trial_mask] + + for qm in q_trial: + q_trial[qm] = q_trial[qm][ndx.trial_mask] + + I, J = np.nonzero(ndx.trial_mask) + modelid = ndx.model_set[I] + segmentid = ndx.seg_set[J] + unique_id = [f"{a}-{b}" for a, b in zip(modelid, segmentid)] + + q_dict = { + "id": unique_id, + "modelid": modelid, + "segmentid": segmentid, + "scores": scores, + } + q_dict.update(q_trial) + df = pd.DataFrame(q_dict) + return InfoTable(df) + + + + +def eval_backend( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_segments_file, + segments_file, + preproc_file, + qmf_file, + quality_measures, + min_dur, + max_dur, + frame_rate, + cohort_segments_file, + cohort_feats_file, + cohort_nbest, + avg_cohort_by, + score_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + + logging.info("loading data") + enroll_map, ndx, x_e, x_t, q_e, q_t = load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_segments_file, + segments_file, + quality_measures, + min_dur, + max_dur, + frame_rate, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + if not np.any(ndx.trial_mask): + # this part doesn't have any trials, save empty files + + + enroll_set, enroll_ids = np.unique(enroll_map["id"], return_inverse=True) + q_e = average_qm(q_e, enroll_set, enroll_ids) + + t1 = time.time() + logging.info("computing score") + if preproc_file is not None: + preprocessor = TransformList.load(preproc_file) + x_e = preprocessor(x_e) + x_t = preprocessor(x_t) + + scores = cosine_scoring(x_e, x_t, ids1=enroll_ids) + dt = time.time() - t1 + num_trials = scores.shape[0] * scores.shape[1] + logging.info( + "scoring elapsed time: %.2f s. 
elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + q_trial = segment_to_trial_qm(q_e, q_t) + scores_norm = None + if cohort_segments_file is not None: + t1 = time.time() + cohort_segments, x_coh = load_cohort_data( + cohort_segments_file, cohort_feats_file + ) + if preproc_file is not None: + x_coh = preprocessor(x_coh) + + if avg_cohort_by is not None: + cohort_class = cohort_segments[avg_cohort_by] + _, cohort_ids = np.unique(cohort_class, return_inverse=True) + else: + cohort_ids = None + + logging.info("computing enroll vs cohort") + scores_enr_coh = cosine_scoring(x_e, x_coh, ids2=cohort_ids) + logging.info("computing cohort vs test") + scores_coh_test = cosine_scoring(x_coh, x_t, ids1=cohort_ids) + snorm = AdaptSNorm(cohort_nbest) + scores_norm, mu_z, s_z, mu_t, s_t = snorm( + scores, scores_coh_test, scores_enr_coh, return_stats=True + ) + if "snorm-mu" in quality_measures: + q_trial["max_snorm-mu"] = np.maximum(mu_z, mu_t) + q_trial["min_snorm-mu"] = np.minimum(mu_z, mu_t) + if "snorm-mu/s" in quality_measures: + mu_z = mu_z / s_z + mu_t = mu_t / s_t + q_trial["max_snorm-mu/s"] = np.maximum(mu_z, mu_t) + q_trial["min_snorm-mu/s"] = np.minimum(mu_z, mu_t) + + dt = time.time() - t1 + logging.info( + "s-norm elapsed time: %.2f s. elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + scores, scores_norm, q_trial = align_scores_to_ndx( + enroll_set, ndx, scores, scores_norm, q_trial + ) + if qmf_file is None: + qm_table = make_qm_table(ndx, scores, scores_norm, q_trial) + qm_file = get_score_filepath( + score_file, + "qm", + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + qm_table.save(qm_file) + return + + score_file_nonorm = get_score_filepath( + score_file, + None, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + logging.info("saving scores to %s", score_file_nonorm) + scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) + scores.save(score_file_nonorm) + + if scores_norm is not None: + score_file_snorm = get_score_filepath( + score_file, + "snorm", + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + logging.info("saving scores with AS-Norm to %s", score_file_snorm) + scores.scores = scores_norm + scores.save(score_file_snorm) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Eval cosine-scoring with optional AS-Norm and QMF" + ) + + parser.add_argument("--enroll-feats-file", default=None) + parser.add_argument("--feats-file", required=True) + parser.add_argument("--ndx-file", required=True) + parser.add_argument("--enroll-map-file", required=True) + parser.add_argument("--enroll-segments-file", default=None) + parser.add_argument("--segments-file", default=None) + parser.add_argument("--preproc-file", default=None) + parser.add_argument("--qmf-file", default=None) + parser.add_argument( + "--quality-measures", + default=["snorm-mu/s", "speech_duration"], + nargs="+", + choices=["snorm-mu/s", "snorm-mu", "speech_duration", "num_speech_frames"], + ) + parser.add_argument( + "--min-dur", default=0.1, type=float, help="lower bound to clip durations" + ) + parser.add_argument( + "--max-dur", default=30.0, type=float, help="upper bound to clip durations" + ) + parser.add_argument( + "--frame-rate", + default=100, + type=float, + help="frames/sec when durationa are expressed in frames", + ) + parser.add_argument("--cohort-segments-file", default=None) + parser.add_argument("--cohort-feats-file", default=None) + 
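+    # --cohort-nbest is the adaptive S-Norm cohort size: AdaptSNorm keeps only the
+    # N best-scoring cohort embeddings per trial side (see the snorm call above)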
parser.add_argument("--cohort-nbest", type=int, default=1000) + parser.add_argument( + "--avg-cohort-by", + default=None, + help="segments file column to average vectors from same class class", + ) + parser.add_argument("--score-file", required=True) + parser.add_argument( + "--enroll-part-idx", default=1, type=int, help="enroll part index" + ) + parser.add_argument( + "--num-enroll-parts", + default=1, + type=int, + help="""number of parts in which we divide the enroll + list to run evaluation in parallel""", + ) + parser.add_argument("--test-part-idx", default=1, type=int, help="test part index") + parser.add_argument( + "--num-test-parts", + default=1, + type=int, + help="""number of parts in which we divide the test list + to run evaluation in parallel""", + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_backend(**namespace_to_dict(args)) diff --git a/hyperion/bin/eval_verification_metrics.py b/hyperion/bin/eval_verification_metrics.py new file mode 100755 index 00000000..83227558 --- /dev/null +++ b/hyperion/bin/eval_verification_metrics.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path +import pandas as pd + +from hyperion.hyp_defs import config_logger +from hyperion.np.metrics import VerificationEvaluator as VE + +from jsonargparse import ( + ActionConfigFile, + ActionYesNo, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + + +def eval_verification_metrics( + key_files, + score_files, + key_names, + score_names, + p_tar, + c_miss, + c_fa, + sparse, + output_file, +): + + assert len(key_files) == len(key_names) + assert len(score_files) == len(score_names) + dfs = [] + for score_file, score_name in zip(score_files, score_names): + for key_file, key_name in zip(key_files, key_names): + logging.info("Evaluating %s - %s", score_name, key_name) + evaluator = VE( + key_file, + score_file, + p_tar, + c_miss, + c_fa, + key_name, + score_name, + sparse=sparse, + ) + df_ij = evaluator.compute_dcf_eer() + dfs.append(df_ij) + + df = pd.concat(dfs) + logging.info("saving results to %s", output_file) + output_file = Path(output_file) + output_file.parent.mkdir(exist_ok=True, parents=True) + sep = "\t" if output_file.suffix == ".tsv" else "," + df.to_csv(output_file, sep=sep, index=False, float_format="{:,.4f}".format) + + pd.options.display.float_format = "{:.4}".format + print(df.to_string(), flush=True) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Evaluate speaker verification metrics") + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--key-files", required=True, nargs="+") + parser.add_argument("--score-files", required=True, nargs="+") + parser.add_argument("--key-names", required=True, nargs="+") + parser.add_argument("--score-names", required=True, nargs="+") + parser.add_argument( + "--p-tar", + default=[0.05, 0.01, 0.005, 0.001], + nargs="+", + type=float, + help="target priors", + ) + parser.add_argument( + "--c-miss", default=None, nargs="+", type=float, help="cost of miss" + ) + parser.add_argument( + "--c-fa", default=None, nargs="+", type=float, help="cost of false alarm" + ) + parser.add_argument("--sparse", default=False, action=ActionYesNo) + 
parser.add_argument("--output-file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, + ) + + args = parser.parse_args() + kwargs = namespace_to_dict(args) + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + del kwargs["cfg"] + eval_verification_metrics(**kwargs) diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py index 9efbd6dd..f60c7508 100755 --- a/hyperion/bin/eval_xvec_logits_from_wav.py +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -21,8 +21,12 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def init_device(use_gpu): @@ -76,13 +80,15 @@ def augment(key0, x0, augmenter, aug_df, aug_id): def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) + utt_length = rng.integers(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( - "extract-random-utt %s of length=%d first-frame=%d" - % (key, x.shape[1], first_frame) + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, ) return x @@ -105,7 +111,7 @@ def eval_xvec( **kwargs ): - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index 6f7d269e..5eba1b99 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -21,8 +21,12 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) resamplers = {} @@ -84,9 +88,11 @@ def augment(key0, x0, augmenter, aug_df, aug_id): def select_random_chunk(key, x, fs, min_utt_length, max_utt_length, rng): - utt_length = rng.randint(low=fs * min_utt_length, high=fs * max_utt_length + 1) + utt_length = rng.integers( + low=int(fs * min_utt_length), high=int(fs * max_utt_length + 1) + ) if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( "extract-random-utt %s of length=%d first-frame=%d", @@ -98,7 +104,7 @@ def select_random_chunk(key, x, fs, min_utt_length, max_utt_length, rng): def extract_xvectors( - input_spec, + recordings_file, output_spec, vad_spec, write_speech_dur, @@ -117,7 +123,7 @@ def extract_xvectors( **kwargs, ): - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) model 
= load_model(model_path, device) @@ -138,15 +144,12 @@ def extract_xvectors( logging.info("opening output stream: %s", output_spec) with DWF.create(output_spec) as writer: - logging.info(f"opening input stream: {input_spec} with args={ar_args}") - with AR(input_spec, **ar_args) as reader: + logging.info(f"opening input stream: {recordings_file} with args={ar_args}") + with AR(recordings_file, **ar_args) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create( - vad_spec, - path_prefix=vad_path_prefix, - ) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): t1 = time.time() @@ -160,9 +163,7 @@ def extract_xvectors( t2 = time.time() if fs != model.sample_frequency: resampler = get_resampler(fs, model.sample_frequency) - print(f"x01 {x0.shape} {np.max(x0)}") x0 = resampler(x0) - print(f"x01 {x0.shape} {np.max(x0)}") logging.info("processing utt %s", key0) for aug_id in range(num_augs): @@ -260,7 +261,7 @@ def extract_xvectors( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--recordings-file", required=True) parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument("--write-speech-dur", default=None) parser.add_argument( @@ -278,7 +279,7 @@ def extract_xvectors( parser.add_argument("--model-path", required=True) parser.add_argument( "--hf-chunk-length", - type=int, + type=float, default=0, help=( "max. chunk length used in each forward pass " @@ -288,7 +289,7 @@ def extract_xvectors( ) parser.add_argument( "--xvec-chunk-length", - type=int, + type=float, default=0, help=( "max. chunk length used in each forward pass " @@ -314,18 +315,18 @@ def extract_xvectors( ) parser.add_argument( "--min-utt-length", - type=int, + type=float, default=5, help=("minimum utterance length in secs when using random utt length"), ) parser.add_argument( "--max-utt-length", - type=int, + type=float, default=120, help=("maximum utterance length in secs when using random utt length"), ) - parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--output-spec", required=True) parser.add_argument( "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" ) diff --git a/hyperion/bin/extract_wav2xvectors.py b/hyperion/bin/extract_wav2xvectors.py new file mode 100755 index 00000000..7b04fcc8 --- /dev/null +++ b/hyperion/bin/extract_wav2xvectors.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +import torchaudio.transforms as tat +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.augment import SpeechAugment +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +resamplers = {} + + +def get_resampler(source_fs, target_fs): + if source_fs in resamplers: + return resamplers[source_fs] + + resampler = tat.Resample( + int(source_fs), + int(target_fs), + 
lowpass_filter_width=64, + rolloff=0.9475937167399596, + resampling_method="kaiser_window", + beta=14.769656459379492, + ) + resampler_f = lambda x: resampler(torch.from_numpy(x)).numpy() + resamplers[source_fs] = resampler_f + return resampler_f + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus=%d", num_gpus) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model %s", model_path) + model = TML.load(model_path) + logging.info(f"xvector-model={model}") + model.to(device) + model.eval() + return model + + +def augment(key0, x0, augmenter, aug_df, aug_id): + if augmenter is None: + x = x0 + key = key0 + else: + x, aug_info = augmenter(x0) + key = "%s-aug-%02d" % (key0, aug_id) + aug_df_row = { + "key_aug": key, + "key_orig": key0, + "noise_type": aug_info["noise"]["noise_type"], + "snr": aug_info["noise"]["snr"], + "rir_type": aug_info["reverb"]["rir_type"], + "srr": aug_info["reverb"]["srr"], + "sdr": aug_info["sdr"], + } + + aug_df.append(pd.DataFrame(aug_df_row, index=[0])) + + return key, x + + +def select_random_chunk(key, x, fs, min_utt_length, max_utt_length, rng): + utt_length = rng.integers( + low=int(fs * min_utt_length), high=int(fs * max_utt_length + 1) + ) + if utt_length < x.shape[1]: + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) + x = x[:, first_frame : first_frame + utt_length] + logging.info( + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, + ) + return x + + +def extract_xvectors( + recordings_file, + output_spec, + vad_spec, + write_speech_dur, + vad_path_prefix, + model_path, + chunk_length, + embed_layer, + random_utt_length, + min_utt_length, + max_utt_length, + aug_cfg, + num_augs, + aug_info_path, + use_gpu, + **kwargs, +): + + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) + device = init_device(use_gpu) + model = load_model(model_path, device) + + if write_speech_dur is not None: + keys = [] + info = [] + + if aug_cfg is not None: + augmenter = SpeechAugment.create(aug_cfg, rng=rng) + aug_df = [] + else: + augmenter = None + aug_df = None + num_augs = 1 + + metadata_columns = ["speech_duration"] + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output stream: %s with args=%s", output_spec, str(ar_args)) + with DWF.create(output_spec, metadata_columns=metadata_columns) as writer: + + logging.info(f"opening input stream: {recordings_file} with args={ar_args}") + with AR(recordings_file, **ar_args) as reader: + + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + while not reader.eof(): + t1 = time.time() + key, x0, fs = reader.read(1) + if len(key) == 0: + break + + x0 = x0[0] + key0 = key[0] + fs = fs[0] + t2 = time.time() + if fs != model.sample_frequency: + resampler = get_resampler(fs, model.sample_frequency) + x0 = resampler(x0) + + logging.info("processing utt %s", key0) + for aug_id in range(num_augs): + metadata = {} + t3 = time.time() + key, x = augment(key0, x0, augmenter, aug_df, aug_id) + t4 = time.time() + with torch.no_grad(): + x = torch.tensor( + x[None, :], dtype=torch.get_default_dtype() + ).to(device) + t5 = time.time() + tot_samples = x.shape[1] + if vad_spec is not None: + vad = v_reader.read(key0)[0] + vad = torch.tensor( + vad[None, None, :], dtype=torch.float + ).to(device) + vad = 
torch.nn.functional.interpolate( + vad, size=x.size(-1), mode="nearest" + ).bool()[0, 0] + x = x[:, vad] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech samples", + key, + x.shape[1], + tot_samples, + x.shape[1] / tot_samples * 100, + ) + + if random_utt_length: + x = select_random_chunk( + key, x, fs, min_utt_length, max_utt_length, rng + ) + + metadata["speech_duration"] = ( + x.shape[1] / model.sample_frequency + ) + + t6 = time.time() + if x.shape[1] == 0: + y = np.zeros((model.embed_dim,), dtype=float_cpu()) + else: + y = ( + model.extract_embed( + x, + chunk_length=chunk_length, + embed_layer=embed_layer, + ) + .cpu() + .numpy()[0] + ) + + t7 = time.time() + writer.write([key], [y], metadata=metadata) + if write_speech_dur is not None: + keys.append(key) + info.append(str(x.shape[1] / fs)) + + t8 = time.time() + read_time = t2 - t1 + tot_time = read_time + t8 - t3 + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ), + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x.shape[1] / fs / tot_time, + ) + + if write_speech_dur is not None: + logging.info("writing speech duration in secs to %s", write_speech_dur) + u2sd = Utt2Info.create(keys, info) + u2sd.save(write_speech_dur) + + if aug_info_path is not None: + aug_df = pd.concat(aug_df, ignore_index=True) + aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="""Extracts x-vectors from waveform computing acoustic features on the fly""" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--recordings-file", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument("--write-speech-dur", default=None) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + AR.add_class_args(parser) + + parser.add_argument("--aug-cfg", default=None) + parser.add_argument("--aug-info-path", default=None) + parser.add_argument( + "--num-augs", default=1, type=int, help="number of augmentations per utterance" + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--chunk-length", + type=float, + default=0, + help=( + "max. 
chunk length used in each forward pass " + "of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from, " + "if None, it uses layer set in training phase" + ), + ) + + parser.add_argument( + "--random-utt-length", + default=False, + action="store_true", + help="calculates x-vector from a random chunk", + ) + parser.add_argument( + "--min-utt-length", + type=float, + default=5, + help=("minimum utterance length in secs when using random utt length"), + ) + parser.add_argument( + "--max-utt-length", + type=float, + default=120, + help=("maximum utterance length in secs when using random utt length"), + ) + + parser.add_argument("--output-spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + extract_xvectors(**namespace_to_dict(args)) diff --git a/hyperion/bin/extract_xvectors_from_feats.py b/hyperion/bin/extract_xvectors_from_feats.py index 13ad4277..b02db70c 100755 --- a/hyperion/bin/extract_xvectors_from_feats.py +++ b/hyperion/bin/extract_xvectors_from_feats.py @@ -19,8 +19,12 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def init_device(use_gpu): @@ -50,13 +54,15 @@ def load_model(model_path, device): def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) + utt_length = rng.integers(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( - "extract-random-utt %s of length=%d first-frame=%d" - % (key, x.shape[1], first_frame) + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, ) return x @@ -78,7 +84,7 @@ def extract_xvectors( ): logging.info("initializing") - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) mvn = init_mvn(device, **kwargs) model = load_model(model_path, device) diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index 577bbae7..6a8130d3 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -21,8 +21,12 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def init_device(use_gpu): @@ -76,9 +80,9 @@ def augment(key0, x0, augmenter, aug_df, aug_id): def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = 
rng.randint(low=min_utt_length, high=max_utt_length + 1) + utt_length = rng.integers(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( "extract-random-utt %s of length=%d first-frame=%d", @@ -90,7 +94,7 @@ def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): def extract_xvectors( - input_spec, + recordings_file, output_spec, vad_spec, write_num_frames_spec, @@ -108,7 +112,7 @@ def extract_xvectors( **kwargs ): - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) @@ -130,9 +134,9 @@ def extract_xvectors( with DWF.create(output_spec) as writer: logging.info( - "opening input stream: {} with args={}".format(input_spec, ar_args) + "opening input stream: {} with args={}".format(recordings_file, ar_args) ) - with AR(input_spec, **ar_args) as reader: + with AR(recordings_file, **ar_args) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) @@ -235,12 +239,12 @@ def extract_xvectors( parser = ArgumentParser( description=( - "Extracts x-vectors from waveform computing " "acoustic features on the fly" + "Extracts x-vectors from waveform computing acoustic features on the fly" ) ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--recordings-file", required=True) parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( "--write-num-frames", dest="write_num_frames_spec", default=None @@ -299,7 +303,7 @@ def extract_xvectors( help=("maximum utterance length when using random utt length"), ) - parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--output-spec", required=True) parser.add_argument( "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" ) diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py index a54c4d64..bcec5133 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_feats.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py @@ -20,8 +20,12 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def init_device(use_gpu): @@ -71,7 +75,7 @@ def extract_xvectors( ): logging.info("initializing") - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) mvn = init_mvn(device, **kwargs) model = load_model(model_path, device) diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index 8939ba91..f1a64e1b 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -22,8 +22,12 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from 
hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def init_device(use_gpu): @@ -96,7 +100,7 @@ def extract_xvectors( **kwargs ): - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) diff --git a/hyperion/bin/finetune_wav2xvector.py b/hyperion/bin/finetune_wav2xvector.py new file mode 100755 index 00000000..b100b544 --- /dev/null +++ b/hyperion/bin/finetune_wav2xvector.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import torch +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy + +# from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import Wav2ResNet1dXVector as R1dXVec +from hyperion.torch.models import Wav2ResNetXVector as RXVec + +# from hyperion.torch.models import SpineNetXVector as SpineXVec +# from hyperion.torch.models import TDNNXVector as TDXVec +# from hyperion.torch.models import TransformerXVectorV1 as TFXVec +# from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + # "efficientnet": EXVec, + # "tdnn": TDXVec, + # "transformer": TFXVec, + # "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_xvector(num_classes, in_model_file, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_finetune_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["xvector"]["num_classes"] = num_classes + model = TML.load(in_model_file) + model.change_config(**xvec_args) + if rank == 0: + 
logging.info("x-vector-model={}".format(model)) + return model + + +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + try: + hard_prototype_mining = train_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + if rank == 0: + logging.info("setting hard prototypes") + + affinity_matrix = model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + try: + hard_prototype_mining = val_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + +def train_xvec(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) + init_hard_prototype_mining(model, train_loader, val_loader, rank) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + xvec_class.add_finetune_args(parser, prefix="model") + parser.add_argument("--in-model-file", required=True) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Fine-tune x-vector model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + 
parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index 209915c5..4336b7b9 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -24,8 +24,12 @@ from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialNdx, Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def read_utt_list(list_file, class2int_file, part_idx, num_parts): diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py old mode 100644 new mode 100755 index c5a3f6b9..2e3a35ec --- a/hyperion/bin/hyperion_dataset.py +++ b/hyperion/bin/hyperion_dataset.py @@ -8,30 +8,40 @@ from typing import List, Optional, Union from hyperion.hyp_defs import config_logger -from hyperion.utils import (ClassInfo, Dataset, EnrollmentMap, FeatureSet, - InfoTable, PathLike, RecordingSet, SegmentSet) -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - -subcommands = ["add_features"] -# table_dict = { -# "segments": SegmentSet, -# "recordings": RecordingSet, -# "features": FeatureSet, -# "classes": ClassInfo, -# "enrollments": EnrollmentMap, -# "generic": InfoTable, -# } +from hyperion.utils import ( + ClassInfo, + Dataset, + EnrollmentMap, + FeatureSet, + InfoTable, + PathLike, + RecordingSet, + SegmentSet, +) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, + ActionYesNo, +) + +subcommand_list = [ + "add_features", + "set_recordings", + "make_from_recordings", + "remove_short_segments", + "rebuild_class_idx", + "remove_classes_few_segments", + "split_train_val", + "copy", + "add_cols_to_segments", +] def add_common_args(parser): parser.add_argument( - "-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int, + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, ) @@ -45,6 +55,11 @@ def make_add_features_parser(): "--features-name", required=True, help="""name of the feature""" ) parser.add_argument("--features-file", required=True, help="""feature set file""") + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) add_common_args(parser) return parser @@ -54,10 +69,353 @@ def add_features( dataset: PathLike, features_name: str, features_file: PathLike, + output_dataset: PathLike, ): + if output_dataset is None: + output_dataset = dataset + dataset = Dataset.load(dataset, lazy=True) dataset.add_features(features_name, features_file) - dataset.save(dataset) + dataset.save(output_dataset) + + +def make_set_recordings_parser(): + parser = 
ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--recordings-file", required=True, help="""recordings set file""" + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + parser.add_argument( + "--remove-features", + default=None, + nargs="+", + help="""removes feature files from the dataset, + since they maybe obsolote after modifiying the recordings""", + ) + parser.add_argument( + "--update-seg-durs", + default=False, + action=ActionYesNo, + help="""updates the durations in the segment table""", + ) + + add_common_args(parser) + return parser + + +def set_recordings( + dataset: PathLike, + recordings_file: PathLike, + output_dataset: PathLike, + remove_features: List[str], + update_seg_durs: bool, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.set_recordings(recordings_file, update_seg_durs) + if remove_features is not None: + for features_name in remove_features: + dataset.remove_features(features_name) + + dataset.save(output_dataset) + + +def make_make_from_recordings_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--recordings-file", required=True, help="""recordings set file""" + ) + + add_common_args(parser) + return parser + + +def make_from_recordings( + dataset: PathLike, recordings_file: PathLike, +): + output_dataset = dataset + import pandas as pd + + rec_df = pd.read_csv(recordings_file) + seg_df = rec_df[["id"]] + segments = SegmentSet(seg_df) + dataset = Dataset(segments, recordings=recordings_file) + dataset.save(output_dataset) + + +def make_remove_short_segments_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--min-length", + required=True, + type=float, + help="""minimum required length of the segment""", + ) + + parser.add_argument( + "--length-name", + default="duration", + help="""name of the column indicating the length of the segment""", + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def remove_short_segments( + dataset: PathLike, min_length: float, length_name: str, output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.remove_short_segments(min_length, length_name) + dataset.save(output_dataset) + + +def make_rebuild_class_idx_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--class-name", required=True, help="""name of the class type e.g.: speaker""" + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def rebuild_class_idx( + dataset: PathLike, class_name: str, output_dataset: PathLike, +): + if output_dataset is None: + 
output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.rebuild_class_idx(class_name) + dataset.save(output_dataset) + + +def make_remove_classes_few_segments_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--class-name", required=True, help="""name of the class type e.g.: speaker""" + ) + parser.add_argument( + "--min-segs", default=1, type=int, help="""min. num. of segments/class""" + ) + parser.add_argument( + "--rebuild-idx", + default=False, + action=ActionYesNo, + help="""regenerate class indexes from 0 to new_num_classes-1""", + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def remove_classes_few_segments( + dataset: PathLike, + class_name: str, + min_segs: int, + rebuild_idx: bool, + output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.remove_classes_few_segments(class_name, min_segs, rebuild_idx) + dataset.save(output_dataset) + + +def make_split_train_val_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""input dataset dir or .yaml file""" + ) + parser.add_argument( + "--val-prob", + default=0.05, + type=float, + help="""proportion of segments used for val""", + ) + parser.add_argument( + "--min-train-samples", + default=1, + type=int, + help="""min. number of training samples / class""", + ) + + parser.add_argument( + "--joint-classes", + default=None, + nargs="+", + help="""types of classes that need to have same classes in train and val""", + ) + parser.add_argument( + "--disjoint-classes", + default=None, + nargs="+", + help="""types of classes that need to have different classes in train and val""", + ) + parser.add_argument( + "--seed", default=11235813, type=int, help="""random seed""", + ) + + parser.add_argument( + "--train-dataset", required=True, help="""output train dataset dir""", + ) + parser.add_argument( + "--val-dataset", required=True, help="""output val dataset dir""", + ) + + add_common_args(parser) + return parser + + +def split_train_val( + dataset: PathLike, + val_prob: float, + joint_classes: List[str], + disjoint_classes: List[str], + min_train_samples: int, + seed: int, + train_dataset: PathLike, + val_dataset: PathLike, +): + dataset = Dataset.load(dataset, lazy=True) + train_ds, val_ds = dataset.split_train_val( + val_prob, joint_classes, disjoint_classes, min_train_samples, seed + ) + train_ds.save(train_dataset) + val_ds.save(val_dataset) + + num_total = len(dataset) + num_train = len(train_ds) + num_val = len(val_ds) + logging.info( + "train: %d (%.2f%%) segments, val: %d (%.2f%%) segments", + num_train, + num_train / num_total * 100, + num_val, + num_val / num_total * 100, + ) + + +def make_copy_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--output-dataset", + required=True, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def copy( + dataset: PathLike, output_dataset: PathLike, +): + dataset = 
Dataset.load(dataset, lazy=True) + dataset.save(output_dataset) + + +def make_add_cols_to_segments_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--right-table", required=True, help="table where the new data is" + ) + parser.add_argument( + "--columns", + required=True, + nargs="+", + help="""columns to copy to segments table""", + ) + parser.add_argument( + "--on", default=["id"], nargs="+", help="""columns to match both tables rows""", + ) + parser.add_argument( + "--right-on", + default=None, + nargs="+", + help="""columns to match both tables rows""", + ) + + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def add_cols_to_segments( + dataset: PathLike, + right_table: PathLike, + column_names: List[str], + on: List[str], + right_on: List[str], + output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.add_cols_to_segments(right_table, column_names, on, right_on) + dataset.save(output_dataset) if __name__ == "__main__": @@ -66,15 +424,15 @@ def add_features( parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() - for subcommand in subcommands: + for subcommand in subcommand_list: parser_func = f"make_{subcommand}_parser" subparser = globals()[parser_func]() - subcommands.add_subcommand(k, subparser) + subcommands.add_subcommand(subcommand, subparser) args = parser.parse_args() subcommand = args.subcommand kwargs = namespace_to_dict(args)[args.subcommand] config_logger(kwargs["verbose"]) del kwargs["verbose"] - + del kwargs["cfg"] globals()[subcommand](**kwargs) diff --git a/hyperion/bin/hyperion_tables.py b/hyperion/bin/hyperion_tables.py index 5a5f0b4f..7f61b35a 100755 --- a/hyperion/bin/hyperion_tables.py +++ b/hyperion/bin/hyperion_tables.py @@ -8,12 +8,23 @@ from typing import List, Optional, Union from hyperion.hyp_defs import config_logger -from hyperion.utils import (ClassInfo, EnrollmentMap, FeatureSet, InfoTable, - PathLike, RecordingSet, SegmentSet) -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - -subcommands = ["cat"] +from hyperion.utils import ( + ClassInfo, + EnrollmentMap, + FeatureSet, + InfoTable, + PathLike, + RecordingSet, + SegmentSet, +) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +subcommand_list = ["cat"] table_dict = { "segments": SegmentSet, "recordings": RecordingSet, @@ -73,11 +84,11 @@ def cat( table_type: str, input_files: Union[List[PathLike], None], output_file: PathLike, - num_table: int, + num_tables: int, base_idx: int = 1, ): - assert input_files is not None or num_jobs != 0 + assert input_files is not None or num_tables != 0 output_file = Path(output_file) if input_files is None: ext = output_file.suffix @@ -103,15 +114,15 @@ def cat( parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() - for subcommand in subcommands: + for subcommand in subcommand_list: parser_func = f"make_{subcommand}_parser" subparser = globals()[parser_func]() - subcommands.add_subcommand(k, subparser) + subcommands.add_subcommand(subcommand, subparser) args = parser.parse_args() subcommand = args.subcommand 
kwargs = namespace_to_dict(args)[args.subcommand] config_logger(kwargs["verbose"]) del kwargs["verbose"] - + del kwargs["cfg"] globals()[subcommand](**kwargs) diff --git a/hyperion/bin/make_babble_noise_audio_files.py b/hyperion/bin/make_babble_noise_audio_files.py index 4a356037..68e5b22b 100755 --- a/hyperion/bin/make_babble_noise_audio_files.py +++ b/hyperion/bin/make_babble_noise_audio_files.py @@ -15,12 +15,15 @@ from hyperion.io import RandomAccessAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from scipy import ndimage, signal +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) -def make_noise(xs): +def make_noise(xs, max_value): lens = np.array([x.shape[0] for x in xs]) max_len = np.max(lens) @@ -28,73 +31,78 @@ def make_noise(xs): for i in range(len(xs)): xs[i] = np.tile(xs[i], int(num_tiles[i]))[:max_len] + xs[0] -= xs[0].mean() for i in range(1, len(xs)): xs[0] += xs[i] - xs[i].mean() + max_x = np.max(np.abs(xs[0])) + if max_x > max_value: + xs[0] *= max_value / max_x + return xs[0] def make_babble_noise_audio_files( - input_path, + recordings_file, output_path, - output_script, - write_time_durs_spec, + output_recordings_file, + write_time_durs, min_spks=3, max_spks=7, num_reuses=5, random_seed=112358, - **kwargs + **kwargs, ): input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) - logging.info("input_args={}".format(input_args)) - logging.info("output_args={}".format(output_args)) + logging.info(f"input_args={input_args}") + logging.info(f"output_args={output_args}") - rng = np.random.RandomState(seed=random_seed) + rng = np.random.default_rng(seed=random_seed) - if write_time_durs_spec is not None: + if write_time_durs is not None: okeys = [] info = [] count = 0 t1 = time.time() - with AR(input_path, **input_args) as reader: + with AR(recordings_file, **input_args) as reader, Writer( + output_path, output_recordings_file, **output_args + ) as writer: keys = reader.keys - with Writer(output_path, output_script, **output_args) as writer: - - for iters in range(num_reuses): - keys = rng.permutation(keys) - - cur_spks = min_spks + for iters in range(num_reuses): + keys = rng.permutation(keys) + + cur_spks = min_spks + utt_list = [] + for utt_idx in range(len(keys)): + if len(utt_list) < cur_spks: + utt_list.append(keys[utt_idx]) + continue + + x, fs = reader.read(utt_list) + fs = fs[0] + y = make_noise(x, reader.wav_scale) + babble_id = "babble-%05d" % (count) + logging.info("writing file %s", babble_id) + writer.write([babble_id], [y], [fs]) + if write_time_durs is not None: + okeys.append(babble_id) + info.append(y.shape[0] / fs) + + count += 1 utt_list = [] - for utt_idx in range(len(keys)): - if len(utt_list) < cur_spks: - utt_list.append(keys[utt_idx]) - continue - - x, fs = reader.read(utt_list) - fs = fs[0] - y = make_noise(x) - babble_id = "babble-%05d" % (count) - logging.info("writing file % s" % (babble_id)) - writer.write([babble_id], [y], [fs]) - if write_time_durs_spec is not None: - okeys.append(babble_id) - info.append(y.shape[0] / fs) - - count += 1 - utt_list = [] - cur_spks += 1 - if cur_spks > max_spks: - cur_spks = min_spks - - if write_time_durs_spec is not None: - logging.info("writing time durations to %s" % (write_time_durs_spec)) + cur_spks += 1 + if cur_spks > max_spks: + cur_spks = min_spks + + if write_time_durs is not 
None: + logging.info("writing time durations to %s", write_time_durs) u2td = Utt2Info.create(okeys, info) - u2td.save(write_time_durs_spec) + u2td.save(write_time_durs) - logging.info("finished making babble files, elapsed-time=%f" % (time.time() - t1)) + logging.info("finished making babble files, elapsed-time=%f", time.time() - t1) if __name__ == "__main__": @@ -102,10 +110,10 @@ def make_babble_noise_audio_files( parser = ArgumentParser(description="Creates babble noise by adding speech files") parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--recordings-file", required=True) parser.add_argument("--output-path", required=True) - parser.add_argument("--output-script", required=True) - parser.add_argument("--write-time-durs", dest="write_time_durs_spec", default=None) + parser.add_argument("--output-recordings-file", required=True) + parser.add_argument("--write-time-durs", default=None) AR.add_class_args(parser) Writer.add_class_args(parser) diff --git a/hyperion/bin/make_wav2xvector.py b/hyperion/bin/make_wav2xvector.py new file mode 100755 index 00000000..b5972d1b --- /dev/null +++ b/hyperion/bin/make_wav2xvector.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +""" + Copyright 2023 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +from hyperion.hyp_defs import config_logger + +# from hyperion.torch import TorchModelLoader as TML +from hyperion.torch import TorchModel + +# from hyperion.torch.models import SpineNetXVector as SpineXVec +# from hyperion.torch.models import TDNNXVector as TDXVec +# from hyperion.torch.models import TransformerXVectorV1 as TFXVec +# from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import ResNet1dXVector as R1dXVec +from hyperion.torch.models import ResNetXVector as RXVec +from hyperion.torch.models import Wav2ResNet1dXVector as W2R1dXVec +from hyperion.torch.models import Wav2ResNetXVector as W2RXVec +from hyperion.torch.narchs import AudioFeatsMVN as AF +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + + +def init_feats(feats): + feat_args = AF.filter_args(**feats) + logging.info(f"feat args={feat_args}") + logging.info("initializing feature extractor") + feat_extractor = AF(trans=True, **feat_args) + logging.info(f"feat-extractor={feat_extractor}") + return feat_extractor + + +def load_model(model_path): + logging.info("loading model %s", model_path) + model = TorchModel.auto_load(model_path) + logging.info(f"xvector-model={model}") + return model + + +def make_wav2xvector(feats, xvector_path, output_path): + + feats = init_feats(feats) + xvector_model = load_model(xvector_path) + if isinstance(xvector_model, RXVec): + model = W2RXVec(feats, xvector_model) + elif isinstance(xvector_model, R1dXVec): + model = W2R1dXVec(feats, xvector_model) + else: + TypeError( + "Conversion of xvector class=%s not available", xvector_model.__class__ + ) + + logging.info("saving model of class %s to %s", model.__class__, output_path) + model.save(output_path) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="""Combines the feature extractor config with XVector model + to produce a Wav2XVector model with integrated feature extraction""" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) 
+ AF.add_class_args(parser, prefix="feats") + parser.add_argument("--xvector-path", required=True) + parser.add_argument("--output-path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + del args.cfg + logging.debug(args) + + make_wav2xvector(**namespace_to_dict(args)) diff --git a/hyperion/bin/merge_scores.py b/hyperion/bin/merge_scores.py new file mode 100755 index 00000000..6a275f5c --- /dev/null +++ b/hyperion/bin/merge_scores.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path + +from hyperion.hyp_defs import config_logger + +from hyperion.utils import TrialScores +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + + +def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, base_idx): + + output_file = Path(output_file) + output_file.parent.mkdir(exist_ok=True, parents=True) + + ext = output_file.suffix + + if input_files is None: + input_file_base = output_file.with_suffix("") + input_files = [] + for i in range(num_enroll_parts): + idx_i = base_idx + i + for j in range(num_test_parts): + idx_j = base_idx + j + input_file_i = input_file_base.with_suffix(f".{idx_i}.{idx_j}{ext}") + input_files.append(input_file_i) + + if ext == ".h5": + # if files are h5 we need to load everything in RAM + score_list = [] + for score_file in input_files: + scores = TrialScores.load(score_file) + score_list.append(scores) + + scores = TrialScores.merge(score_list) + scores.save(output_file) + else: + has_header = ext in [".csv", ".tsv"] + write_header = True + with open(output_file, "w", encoding="utf-8") as f_out: + for score_file in input_files: + with open(score_file) as f_in: + for i, line in enumerate(f_in): + if i == 0 and has_header and not write_header: + continue + f_out.write(line) + write_header = False + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--input-files", default=None, nargs="+", help="optional list of input files" + ) + parser.add_argument( + "--output-file", + required=True, + help="""output file, if input-files is None, input files names are derived from it""", + ) + parser.add_argument( + "--num-enroll-parts", + default=1, + type=int, + help="""number of parts we divided the enrollment set""", + ) + parser.add_argument( + "--num-test-parts", + default=1, + type=int, + help="""number of parts we divided the test set""", + ) + + parser.add_argument( + "--base-idx", + default=1, + type=int, + help="""index of the first job, typically 0 or 1""", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, + ) + + args = parser.parse_args() + kwargs = namespace_to_dict(args) + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + del kwargs["cfg"] + merge_scores(**kwargs) diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py index 78ac59c1..b2a1bc2b 100755 --- a/hyperion/bin/pack_wav_rirs.py +++ b/hyperion/bin/pack_wav_rirs.py @@ -13,8 +13,12 @@ from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import 
SequentialAudioReader as AR -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def pack_wav_rirs(input_path, output_spec, **kwargs): @@ -32,12 +36,15 @@ def pack_wav_rirs(input_path, output_spec, **kwargs): h[h < 1e-3] = 0 h = np.trim_zeros(h) logging.info( - "Packing rir %s h_max=%f h_delay=%d h-length=%d" - % (key, h_max, h_delay, len(h)) + "Packing rir %s h_max=%f h_delay=%d h-length=%d", + key, + h_max, + h_delay, + len(h), ) writer.write([key], [h]) - logging.info("Packed RIRS elapsed-time=%.f" % (time.time() - t1)) + logging.info("Packed RIRS elapsed-time=%.f", time.time() - t1) if __name__ == "__main__": diff --git a/hyperion/bin/plot_embedding_tsne_per_class.py b/hyperion/bin/plot_embedding_tsne_per_class.py index 6af0202c..14da4d07 100755 --- a/hyperion/bin/plot_embedding_tsne_per_class.py +++ b/hyperion/bin/plot_embedding_tsne_per_class.py @@ -18,9 +18,14 @@ from hyperion.np.clustering import AHC from hyperion.np.transforms import PCA, LNorm, SklTSNE from hyperion.utils import SegmentSet -from hyperion.utils.math import cosine_scoring -from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, - ArgumentParser, namespace_to_dict) +from hyperion.utils.math_funcs import cosine_scoring +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] diff --git a/hyperion/bin/prepare_data.py b/hyperion/bin/prepare_data.py index e90ad0f7..f6723c7d 100755 --- a/hyperion/bin/prepare_data.py +++ b/hyperion/bin/prepare_data.py @@ -8,8 +8,12 @@ from hyperion.data_prep import DataPrep from hyperion.hyp_defs import config_logger -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def make_parser(data_prep_class): @@ -33,6 +37,5 @@ def make_parser(data_prep_class): config_logger(1) data_prep_class = DataPrep.registry[args.subcommand] args = namespace_to_dict(args)[args.subcommand] - data_prep = data_prep_class(**args) data_prep.prepare() diff --git a/hyperion/bin/preprocess_audio_files.py b/hyperion/bin/preprocess_audio_files.py index e8adfd16..bda9a503 100755 --- a/hyperion/bin/preprocess_audio_files.py +++ b/hyperion/bin/preprocess_audio_files.py @@ -15,13 +15,26 @@ from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from scipy import ndimage, signal +def resample_vad(vad, length): + step = (len(vad) - 1) / length + assert step < 1 + idx = step * np.arange(length, dtype=float) + idx = np.round(idx).astype(int) + return vad[idx] + + def process_vad(vad, length, fs, dilation, erosion): - vad = signal.resample(vad, length) > 0.5 + # vad = signal.resample(vad, length) > 0.5 + vad = resample_vad(vad, length) if dilation > 0: iters = int(dilation * fs) vad = ndimage.binary_dilation(vad, iterations=iters) @@ -34,9 +47,9 @@ def process_vad(vad, length, fs, dilation, erosion): def process_audio_files( - input_path, + recordings_file, output_path, - output_script, + output_recordings_file, 
write_time_durs_spec, vad_spec, vad_path_prefix, @@ -44,86 +57,92 @@ def process_audio_files( vad_dilation=0, vad_erosion=0, remove_dc_offset=False, - **kwargs + **kwargs, ): input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) - logging.info("input_args={}".format(input_args)) - logging.info("output_args={}".format(output_args)) + logging.info(f"input_args={input_args}") + logging.info(f"output_args={output_args}") if write_time_durs_spec is not None: keys = [] info = [] - with AR(input_path, **input_args) as reader: - with Writer(output_path, output_script, **output_args) as writer: + with AR(recordings_file, **input_args) as reader, Writer( + output_path, output_recordings_file, **output_args + ) as writer: - if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) - t1 = time.time() - for data in reader: - key, x, fs = data - logging.info("Processing audio %s" % (key)) - t2 = time.time() - - tot_samples = x.shape[0] - if vad_spec is not None: - num_vad_frames = int(round(tot_samples * vad_fs / fs)) - vad = v_reader.read(key, num_frames=num_vad_frames)[0].astype( - "bool", copy=False - ) - logging.info("vad=%d/%d" % (np.sum(vad == 1), len(vad))) - vad = process_vad(vad, tot_samples, fs, vad_dilation, vad_erosion) - logging.info("vad=%d/%d" % (np.sum(vad == 1), len(vad))) - x = x[vad] - - logging.info( - "utt %s detected %f/%f secs (%.2f %%) speech " - % ( - key[0], - x.shape[0] / fs, - tot_samples / fs, - x.shape[0] / tot_samples * 100, - ) - ) + t1 = time.time() + for data in reader: + key, x, fs = data + logging.info("Processing audio %s", key) + t2 = time.time() - if x.shape[0] > 0: - if remove_dc_offset: - x -= np.mean(x) - - writer.write([key], [x], [fs]) - if write_time_durs_spec is not None: - keys.append(key) - info.append(x.shape[0] / fs) - - xmax = np.max(x) - xmin = np.min(x) - else: - xmax = 0 - xmin = 0 - - t3 = time.time() - dt2 = (t2 - t1) * 1000 - dt3 = (t3 - t1) * 1000 - time_dur = len(x) / fs - rtf = (time_dur * 1000) / dt3 - logging.info( - ( - "Packed audio %s length=%0.3f secs " - "elapsed-time=%.2f ms. " - "read-time=%.2f ms. write-time=%.2f ms. " - "real-time-factor=%.2f" - "x-range=[%f-%f]" - ) - % (key, time_dur, dt3, dt2, dt3 - dt2, rtf, xmin, xmax) + tot_samples = x.shape[0] + if vad_spec is not None: + num_vad_frames = int(round(tot_samples * vad_fs / fs)) + vad = v_reader.read(key, num_frames=num_vad_frames)[0].astype( + "bool", copy=False ) - t1 = time.time() + logging.info("vad=%d/%d", np.sum(vad == 1), len(vad)) + vad = process_vad(vad, tot_samples, fs, vad_dilation, vad_erosion) + logging.info("vad=%d/%d", np.sum(vad == 1), len(vad)) + x = x[vad] + + logging.info( + "utt %s detected %f/%f secs (%.2f %%) speech ", + key[0], + x.shape[0] / fs, + tot_samples / fs, + x.shape[0] / tot_samples * 100, + ) + + if x.shape[0] > 0: + if remove_dc_offset: + x -= np.mean(x) + + writer.write([key], [x], [fs]) + if write_time_durs_spec is not None: + keys.append(key) + info.append(x.shape[0] / fs) + + xmax = np.max(x) + xmin = np.min(x) + else: + xmax = 0 + xmin = 0 + + t3 = time.time() + dt2 = (t2 - t1) * 1000 + dt3 = (t3 - t1) * 1000 + time_dur = len(x) / fs + rtf = (time_dur * 1000) / dt3 + logging.info( + ( + "Packed audio %s length=%0.3f secs " + "elapsed-time=%.2f ms. " + "read-time=%.2f ms. write-time=%.2f ms. 
" + "real-time-factor=%.2f " + "x-range=[%f - %f]" + ), + key, + time_dur, + dt3, + dt2, + dt3 - dt2, + rtf, + xmin, + xmax, + ) + t1 = time.time() if write_time_durs_spec is not None: - logging.info("writing time durations to %s" % (write_time_durs_spec)) + logging.info("writing time durations to %s", write_time_durs_spec) u2td = Utt2Info.create(keys, info) u2td.save(write_time_durs_spec) @@ -135,9 +154,9 @@ def process_audio_files( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--recordings-file", required=True) parser.add_argument("--output-path", required=True) - parser.add_argument("--output-script", required=True) + parser.add_argument("--output-recordings-file", required=True) parser.add_argument("--write-time-durs", dest="write_time_durs_spec", default=None) parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index 8e1653b1..f132a35c 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -5,6 +5,7 @@ """ import logging import multiprocessing + # import sys import os import time @@ -17,13 +18,19 @@ from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.models import (HFHubert2ResNet1dXVector, - HFWav2Vec2ResNet1dXVector, - HFWavLM2ResNet1dXVector) +from hyperion.torch.models import ( + HFHubert2ResNet1dXVector, + HFWav2Vec2ResNet1dXVector, + HFWavLM2ResNet1dXVector, +) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, @@ -95,7 +102,7 @@ def train_model(gpu_id, args): trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: - logging.info("trainer args={}".format(trn_args)) + logging.info(f"trainer args={trn_args}") metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, diff --git a/hyperion/bin/train_wav2xvector.py b/hyperion/bin/train_wav2xvector.py new file mode 100755 index 00000000..ddf292b8 --- /dev/null +++ b/hyperion/bin/train_wav2xvector.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +from pathlib import Path + +import torch +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy + +# from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import Wav2ResNet1dXVector as R1dXVec +from hyperion.torch.models import Wav2ResNetXVector as RXVec + +# from hyperion.torch.models import SpineNetXVector as SpineXVec +# from hyperion.torch.models import TDNNXVector as TDXVec +# from hyperion.torch.models import TransformerXVectorV1 as TFXVec +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp 
+from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + # "efficientnet": EXVec, + # "tdnn": TDXVec, + # "transformer": TFXVec, + # "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_xvector(num_classes, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["xvector"]["num_classes"] = num_classes + model = xvec_class(**xvec_args) + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model + + +def train_xvec(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + + model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + 
"data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + xvec_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Train Wav2XVector from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) diff --git a/hyperion/data_prep/__init__.py b/hyperion/data_prep/__init__.py index e978e219..9d885718 100644 --- a/hyperion/data_prep/__init__.py +++ b/hyperion/data_prep/__init__.py @@ -4,6 +4,8 @@ """ from .data_prep import DataPrep +from .musan import MusanDataPrep +from .rirs import RIRSDataPrep from .voxceleb2 import VoxCeleb2DataPrep from .voxceleb1 import VoxCeleb1DataPrep from .voxsrc22 import VoxSRC22DataPrep diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py index d9828674..0f654676 100644 --- a/hyperion/data_prep/data_prep.py +++ b/hyperion/data_prep/data_prep.py @@ -67,7 +67,8 @@ def _get_recording_duration(recordings, i, n): def get_recording_duration(self, recording_set): import itertools - from ..utils import SCPList + + # from ..utils import SCPList #don't remember why I put this here futures = [] logging.info("submitting threats...") diff --git a/hyperion/data_prep/musan.py b/hyperion/data_prep/musan.py new file mode 100644 index 00000000..abf7a46c --- /dev/null +++ b/hyperion/data_prep/musan.py @@ -0,0 +1,107 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import glob +import re +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import Dataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress +from .data_prep import DataPrep + + +class MusanDataPrep(DataPrep): + """Class for preparing Musan database into tables + + Attributes: + corpus_dir: input data directory + subset: subset of the data noise, music, speech + output_dir: output data directory + target_sample_freq: target sampling frequency to convert the audios to. 
+ """ + + def __init__( + self, + corpus_dir: PathLike, + subset: str, + output_dir: PathLike, + target_sample_freq: int, + num_threads: int = 10, + **kwargs, + ): + super().__init__(corpus_dir, output_dir, False, target_sample_freq, num_threads) + self.subset = subset + + @staticmethod + def dataset_name(): + return "musan" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + parser.add_argument( + "--subset", + choices=["noise", "music", "speech"], + help="""musan subset in [noise, music, speech]""", + required=True, + ) + + def prepare(self): + logging.info( + "Peparing Musan %s corpus_dir:%s -> data_dir:%s", + self.subset, + self.corpus_dir, + self.output_dir, + ) + rec_dir = self.corpus_dir / self.subset + logging.info("searching audio files in %s", str(rec_dir)) + rec_files = list(rec_dir.glob("**/*.wav")) + if not rec_files: + # symlinks? try glob + rec_files = [ + Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) + ] + + assert len(rec_files) > 0, "recording files not found" + + rec_ids = [f.with_suffix("").name for f in rec_files] + storage_paths = [str(f) for f in rec_files] + logging.info("making RecordingSet") + recs = pd.DataFrame({"id": rec_ids, "storage_path": storage_paths}) + recs = RecordingSet(recs) + recs.sort() + + logging.info("getting recording durations") + self.get_recording_duration(recs) + if self.target_sample_freq: + recs["target_sample_freq"] = self.target_sample_freq + + logging.info("making SegmentsSet") + segments = pd.DataFrame( + { + "id": rec_ids, + "duration": recs.loc[rec_ids, "duration"].values, + "noise_type": self.subset, + } + ) + segments = SegmentSet(segments) + segments.sort() + logging.info("making dataset") + dataset = Dataset( + segments, + recordings=recs, + ) + logging.info("saving dataset at %s", self.output_dir) + dataset.save(self.output_dir) + logging.info( + "datasets containts %d segments", + len(segments), + ) diff --git a/hyperion/data_prep/rirs.py b/hyperion/data_prep/rirs.py new file mode 100644 index 00000000..066819a8 --- /dev/null +++ b/hyperion/data_prep/rirs.py @@ -0,0 +1,103 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import glob +import re +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import Dataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress +from .data_prep import DataPrep + + +class RIRSDataPrep(DataPrep): + """Class for preparing Musan database into tables + + Attributes: + corpus_dir: input data directory + output_dir: output data directory + target_sample_freq: target sampling frequency to convert the audios to. 
+ """ + + def __init__( + self, + corpus_dir: PathLike, + output_dir: PathLike, + target_sample_freq: int, + num_threads: int = 10, + **kwargs, + ): + super().__init__(corpus_dir, output_dir, False, target_sample_freq, num_threads) + + @staticmethod + def dataset_name(): + return "rirs" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + + def prepare(self): + logging.info( + "Peparing RIRS corpus_dir:%s -> data_dir:%s", + self.corpus_dir, + self.output_dir, + ) + rec_dir = self.corpus_dir + rirs_file = self.corpus_dir / "rir_list" + if rirs_file.exists(): + rirs_table = pd.read_csv( + rirs_file, + sep=" ", + header=None, + names=["dummy1", "rir_id", "dummy2", "room_id", "rec_files"], + ) + rec_files = [Path(f) for f in rirs_table["rec_files"].values] + room_ids = rirs_table["room_id"].values + else: + logging.info("searching audio files in %s", str(rec_dir)) + rec_files = list(rec_dir.glob("**/*.wav")) + room_ids = None + if not rec_files: + # symlinks? try glob + rec_files = [ + Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) + ] + + assert len(rec_files) > 0, "recording files not found" + + rec_ids = [f.with_suffix("").name for f in rec_files] + storage_paths = [str(f) for f in rec_files] + logging.info("making RecordingSet") + recs = pd.DataFrame({"id": rec_ids, "storage_path": storage_paths}) + recs = RecordingSet(recs) + recs.sort() + + logging.info("getting recording durations") + self.get_recording_duration(recs) + if self.target_sample_freq: + recs["target_sample_freq"] = self.target_sample_freq + + logging.info("making SegmentsSet") + segments = pd.DataFrame( + {"id": rec_ids, "duration": recs.loc[rec_ids, "duration"].values,} + ) + if room_ids is not None: + segments["room_id"] = room_ids + segments = SegmentSet(segments) + segments.sort() + logging.info("making dataset") + dataset = Dataset(segments, recordings=recs,) + logging.info("saving dataset at %s", self.output_dir) + dataset.save(self.output_dir) + logging.info( + "datasets containts %d segments", len(segments), + ) diff --git a/hyperion/data_prep/voxceleb1.py b/hyperion/data_prep/voxceleb1.py index b3958605..025fad37 100644 --- a/hyperion/data_prep/voxceleb1.py +++ b/hyperion/data_prep/voxceleb1.py @@ -233,17 +233,19 @@ def prepare(self): Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) ] + assert len(rec_files) > 0, "recording files not found" + speakers = [f.parents[1].name for f in rec_files] video_ids = [f.parent.name for f in rec_files] if self.cat_videos: + rec_ids = [f"{s}-{v}" for s, v in zip(speakers, video_ids)] lists_cat_dir = self.output_dir / "lists_cat" lists_cat_dir.mkdir(exist_ok=True, parents=True) - uniq_video_ids, uniq_video_idx, video_idx = np.unique( - video_ids, return_index=True, return_inverse=True + rec_ids, uniq_rec_idx, rec_idx = np.unique( + rec_ids, return_index=True, return_inverse=True ) - rec_ids = uniq_video_ids - speakers = [speakers[i] for i in uniq_video_idx] - rec_ids = [f"{s}-{v}" for s, v in zip(speakers, uniq_video_ids)] + speakers = [speakers[i] for i in uniq_rec_idx] + video_ids = [video_ids[i] for i in uniq_rec_idx] file_paths = [] futures = [] @@ -256,15 +258,13 @@ def prepare(self): lists_cat_dir, rec_id, rec_files, - video_idx, + rec_idx, i, ) futures.append(future) logging.info("waiting threats...") file_paths = [f.result() for f in tqdm(futures)] - video_ids = uniq_video_ids - else: file_names = [f.with_suffix("").name for f in rec_files] if self.use_kaldi_ids: @@ -331,7 +331,7 @@ def prepare(self): 
dataset = Dataset( segments, classes={"speaker": speakers, "language_est": languages}, - recordings={"recordings": recs}, + recordings=recs, enrollments=enrollments, trials=trials, sparse_trials=False, diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py index 29ad3e44..969f2228 100644 --- a/hyperion/data_prep/voxceleb2.py +++ b/hyperion/data_prep/voxceleb2.py @@ -148,24 +148,27 @@ def prepare(self): df_lang = self._get_langs_est() rec_dir = self.corpus_dir / self.subset logging.info("searching audio files in %s", str(rec_dir)) - rec_files = list(rec_dir.glob("**/*.m4a")) + rec_files1 = list(rec_dir.glob("**/*.m4a")) + rec_files = [Path(f) for f in glob.iglob(f"{rec_dir}/**/*.m4a", recursive=True)] if not rec_files: # symlinks? try glob rec_files = [ - Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) + Path(f) for f in glob.iglob(f"{rec_dir}/**/*.m4a", recursive=True) ] + assert len(rec_files) > 0, "recording files not found" + speakers = [f.parents[1].name for f in rec_files] video_ids = [f.parent.name for f in rec_files] if self.cat_videos: + rec_ids = [f"{s}-{v}" for s, v in zip(speakers, video_ids)] lists_cat_dir = self.output_dir / "lists_cat" lists_cat_dir.mkdir(exist_ok=True, parents=True) - uniq_video_ids, uniq_video_idx, video_idx = np.unique( - video_ids, return_index=True, return_inverse=True + rec_ids, uniq_rec_idx, rec_idx = np.unique( + rec_ids, return_index=True, return_inverse=True ) - rec_ids = uniq_video_ids - speakers = [speakers[i] for i in uniq_video_idx] - rec_ids = [f"{s}-{v}" for s, v in zip(speakers, uniq_video_ids)] + speakers = [speakers[i] for i in uniq_rec_idx] + video_ids = [video_ids[i] for i in uniq_rec_idx] file_paths = [] futures = [] @@ -178,15 +181,13 @@ def prepare(self): lists_cat_dir, rec_id, rec_files, - video_idx, + rec_idx, i, ) futures.append(future) logging.info("waiting threats...") file_paths = [f.result() for f in tqdm(futures)] - video_ids = uniq_video_ids - else: file_names = [f.with_suffix("").name for f in rec_files] if self.use_kaldi_ids: @@ -252,7 +253,7 @@ def prepare(self): dataset = Dataset( segments, {"speaker": speakers, "language_est": languages}, - {"recordings": recs}, + recs, ) logging.info("saving dataset at %s", self.output_dir) dataset.save(self.output_dir) diff --git a/hyperion/data_prep/voxsrc22.py b/hyperion/data_prep/voxsrc22.py index 79369149..f81f6eaf 100644 --- a/hyperion/data_prep/voxsrc22.py +++ b/hyperion/data_prep/voxsrc22.py @@ -127,6 +127,9 @@ def prepare_track12_dev(self): rec_ids = vox22_segmentid + vox1_segmentid rec_files = vox22_rec_files + vox1_rec_files + assert len(vox22_rec_files) > 0, "vox22 recording files not found" + assert len(vox1_rec_files) > 0, "vox1 recording files not found" + recs = pd.DataFrame({"id": rec_ids, "storage_path": rec_files}) recs = RecordingSet(recs) recs.sort() @@ -148,7 +151,7 @@ def prepare_track12_dev(self): logging.info("making dataset") dataset = Dataset( segments, - recordings={"recordings": recs}, + recordings=recs, enrollments=enrollments, trials=trials, sparse_trials=False, @@ -160,50 +163,6 @@ def prepare_track12_dev(self): len(segments), ) - # wav_file = voxsrc22_corpus_dir / file_id - # wav_file = vox1_corpus_dir / "wav" / file_id - # logging.info("searching audio files in %s", self.vox1_corpus_dir) - # vox1_rec_files = list(self.vox1_corpus_dir.glob("**/*.wav")) - # if not vox1_rec_files: - # # symlinks? 
try glob - # vox1_rec_files = [ - # Path(f) for f in glob.iglob(f"{self.vox1_corpus_dir}/**/*.wav", recursive=True) - # ] - - # vox1_rec_ids = [ f.parent.parent.name / f.parent.name / f.name for f in vox1_rec_files] - # rec_files = - - # rec_files = list(self.corpus_dir.glob("**/*.wav")) - # if not rec_files: - # # symlinks? try glob - # rec_files = [ - # Path(f) for f in glob.iglob(f"{self.corpus_dir}/**/*.wav", recursive=True) - # ] - - # u2s_file = output_dir / "utt2spk" - # logging.info("creating utt2spk file %s", u2s_file) - # file_ids = np.unique(np.concatenate((df_trials["enroll"], df_trials["test"]))) - # with open(u2s_file, "w") as f: - # for file_id in file_ids: - # f.write("%s %s\n" % (file_id, file_id)) - - # s2u_file = output_dir / "spk2utt" - # logging.info("creating spk2utt file %s", s2u_file) - # with open(s2u_file, "w") as f: - # for file_id in file_ids: - # f.write("%s %s\n" % (file_id, file_id)) - - # wav_file = output_dir / "wav.scp" - # logging.info("creating wav.scp file %s", wav_file) - # with open(wav_file, "w") as f: - # for file_id in file_ids: - # if "VoxSRC2022_dev" in file_id: - # wav_file = voxsrc22_corpus_dir / file_id - # else: - # wav_file = vox1_corpus_dir / "wav" / file_id - - # f.write("%s %s\n" % (file_id, wav_file)) - def prepare_track12_test(self): logging.info( "Preparing VoxSRC22 %s corpus:%s -> %s", diff --git a/hyperion/helpers/trial_data_reader.py b/hyperion/helpers/trial_data_reader.py index 4f33770b..85904eb2 100644 --- a/hyperion/helpers/trial_data_reader.py +++ b/hyperion/helpers/trial_data_reader.py @@ -16,7 +16,7 @@ from ..utils.utt2info import Utt2Info -class TrialDataReader(object): +class TrialDataReader: """ Loads Ndx, enroll file and x-vectors to evaluate PLDA. """ diff --git a/hyperion/helpers/vector_class_reader.py b/hyperion/helpers/vector_class_reader.py index c4c531ad..a9993768 100644 --- a/hyperion/helpers/vector_class_reader.py +++ b/hyperion/helpers/vector_class_reader.py @@ -49,7 +49,7 @@ def __init__( v[0]: int(v[1]) for v in [line.rstrip().split() for line in f] } - self.rng = np.random.RandomState(vcr_seed) + self.rng = np.random.default_rng(vcr_seed) self.csplit_max_spc = csplit_max_spc self.csplit_min_spc = csplit_min_spc self.csplit_mode = csplit_mode diff --git a/hyperion/io/ark_data_reader.py b/hyperion/io/ark_data_reader.py index 6cf22d5f..eaf76d49 100644 --- a/hyperion/io/ark_data_reader.py +++ b/hyperion/io/ark_data_reader.py @@ -223,8 +223,8 @@ def read( self._eof = True break - row_offset_i = row_offset[i] if row_offset_is_list else row_offset - num_rows_i = num_rows[i] if num_rows_is_list else num_rows + row_offset_i = row_offset[count] if row_offset_is_list else row_offset + num_rows_i = num_rows[count] if num_rows_is_list else num_rows binary = init_kaldi_input_stream(self.f) data_i = KaldiMatrix.read( @@ -269,7 +269,7 @@ def __init__( self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs ): super().__init__(file_path, permissive=False, **kwargs) - self.feature_set = FeatureSet.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if self.num_parts > 1: self.feature_set = self.feature_set.split(self.part_idx, self.num_parts) diff --git a/hyperion/io/audio_reader.py b/hyperion/io/audio_reader.py index 6c152cc5..a1adaef0 100644 --- a/hyperion/io/audio_reader.py +++ b/hyperion/io/audio_reader.py @@ -55,7 +55,7 @@ def __init__( self, recordings: Union[RecordingSet, PathLike], segments: Union[SegmentSet, PathLike, None] = None, - wav_scale: float = 2 ** 15 - 1, 
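Several hunks in this patch replace np.random.RandomState with the Generator API returned by np.random.default_rng. The renamed methods involved are summarized in this short sketch (note the two APIs do not reproduce each other's streams for the same seed):

    import numpy as np

    legacy = np.random.RandomState(seed=1123)
    rng = np.random.default_rng(seed=1123)

    legacy.randint(low=0, high=10)   # Generator equivalent: rng.integers(low=0, high=10)
    legacy.random_sample()           # Generator equivalent: rng.random()
    legacy.randn(5)                  # Generator equivalent: rng.standard_normal(5)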
+ wav_scale: float = 1.0, ): if not isinstance(recordings, RecordingSet): recordings = RecordingSet.load(recordings) @@ -255,7 +255,7 @@ def __init__( self, recordings: Union[RecordingSet, PathLike], segments: Union[SegmentSet, PathLike, None] = None, - wav_scale: float = 2 ** 15 - 1, + wav_scale: float = 1.0, part_idx: int = 1, num_parts: int = 1, ): @@ -373,7 +373,8 @@ def add_class_args(parser, prefix: Optional[str] = None): parser.add_argument( "--wav-scale", - default=2 ** 15 - 1, + default=1.0, + # default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) @@ -399,8 +400,7 @@ def add_class_args(parser, prefix: Optional[str] = None): if prefix is not None: outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), + "--" + prefix, action=ActionParser(parser=parser), ) add_argparse_args = add_class_args @@ -411,7 +411,7 @@ def __init__( self, recordings: Union[RecordingSet, PathLike], segments: Union[SegmentSet, PathLike, None] = None, - wav_scale: float = 2 ** 15 - 1, + wav_scale: float = 1.0, ): super().__init__(recordings, segments, wav_scale) @@ -524,14 +524,14 @@ def add_class_args(parser, prefix: Optional[str] = None): parser.add_argument( "--wav-scale", - default=2 ** 15 - 1, + default=1.0, + # default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) if prefix is not None: outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), + "--" + prefix, action=ActionParser(parser=parser), ) add_argparse_args = add_class_args diff --git a/hyperion/io/audio_writer.py b/hyperion/io/audio_writer.py index e416c209..ca0dde9f 100644 --- a/hyperion/io/audio_writer.py +++ b/hyperion/io/audio_writer.py @@ -27,12 +27,33 @@ "DOUBLE": "float64", "MS_ADPCM": "int16", "ULAW": "int16", - "PCM_U8": "uint8", - "PCM_S8": "int8", + "PCM_S8": "int16", "VORBIS": "float32", "GSM610": "int16", "G721_32": "int16", - "PCM_24": "int24", + "PCM_24": "int32", +} + +scale_32 = 2 ** 31 - 1 +scale_24 = 2 ** 23 - 1 +scale_16 = 2 ** 15 - 1 +scale_8 = 2 ** 7 - 1 + + +subtype_to_scale = { + "PCM_32": scale_32, + "ALAW": scale_16, + "IMA_ADPCM": scale_16, + "FLOAT": 1, + "PCM_16": scale_16, + "DOUBLE": 1, + "MS_ADPCM": scale_16, + "ULAW": scale_16, + "PCM_S8": scale_8, + "VORBIS": 1, + "GSM610": scale_16, + "G721_32": scale_16, + "PCM_24": scale_24, } @@ -45,6 +66,7 @@ class AudioWriter(object): audio_format: audio file format audio_subtype: subtype of audio in [PCM_16, PCM_32, FLOAT, DOUBLE, ...], if None, it uses soundfile defaults (recommended) + wav_scale: scale of the input waveform """ def __init__( @@ -53,6 +75,7 @@ def __init__( script_path: Optional[PathLike] = None, audio_format: str = "wav", audio_subtype: Optional[str] = None, + wav_scale: float = 1.0, ): self.output_path = Path(output_path) self.script_path = Path(script_path) if script_path is not None else None @@ -63,9 +86,15 @@ def __init__( if audio_subtype is None: self.subtype = sf.default_subtype(self.audio_format) else: - self.subtype = audio_subtype + self.subtype = audio_subtype.upper() assert sf.check_format(self.audio_format, self.subtype) + self._dtype = subtype_to_npdtype[self.subtype] + + self.wav_scale = wav_scale + # we multiply the audio for this number before saving it. 
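With wav_scale defaulting to 1.0, waveforms stay in [-1, 1] inside the pipeline, and AudioWriter rescales them by the subtype's integer range divided by wav_scale only when writing them out. A small numeric illustration for PCM_16 (illustrative, not patch code):

    import numpy as np

    wav_scale = 1.0              # scale of the in-memory waveform
    subtype_scale = 2 ** 15 - 1  # PCM_16 target range
    output_scale = subtype_scale / wav_scale

    x = np.array([-1.0, -0.5, 0.0, 0.5, 1.0], dtype=np.float32)
    x_int16 = (output_scale * x).astype(np.int16)
    # -> [-32767, -16383, 0, 16383, 32767]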
+ self._output_wav_scale = subtype_to_scale[self.subtype] / wav_scale + self.script_is_scp = False self.script_sep = None self.f_script = None @@ -78,7 +107,7 @@ def __init__( self.f_script = open(self.script_path, "w") else: self.script_sep = "," if script_ext == ".csv" else "\t" - self.f_script = open(self.script_path, "w", "utf-8") + self.f_script = open(self.script_path, "w", encoding="utf-8") row = self.script_sep.join( ["id", "storage_path", "duration", "sample_freq"] ) @@ -123,8 +152,7 @@ def write( data = [data] fs_is_list = isinstance(fs, (list, np.ndarray)) - assert self.subtype in subtype_to_npdtype - dtype = subtype_to_npdtype[self.subtype] + output_files = [] for i, key_i in enumerate(keys): assert is_token(key_i), "Token %s not valid" % key_i @@ -135,7 +163,7 @@ def write( self.audio_format, ) fs_i = int(fs[i]) if fs_is_list else fs - data_i = data[i].astype(dtype, copy=False) + data_i = (self._output_wav_scale * data[i]).astype(self._dtype, copy=False) sf.write(output_file, data_i, fs_i, subtype=self.subtype) output_files.append(output_file) @@ -156,14 +184,11 @@ def write( @staticmethod def filter_args(**kwargs): valid_args = ( - "output_fs", - "output_wav_scale", - "output_audio_format", - "output_audio_subtype", - ) - return dict( - (re.sub("output_", "", k), kwargs[k]) for k in valid_args if k in kwargs + "wav_scale", + "audio_format", + "audio_subtype", ) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None): @@ -171,23 +196,27 @@ def add_class_args(parser, prefix=None): outer_parser = parser parser = ArgumentParser(prog="") - # parser.add_argument(p1+'output-wav-scale', default=1, type=float, - # help=('scale to divide the waveform before writing')) - parser.add_argument( - "--output-audio-format", + "--audio-format", default="flac", choices=["flac", "ogg", "wav"], help=("ouput audio format"), ) parser.add_argument( - "--output-audio-subtype", + "--audio-subtype", default=None, - choices=["pcm_16", "pcm_24", "float", "double", "vorbis"], + choices=["pcm_16", "pcm_24", "pcm_32", "float", "double", "vorbis"], help=("coding format for audio file"), ) + try: + parser.add_argument( + "--wav-scale", default="1.0", help=("input waveform scale wrt 1"), + ) + except: + pass + if prefix is not None: outer_parser.add_argument( "--" + prefix, action=ActionParser(parser=parser), diff --git a/hyperion/io/hyp_data_reader.py b/hyperion/io/hyp_data_reader.py index 575c3087..63d463fb 100644 --- a/hyperion/io/hyp_data_reader.py +++ b/hyperion/io/hyp_data_reader.py @@ -76,9 +76,8 @@ def read_random_slice(self, key, num_samples, rng, field=""): dataset = key + field assert dataset in self.f, "Dataset %s not found" % dataset num_rows = self.f[dataset].shape[0] - # print('hola',num_rows,num_samples,num_rows-num_samples) - # index = rng.random_integers(low=0, high=num_rows-num_samples, size=1)[0] - index = rng.randint(low=0, high=num_rows - num_samples + 1) + + index = rng.integers(low=0, high=num_rows - num_samples + 1) X = self.f[dataset][index : index + num_samples] return X, index diff --git a/hyperion/io/packed_audio_reader.py b/hyperion/io/packed_audio_reader.py index 17f78bc2..fb17cb18 100644 --- a/hyperion/io/packed_audio_reader.py +++ b/hyperion/io/packed_audio_reader.py @@ -378,7 +378,8 @@ def add_class_args(parser, prefix=None): parser.add_argument( p1 + "wav-scale", - default=2 ** 15 - 1, + default=1.0, + # default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) @@ -633,7 +634,8 @@ def 
add_class_args(parser, prefix=None): parser.add_argument( p1 + "wav-scale", - default=2 ** 15, + default=1.0, + # default=2 ** 15, type=float, help=("multiplicative factor for waveform"), ) diff --git a/hyperion/io/rw_specifiers.py b/hyperion/io/rw_specifiers.py index 93123247..60e01ef1 100644 --- a/hyperion/io/rw_specifiers.py +++ b/hyperion/io/rw_specifiers.py @@ -387,11 +387,11 @@ def create(cls, rspecifier): if archive.suffix == ".csv": df = pd.read_csv(archive, nrows=2) storage_path = df["storage_path"].values[0] - if re.match(r".*\.h5$", scp_f2) is not None: + if re.match(r".*\.h5$", storage_path) is not None: archive_type = ArchiveType.H5 - elif re.match(r".*\.ark$", scp_f2) is not None: + elif re.match(r".*\.ark$", storage_path) is not None: archive_type = ArchiveType.ARK - elif re.match(r".*[cvg]$", scp_f2) is not None: + elif re.match(r".*[cvg]$", storage_path) is not None: archive_type = ArchiveType.AUDIO else: raise ValueError(f"Unknown format for {storage_path}") diff --git a/hyperion/np/augment/noise_augment.py b/hyperion/np/augment/noise_augment.py index 799db930..1cc1a0be 100644 --- a/hyperion/np/augment/noise_augment.py +++ b/hyperion/np/augment/noise_augment.py @@ -26,7 +26,7 @@ class SingleNoiseAugment(object): min_snr: mininimum SNR(dB) to sample from. max_snr: maximum SNR(dB) to sample from. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -46,7 +46,7 @@ def __init__( self.cache = None self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -96,7 +96,7 @@ def forward(self, x): while noise is None or noise.shape[0] < num_samples: with self.lock: - noise_idx = self.rng.randint(len(self.noise_keys)) + noise_idx = self.rng.integers(len(self.noise_keys)) key = self.noise_keys[noise_idx] noise_k, fs_k = self.r.read([key]) noise_k = noise_k[0] @@ -112,12 +112,22 @@ def forward(self, x): with self.lock: self.cache = noise_k[need_samples:] + num_zeros = np.sum(noise == 0) with self.lock: + # add dither for noises files with many 0s. + if num_zeros > len(noise) // 3: + noise += 0.0001 * self.rng.standard_normal( + noise.shape, dtype=noise.dtype + ) + target_snr = self.rng.uniform(self.min_snr, self.max_snr) + scale = self._compute_noise_scale(x, noise, target_snr) info = {"noise_type": self.noise_type, "snr": target_snr} - return x + scale * noise, info + y = x + scale * noise + + return y, info def __call__(self, x): return self.forward(x) @@ -136,7 +146,7 @@ class NoiseAugment(object): is proportional to how often we want to sample a given noise type. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__(self, noise_prob, noise_types, random_seed=112358, rng=None): @@ -166,7 +176,7 @@ def __init__(self, noise_prob, noise_types, random_seed=112358, rng=None): self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -177,7 +187,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with noise options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). 
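The _compute_noise_scale helper used in SingleNoiseAugment.forward is not part of this hunk; a conventional way to derive that scale from a target SNR in dB is sketched below as an assumption, not as the patch's implementation:

    import numpy as np

    def noise_scale_for_snr(x, noise, snr_db):
        """Scale s such that 10*log10(mean(x**2) / mean((s*noise)**2)) == snr_db."""
        p_x = np.mean(x ** 2)
        p_n = np.mean(noise ** 2) + 1e-10  # guard against all-zero noise
        return np.sqrt(p_x / (p_n * 10 ** (snr_db / 10)))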
Returns: NoiseAugment object @@ -208,7 +218,7 @@ def forward(self, x): # decide whether to add noise or not with self.lock: - p = self.rng.random_sample() + p = self.rng.random() if p > self.noise_prob: # we don't add noise diff --git a/hyperion/np/augment/reverb_augment.py b/hyperion/np/augment/reverb_augment.py index cf4cc6cb..0b1f3596 100644 --- a/hyperion/np/augment/reverb_augment.py +++ b/hyperion/np/augment/reverb_augment.py @@ -39,7 +39,7 @@ class SingleReverbAugment(object): its first sample. preload_rirs: if True all RIRS are loaded into RAM. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -80,7 +80,7 @@ def __init__( self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -129,7 +129,7 @@ def forward(self, x): num_samples = x.shape[0] with self.lock: - rir_idx = self.rng.randint(len(self.rir_keys)) + rir_idx = self.rng.integers(len(self.rir_keys)) if self.preload_rirs: h = self.rirs[rir_idx] @@ -155,6 +155,7 @@ def forward(self, x): "h_max": h_max, "h_delay": h_delay, } + return y, info def __call__(self, x): @@ -176,7 +177,7 @@ class ReverbAugment(object): max_reverb_context: number of samples required as left context for the convolution operation. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -210,7 +211,7 @@ def __init__( self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -221,7 +222,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with reverb options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: ReverbAugment object. @@ -267,7 +268,7 @@ def forward(self, x): # decide whether to add reverb or not with self.lock: - p = self.rng.random_sample() + p = self.rng.random() if p > self.reverb_prob: # we don't add reverb diff --git a/hyperion/np/augment/speech_augment.py b/hyperion/np/augment/speech_augment.py index 0b1233f1..c27ca321 100644 --- a/hyperion/np/augment/speech_augment.py +++ b/hyperion/np/augment/speech_augment.py @@ -37,7 +37,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with noise options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: SpeechAugment object. diff --git a/hyperion/np/augment/speed_augment.py b/hyperion/np/augment/speed_augment.py index 18a15651..a648190d 100644 --- a/hyperion/np/augment/speed_augment.py +++ b/hyperion/np/augment/speed_augment.py @@ -22,7 +22,7 @@ class SpeedAugment(object): keep_length: applies padding or cropping to keep the lenght of the signal. random_seed: random seed for random number generator. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -34,14 +34,16 @@ def __init__( rng=None, ): logging.info( - "init speed augment with prob={}, speed_ratios={}, keep_length={}". 
- format(speed_prob, speed_ratios, keep_length)) + "init speed augment with prob={}, speed_ratios={}, keep_length={}".format( + speed_prob, speed_ratios, keep_length + ) + ) self.speed_prob = speed_prob self.speed_ratios = speed_ratios self.keep_length = keep_length if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -52,7 +54,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with noise options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: NoiseAugment object. @@ -84,7 +86,7 @@ def forward(self, x): """ # decide whether to add noise or not - p = self.rng.random_sample() + p = self.rng.random() if p > self.speed_prob: # we don't add speed perturbation info = {"speed_ratio": 1} @@ -98,14 +100,12 @@ def forward(self, x): # print(f"1 r={r} {x.shape} {y.shape}", flush=True) if self.keep_length: if r > 1: - dither = np.max(x) / 2**15 # we add some dither in the padding - pad_y = dither * np.ones( - (x.shape[-1] - y.shape[-1], ), dtype=y.dtype) + dither = np.max(x) / 2 ** 15 # we add some dither in the padding + pad_y = dither * np.ones((x.shape[-1] - y.shape[-1],), dtype=y.dtype) y = np.concatenate((y, pad_y), axis=-1) elif r < 1: - y = y[:x.shape[-1]] + y = y[: x.shape[-1]] - # print(f"2 r={r} {x.shape} {y.shape}", flush=True) return y, info def __call__(self, x): diff --git a/hyperion/np/classifiers/binary_logistic_regression.py b/hyperion/np/classifiers/binary_logistic_regression.py index 82a84529..e77115cd 100644 --- a/hyperion/np/classifiers/binary_logistic_regression.py +++ b/hyperion/np/classifiers/binary_logistic_regression.py @@ -29,7 +29,7 @@ class BinaryLogisticRegression(LogisticRegression): In this case, x becomes [x, bias_scaling], i.e. a “synthetic” feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight. Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) bias_scaling has to be increased. priors: prior prob for having a positive sample. - random_state: RandomState instance or None, optional, default: None + random_state: default_rng instance or None, optional, default: None Used when solver == ‘sag’ or ‘liblinear’. solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’ Algorithm to use in the optimization problem. diff --git a/hyperion/np/classifiers/greedy_fusion.py b/hyperion/np/classifiers/greedy_fusion.py index 842b850e..f03a05a0 100644 --- a/hyperion/np/classifiers/greedy_fusion.py +++ b/hyperion/np/classifiers/greedy_fusion.py @@ -42,8 +42,8 @@ class GreedyFusionBinaryLR(NPModel): In this case, x becomes [x, bias_scaling], i.e. a “synthetic” feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight. Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) bias_scaling has to be increased. priors: prior prob for having a positive sample. 
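The keep_length branch of SpeedAugment.forward shown above pads with low-level dither when the perturbed signal comes out shorter (ratio > 1) and crops when it comes out longer (ratio < 1). Extracted into a stand-alone sketch:

    import numpy as np

    def keep_length(x, y, r):
        """Pad or crop the speed-perturbed signal y back to the length of x."""
        if r > 1:                          # y is shorter than x
            dither = np.max(x) / 2 ** 15   # low-level dither for the padding
            pad_y = dither * np.ones((x.shape[-1] - y.shape[-1],), dtype=y.dtype)
            return np.concatenate((y, pad_y), axis=-1)
        if r < 1:                          # y is longer than x
            return y[: x.shape[-1]]
        return y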
- random_state: int, RandomState instance or None, optional, default: None - The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; . Used when solver == ‘sag’ or ‘liblinear’. + random_state: int, default_rng instance or None, optional, default: None + The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If default_rng instance, random_state is the random number generator; . Used when solver == ‘sag’ or ‘liblinear’. solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’ Algorithm to use in the optimization problem. For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and diff --git a/hyperion/np/classifiers/linear_gbe.py b/hyperion/np/classifiers/linear_gbe.py index a6b8c7cc..f551af14 100644 --- a/hyperion/np/classifiers/linear_gbe.py +++ b/hyperion/np/classifiers/linear_gbe.py @@ -10,7 +10,7 @@ from scipy.special import gammaln from ...hyp_defs import float_cpu -from ...utils.math import int2onehot, invert_pdmat, logdet_pdmat, softmax +from ...utils.math_funcs import int2onehot, invert_pdmat, logdet_pdmat, softmax from ..np_model import NPModel @@ -426,7 +426,8 @@ def add_class_args(parser, prefix=None): parser.add_argument("--name", default="lgbe", help="model name") if prefix is not None: outer_parser.add_argument( - "--" + prefix, action=ActionParser(parser=parser), + "--" + prefix, + action=ActionParser(parser=parser), ) @staticmethod @@ -468,7 +469,8 @@ def add_eval_args(parser, prefix=None): ) if prefix is not None: outer_parser.add_argument( - "--" + prefix, action=ActionParser(parser=parser), + "--" + prefix, + action=ActionParser(parser=parser), ) add_argparse_args = add_class_args diff --git a/hyperion/np/classifiers/linear_gbe_up.py b/hyperion/np/classifiers/linear_gbe_up.py index 8566aeab..37ac9656 100644 --- a/hyperion/np/classifiers/linear_gbe_up.py +++ b/hyperion/np/classifiers/linear_gbe_up.py @@ -9,8 +9,13 @@ from scipy.special import gammaln from ...hyp_defs import float_cpu -from ...utils.math import (fullcov_varfloor, int2onehot, invert_pdmat, - logdet_pdmat, softmax) +from ...utils.math_funcs import ( + fullcov_varfloor, + int2onehot, + invert_pdmat, + logdet_pdmat, + softmax, +) from ..np_model import NPModel from .linear_gbe import LinearGBE diff --git a/hyperion/np/classifiers/linear_svmc.py b/hyperion/np/classifiers/linear_svmc.py index 5d743a46..6a977df9 100644 --- a/hyperion/np/classifiers/linear_svmc.py +++ b/hyperion/np/classifiers/linear_svmc.py @@ -10,7 +10,7 @@ from sklearn.svm import LinearSVC as SVC from ...hyp_defs import float_cpu -from ...utils.math import softmax +from ...utils.math_funcs import softmax from ..np_model import NPModel @@ -41,7 +41,7 @@ class LinearSVMC(NPModel): The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). - random_state: RandomState instance or None, optional, default: None + random_state: default_rng instance or None, optional, default: None max_iter: int, default: 100 Useful only for the newton-cg, sag and lbfgs solvers. Maximum number of iterations taken for the solvers to converge. @@ -61,7 +61,7 @@ class LinearSVMC(NPModel): penalty and dual will be ignored. 
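The "balanced" class-weight rule quoted in the classifier docstrings here is n_samples / (n_classes * np.bincount(y)); a quick numeric check:

    import numpy as np

    y = np.array([0, 0, 0, 1])  # 3 samples of class 0, 1 sample of class 1
    weights = len(y) / (len(np.unique(y)) * np.bincount(y))
    # -> array([0.667, 2.0]); the minority class receives the larger weight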
verbose: int, default: 0 balance_class_weight: if True and class_weight is None, it makes class_weight="balanced". - lr_seed: seed form RandomState, used when random_state is None. + lr_seed: seed form default_rng, used when random_state is None. labels: list of class labels """ @@ -93,7 +93,7 @@ def __init__( class_weight = "balanced" if random_state is None: - random_state = np.random.RandomState(seed=lr_seed) + random_state = np.random.default_rng(seed=lr_seed) self.use_bias = use_bias self.bias_scaling = bias_scaling diff --git a/hyperion/np/classifiers/logistic_regression.py b/hyperion/np/classifiers/logistic_regression.py index 8e3d7e2e..4c4c0cfc 100644 --- a/hyperion/np/classifiers/logistic_regression.py +++ b/hyperion/np/classifiers/logistic_regression.py @@ -9,7 +9,7 @@ from sklearn.linear_model import LogisticRegression as LR from ...hyp_defs import float_cpu -from ...utils.math import softmax +from ...utils.math_funcs import softmax from ..np_model import NPModel @@ -36,7 +36,7 @@ class LogisticRegression(NPModel): Weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. - random_state: RandomState instance or None, optional, default: None + random_state: default_rng instance or None, optional, default: None Used when solver == ‘sag’ or ‘liblinear’. solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’ Algorithm to use in the optimization problem. 
@@ -93,7 +93,7 @@ def __init__( super().__init__(**kwargs) if random_state is None: - random_state = np.random.RandomState(seed=lr_seed) + random_state = np.random.default_rng(seed=lr_seed) if bias_scaling is None: if use_bias and solver == "liblinear": diff --git a/hyperion/np/classifiers/q_scoring_homo_gbe.py b/hyperion/np/classifiers/q_scoring_homo_gbe.py index 9e54e0f4..3345dd72 100644 --- a/hyperion/np/classifiers/q_scoring_homo_gbe.py +++ b/hyperion/np/classifiers/q_scoring_homo_gbe.py @@ -9,7 +9,7 @@ from scipy.special import gammaln from ...hyp_defs import float_cpu -from ...utils.math import int2onehot, invert_pdmat, logdet_pdmat, softmax +from ...utils.math_funcs import int2onehot, invert_pdmat, logdet_pdmat, softmax from ..np_model import NPModel diff --git a/hyperion/np/classifiers/svmc.py b/hyperion/np/classifiers/svmc.py index 6b54034b..ac5211ef 100644 --- a/hyperion/np/classifiers/svmc.py +++ b/hyperion/np/classifiers/svmc.py @@ -12,7 +12,7 @@ from sklearn.svm import SVC from ...hyp_defs import float_cpu -from ...utils.math import softmax +from ...utils.math_funcs import softmax from ...utils.misc import filter_func_args from ..np_model import NPModel @@ -49,7 +49,7 @@ def __init__( class_weight = "balanced" if random_state is None: - random_state = np.random.RandomState(seed=lr_seed) + random_state = np.random.default_rng(seed=lr_seed) self.C = C self.kernel = kernel diff --git a/hyperion/np/feats/energy_vad.py b/hyperion/np/feats/energy_vad.py index 5b9eb751..1d578c68 100644 --- a/hyperion/np/feats/energy_vad.py +++ b/hyperion/np/feats/energy_vad.py @@ -5,6 +5,7 @@ import logging import numpy as np +from jsonargparse import ActionParser, ArgumentParser from scipy.signal import lfilter from ...hyp_defs import float_cpu @@ -19,7 +20,7 @@ class EnergyVAD(object): sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (default = 16000) frame_length: Frame length in milliseconds (default = 25) frame_shift: Frame shift in milliseconds (default = 10) - dither: Dithering constant (0.0 means no dither) (default = 1) + dither: Dithering constant (0.0 means no dither) (default = 2^(-15)) snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (default = True) vad_energy_mean_scale: If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s*m + vad-energy-threshold (float, default = 0.5) vad_energy_threshold: Constant term in energy threshold for MFCC0 for VAD (also see --vad-energy-mean-scale) (float, default = 5) @@ -32,7 +33,7 @@ def __init__( sample_frequency=16000, frame_length=25, frame_shift=10, - dither=1, + dither=1 / 2 ** 15, snip_edges=True, vad_energy_mean_scale=0.5, vad_energy_threshold=5, @@ -97,7 +98,7 @@ def compute(self, x, return_loge=False): # add dither if self.dither > 0: - n = self.dither * np.random.RandomState(seed=len(x)).randn( + n = self.dither * np.random.default_rng(seed=len(x)).randn( len(x) ).astype(float_cpu(), copy=False) x = x + n @@ -174,14 +175,12 @@ def add_class_args(parser, prefix=None): parser: Arguments parser prefix: Options prefix. """ - - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
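The dither default follows the new wav_scale convention: one 16-bit LSB is 1.0 for integer-range samples but 1/2**15 once waveforms are normalized to [-1, 1]. A minimal sketch of dithering at that level with the Generator API (whose normal sampler is standard_normal):

    import numpy as np

    x = np.zeros(8, dtype=np.float32)  # silence in the [-1, 1] convention
    dither = 1.0 / 2 ** 15             # one int16 LSB at unit scale
    rng = np.random.default_rng(seed=len(x))
    x = x + dither * rng.standard_normal(len(x)).astype(np.float32)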
+ if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "sample-frequency", + "--sample-frequency", default=16000, type=int, help=( @@ -191,24 +190,21 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "frame-length", - type=int, - default=25, - help="Frame length in milliseconds", + "--frame-length", type=int, default=25, help="Frame length in milliseconds", ) parser.add_argument( - p1 + "frame-shift", type=int, default=10, help="Frame shift in milliseconds" + "--frame-shift", type=int, default=10, help="Frame shift in milliseconds" ) parser.add_argument( - p1 + "dither", + "--dither", type=float, - default=1, + default=1 / 2 ** 15, help="Dithering constant (0.0 means no dither)", ) parser.add_argument( - p1 + "snip-edges", + "--snip-edges", default=True, type=str2bool, help=( @@ -221,7 +217,7 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "vad-energy-mean-scale", + "--vad-energy-mean-scale", type=float, default=0.5, help=( @@ -231,13 +227,13 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "vad-energy-threshold", + "--vad-energy-threshold", type=float, default=5, help="Constant term in energy threshold for MFCC0 for VAD", ) parser.add_argument( - p1 + "vad-frames-context", + "--vad-frames-context", type=int, default=0, help=( @@ -246,7 +242,7 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "vad-proportion-threshold", + "--vad-proportion-threshold", type=float, default=0.6, help=( @@ -254,5 +250,7 @@ def add_class_args(parser, prefix=None): "the window that need to have more energy than the threshold" ), ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) add_argparse_args = add_class_args diff --git a/hyperion/np/feats/mfcc.py b/hyperion/np/feats/mfcc.py index cd98840d..b56728b8 100644 --- a/hyperion/np/feats/mfcc.py +++ b/hyperion/np/feats/mfcc.py @@ -6,6 +6,7 @@ from enum import Enum import numpy as np +from jsonargparse import ActionParser, ArgumentParser from scipy.fftpack import dct from scipy.signal import lfilter @@ -72,7 +73,7 @@ class MFCC(object): preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97) window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"blackmann") (default = 'povey') use_fft2: If true, it uses |X(f)|^2, if false, it uses |X(f)|, (default = True) - dither: Dithering constant (0.0 means no dither) (default = 1) + dither: Dithering constant (0.0 means no dither) (default = 1/2**15) fb_type: Filter-bank type: mel_kaldi, mel_etsi, mel_librosa, mel_librosa_htk, linear (default = 'mel_kaldi') low_freq: Low cutoff frequency for mel bins (default = 20) high_freq: High cutoff frequency for mel bins (if < 0, offset from Nyquist) (default = 0) @@ -98,7 +99,7 @@ def __init__( preemphasis_coeff=0.97, window_type="povey", use_fft2=True, - dither=1, + dither=1 / 2 ** 15, fb_type="mel_kaldi", low_freq=20, high_freq=0, @@ -256,7 +257,7 @@ def compute(self, x, return_fft=False, return_spec=False, return_logfb=False): # add dither if self.dither > 0: - n = self.dither * np.random.RandomState(seed=len(x)).randn( + n = self.dither * np.random.default_rng(seed=len(x)).randn( len(x) ).astype(float_cpu(), copy=False) x = x + n @@ -400,14 +401,12 @@ def add_class_args(parser, prefix=None): parser: Arguments parser prefix: Options prefix. """ - - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
+ if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "sample-frequency", + "--sample-frequency", default=16000, type=int, help="Waveform data sample frequency " @@ -415,27 +414,22 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "frame-length", - type=int, - default=25, - help="Frame length in milliseconds", - ) - parser.add_argument( - p1 + "frame-shift", type=int, default=10, help="Frame shift in milliseconds" + "--frame-length", type=int, default=25, help="Frame length in milliseconds", ) parser.add_argument( - p1 + "fft-length", type=int, default=512, help="Length of FFT" + "--frame-shift", type=int, default=10, help="Frame shift in milliseconds" ) + parser.add_argument("--fft-length", type=int, default=512, help="Length of FFT") parser.add_argument( - p1 + "remove-dc-offset", + "--remove-dc-offset", default=True, type=str2bool, help="Subtract mean from waveform on each frame", ) parser.add_argument( - p1 + "preemphasis-coeff", + "--preemphasis-coeff", type=float, default=0.97, help="Coefficient for use in signal preemphasis", @@ -444,30 +438,30 @@ def add_class_args(parser, prefix=None): FWF.add_class_args(parser, prefix) parser.add_argument( - p1 + "use-fft2", + "--use-fft2", default=True, type=str2bool, help="If true, it uses |X(f)|^2, if false, it uses |X(f)|", ) parser.add_argument( - p1 + "dither", + "--dither", type=float, - default=1, + default=1 / 2 ** 15, help="Dithering constant (0.0 means no dither)", ) FBF.add_class_args(parser, prefix) parser.add_argument( - p1 + "num-ceps", + "--num-ceps", type=int, default=13, help="Number of cepstra in MFCC computation (including C0)", ) parser.add_argument( - p1 + "snip-edges", + "--snip-edges", default=True, type=str2bool, help=( @@ -480,34 +474,34 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "energy-floor", + "--energy-floor", type=float, default=0, help="Floor on energy (absolute, not relative) in MFCC computation", ) parser.add_argument( - p1 + "raw-energy", + "--raw-energy", default=True, type=str2bool, help="If true, compute energy before preemphasis and windowing", ) parser.add_argument( - p1 + "use-energy", + "--use-energy", default=True, type=str2bool, help="Use energy (not C0) in MFCC computation", ) parser.add_argument( - p1 + "cepstral-lifter", + "--cepstral-lifter", type=float, default=22, help="Constant that controls scaling of MFCCs", ) parser.add_argument( - p1 + "input-step", + "--input-step", default="wave", choices=["wave", "fft", "spec", "log_spec", "logfb"], help=( @@ -516,7 +510,7 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "output-step", + "--output-step", default="mfcc", choices=["fft", "spec", "log_spec", "logfb", "mfcc"], help=( @@ -524,4 +518,7 @@ def add_class_args(parser, prefix=None): ), ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + add_argparse_args = add_class_args diff --git a/hyperion/np/metrics/__init__.py b/hyperion/np/metrics/__init__.py index 36afdbf5..d45daba5 100644 --- a/hyperion/np/metrics/__init__.py +++ b/hyperion/np/metrics/__init__.py @@ -5,7 +5,10 @@ from .acc import compute_accuracy from .confusion_matrix import * -from .dcf import (compute_act_dcf, compute_dcf, compute_min_dcf, - fast_eval_dcf_eer) +from .dcf import compute_act_dcf, compute_dcf, compute_min_dcf, fast_eval_dcf_eer from .eer import compute_eer, compute_prbep from .utils import effective_prior +from 
.verification_evaluator import ( + VerificationEvaluator, + VerificationAdvAttackEvaluator, +) diff --git a/hyperion/np/metrics/cllr.py b/hyperion/np/metrics/cllr.py index ec816286..cd97a97c 100644 --- a/hyperion/np/metrics/cllr.py +++ b/hyperion/np/metrics/cllr.py @@ -5,7 +5,7 @@ import numpy as np -from ..utils.math import neglogsigmoid +from ..utils.math_funcs import neglogsigmoid from .utils import opt_loglr diff --git a/hyperion/np/metrics/utils.py b/hyperion/np/metrics/utils.py index 0715d809..e638fd1b 100644 --- a/hyperion/np/metrics/utils.py +++ b/hyperion/np/metrics/utils.py @@ -8,7 +8,7 @@ import numpy as np from ...hyp_defs import float_cpu -from ...utils.math import logsumexp, softmax +from ...utils.math_funcs import logsumexp, softmax def effective_prior(p_tar, c_miss, c_fa): diff --git a/hyperion/np/metrics/verification_evaluator.py b/hyperion/np/metrics/verification_evaluator.py index 2adf15cf..e35e7cf7 100644 --- a/hyperion/np/metrics/verification_evaluator.py +++ b/hyperion/np/metrics/verification_evaluator.py @@ -2,8 +2,6 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - - import copy import logging import re @@ -18,13 +16,13 @@ import matplotlib.pyplot as plt from ...hyp_defs import float_cpu -from ...utils import TrialKey, TrialScores +from ...utils import TrialKey, TrialScores, SparseTrialKey, SparseTrialScores from ...utils.trial_stats import TrialStats from .dcf import fast_eval_dcf_eer from .utils import effective_prior -class VerificationEvaluator(object): +class VerificationEvaluator: """Class computes performance metrics for verification problems. Same metrics can be obtained from fast_eval_dcf_eer functions @@ -34,21 +32,40 @@ class VerificationEvaluator(object): p_tar: target prior float or list/nparray sorted in ascending order c_miss: cost of miss c_fa: cost of false alarm - + key_name: name describing the key + score_name: name describing the score + sparse: use sparse versions of TrialScores and Keys """ - def __init__(self, key, scores, p_tar, c_miss=None, c_fa=None): - + def __init__( + self, + key, + scores, + p_tar, + c_miss=None, + c_fa=None, + key_name=None, + score_name=None, + sparse=False, + ): if isinstance(key, str): - logging.info("Load key: %s" % key) - key = TrialKey.load(key) + logging.info("Load key: %s", key) + if sparse: + key = SparseTrialKey.load(key) + else: + key = TrialKey.load(key) if isinstance(scores, str): - logging.info("Load scores: %s" % scores) - scores = TrialScores.load(scores) + logging.info("Load scores: %s", scores) + if sparse: + scores = SparseTrialScores.load(scores) + else: + scores = TrialScores.load(scores) self.key = key self.scores = scores.align_with_ndx(key) + self.key_name = key_name + self.score_name = score_name # compute effective prior is c_miss and c_fa are given if isinstance(p_tar, float): @@ -56,13 +73,16 @@ def __init__(self, key, scores, p_tar, c_miss=None, c_fa=None): p_tar = np.asarray(p_tar) if c_miss is not None and c_fa is not None: + assert len(c_miss) == len(p_tar) + assert len(c_fa) == len(p_tar) c_miss = np.asarray(c_miss) c_fa = np.asarray(c_fa) p_tar = effective_prior(p_tar, c_miss, c_fa) + self._p_tar_sort = np.argsort(p_tar) self.p_tar = p_tar - def compute_dcf_eer(self, return_df=False): + def compute_dcf_eer(self, return_df=True): """ Computes DCF/EER @@ -74,24 +94,38 @@ def compute_dcf_eer(self, return_df=False): """ logging.info("separating tar/non") tar, non = self.scores.get_tar_non(self.key) + ntar = 
len(tar) + nnon = len(non) logging.info("computing EER/DCF") - min_dcf, act_dcf, eer, _ = fast_eval_dcf_eer(tar, non, self.p_tar) + min_dcf, act_dcf, eer, _ = fast_eval_dcf_eer( + tar, non, self.p_tar[self._p_tar_sort] + ) + min_dcf[self._p_tar_sort] = min_dcf.copy() + act_dcf[self._p_tar_sort] = act_dcf.copy() if not return_df: - return min_dcf, act_dcf, eer + return min_dcf, act_dcf, eer, ntar, nnon if len(self.p_tar) == 1: eer = [eer] min_dcf = [min_dcf] act_dcf = [act_dcf] - df = pd.DataFrame({"eer": eer}) - + df = pd.DataFrame( + { + "scores": [self.score_name], + "key": [self.key_name], + "eer": eer, + "eer(%)": eer * 100, + } + ) for i in range(len(min_dcf)): pi = self.p_tar[i] df["min-dcf-%.3f" % (pi)] = min_dcf[i] df["act-dcf-%.3f" % (pi)] = act_dcf[i] + df["num_targets"] = ntar + df["num_nontargets"] = nnon return df @@ -116,9 +150,7 @@ class VerificationAdvAttackEvaluator(VerificationEvaluator): def __init__( self, key, scores, attack_scores, attack_stats, p_tar, c_miss=None, c_fa=None ): - super(VerificationAdvAttackEvaluator, self).__init__( - key, scores, p_tar, c_miss, c_fa - ) + super().__init__(key, scores, p_tar, c_miss, c_fa) if not isinstance(attack_scores, list): attack_scores = [attack_scores] if not isinstance(attack_stats, list): @@ -133,7 +165,7 @@ def __init__( if isinstance(attack_scores[0], str): l = [] for file_path in attack_scores: - logging.info("Load attack scores: %s" % file_path) + logging.info("Load attack scores: %s", file_path) scores = TrialScores.load(file_path) l.append(scores) attack_scores = l @@ -151,7 +183,7 @@ def __init__( if isinstance(attack_stats[0], str): l = [] for file_path in attack_stats: - logging.info("Load attack stats: %s" % file_path) + logging.info("Load attack stats: %s", file_path) scores = TrialStats.load(file_path) l.append(scores) attack_stats = l @@ -216,7 +248,7 @@ def compute_dcf_eer_vs_stats( stat_bins, attacked_trials="all", higher_better=False, - return_df=False, + return_df=True, ): """ Computes DCF/EER versus SNR/Linf/etc curves @@ -307,7 +339,7 @@ def find_best_attacks( threshold=None, prior_idx=0, higher_better=False, - return_df=False, + return_df=True, ): """ Find the best attacks from the point of view of some of the stats. 
E.g., diff --git a/hyperion/np/pdfs/core/normal.py b/hyperion/np/pdfs/core/normal.py index b8f8bb54..67872315 100644 --- a/hyperion/np/pdfs/core/normal.py +++ b/hyperion/np/pdfs/core/normal.py @@ -7,11 +7,20 @@ import scipy.linalg as la from ....hyp_defs import float_cpu -from ....utils.math import (fullcov_varfloor, invert_pdmat, invert_trimat, - logdet_pdmat, symmat2vec, vec2symmat) -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.math_funcs import ( + fullcov_varfloor, + invert_pdmat, + invert_trimat, + logdet_pdmat, + symmat2vec, + vec2symmat, +) +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from .exp_family import ExpFamily @@ -213,7 +222,7 @@ def sample(self, num_samples, rng=None, seed=1024): assert self.is_init if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) return rng.multivariate_normal(self.mu, self.Sigma, size=(num_samples,)).astype( float_cpu() ) diff --git a/hyperion/np/pdfs/core/normal_diag_cov.py b/hyperion/np/pdfs/core/normal_diag_cov.py index c9986f4c..23535112 100644 --- a/hyperion/np/pdfs/core/normal_diag_cov.py +++ b/hyperion/np/pdfs/core/normal_diag_cov.py @@ -7,9 +7,12 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from .exp_family import ExpFamily @@ -183,7 +186,7 @@ def sample(self, num_samples, rng=None, seed=1024): """ assert self.is_init if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) x = rng.normal(size=(num_samples, self.x_dim)).astype(float_cpu()) return self.mu + 1.0 / self.cholLambda * x diff --git a/hyperion/np/pdfs/hmm/hmm.py b/hyperion/np/pdfs/hmm/hmm.py index 80232e36..92d9c371 100644 --- a/hyperion/np/pdfs/hmm/hmm.py +++ b/hyperion/np/pdfs/hmm/hmm.py @@ -6,7 +6,7 @@ import numpy as np from ....hyp_defs import float_cpu -from ....utils.math import logsumexp, softmax +from ....utils.math_funcs import logsumexp, softmax from ..core import PDF @@ -232,7 +232,7 @@ def viterbi_decode(self, x, nbest=1): def sample(self, num_seqs, num_steps, rng=None, seed=1024): if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) x = np.zeros((num_seqs, num_steps, self.num_states), dtype=float_cpu()) x[:, 0, :] = rng.multinomial(1, self.pi, size=(num_seqs,)) diff --git a/hyperion/np/pdfs/jfa/jfa_total.py b/hyperion/np/pdfs/jfa/jfa_total.py index 041431fb..6e2b79e3 100644 --- a/hyperion/np/pdfs/jfa/jfa_total.py +++ b/hyperion/np/pdfs/jfa/jfa_total.py @@ -7,8 +7,13 @@ from scipy import linalg as la from ....hyp_defs import float_cpu -from ....utils.math import (invert_pdmat, invert_trimat, logdet_pdmat, - symmat2vec, vec2symmat) +from ....utils.math_funcs import ( + invert_pdmat, + invert_trimat, + logdet_pdmat, + symmat2vec, + vec2symmat, +) from ..core.pdf import PDF diff --git a/hyperion/np/pdfs/mixtures/exp_family_mixture.py b/hyperion/np/pdfs/mixtures/exp_family_mixture.py index 5560882c..2186522e 100644 --- a/hyperion/np/pdfs/mixtures/exp_family_mixture.py +++ b/hyperion/np/pdfs/mixtures/exp_family_mixture.py @@ -7,7 +7,7 @@ import numpy as np from ....hyp_defs import float_cpu -from 
....utils.math import logsumexp, softmax +from ....utils.math_funcs import logsumexp, softmax from ....utils.queues import GeneratorQueue from ..core import PDF diff --git a/hyperion/np/pdfs/mixtures/gmm.py b/hyperion/np/pdfs/mixtures/gmm.py index ca197142..7b080dae 100644 --- a/hyperion/np/pdfs/mixtures/gmm.py +++ b/hyperion/np/pdfs/mixtures/gmm.py @@ -8,12 +8,22 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.math import (fullcov_varfloor, invert_pdmat, invert_trimat, - logdet_pdmat, logsumexp, softmax, symmat2vec, - vec2symmat) -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.math_funcs import ( + fullcov_varfloor, + invert_pdmat, + invert_trimat, + logdet_pdmat, + logsumexp, + softmax, + symmat2vec, + vec2symmat, +) +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from ...clustering import KMeans from ..core import Normal from .exp_family_mixture import ExpFamilyMixture @@ -292,7 +302,7 @@ def sample(self, num_samples, rng=None, seed=1024, r=None): Generated samples with shape (num_samples, x_dim). """ if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) if r is None: r = rng.multinomial(1, self.pi, size=(num_samples,)) diff --git a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py index 90141573..7589243e 100644 --- a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py +++ b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py @@ -8,10 +8,13 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.math import logsumexp, softmax -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.math_funcs import logsumexp, softmax +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from ...clustering import KMeans from .exp_family_mixture import ExpFamilyMixture @@ -262,7 +265,7 @@ def sample(self, num_samples=1, rng=None, seed=1024, r=None): Generated samples with shape (num_samples, x_dim). """ if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) if r is None: r = rng.multinomial(1, self.pi, size=(num_samples,)) diff --git a/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py index 4dc8f46e..6ef7c891 100644 --- a/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py +++ b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py @@ -7,10 +7,13 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.math import logsumexp, softmax -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.math_funcs import logsumexp, softmax +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from ...clustering import KMeans from .gmm_diag_cov import GMMDiagCov @@ -193,7 +196,7 @@ def sample(self, num_samples=1, rng=None, seed=1024, r=None): Generated samples with shape (num_samples, x_dim). 
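The sample() methods touched above all follow the same ancestral-sampling pattern: draw one-hot component indicators from the mixture weights, then draw from the selected component. An illustrative sketch with unit-variance components for brevity (not patch code):

    import numpy as np

    rng = np.random.default_rng(1024)
    pi = np.array([0.7, 0.3])                        # mixture weights
    mu = np.array([[0.0, 0.0], [5.0, 5.0]])          # component means
    num_samples = 4

    r = rng.multinomial(1, pi, size=(num_samples,))  # (num_samples, num_comp) one-hot
    comp = r.argmax(axis=1)                          # selected component per sample
    x = mu[comp] + rng.standard_normal((num_samples, 2))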
""" if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) if r is None: r = rng.multinomial(1, self.pi, size=(num_samples,)) diff --git a/hyperion/np/pdfs/plda/frplda.py b/hyperion/np/pdfs/plda/frplda.py index 183725a7..af8c5d8b 100644 --- a/hyperion/np/pdfs/plda/frplda.py +++ b/hyperion/np/pdfs/plda/frplda.py @@ -7,7 +7,7 @@ from scipy import linalg as sla from ....hyp_defs import float_cpu -from ....utils.math import invert_pdmat, invert_trimat, logdet_pdmat +from ....utils.math_funcs import invert_pdmat, invert_trimat, logdet_pdmat from .plda_base import PLDABase @@ -465,7 +465,7 @@ def sample( assert self.is_init if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) Sb = invert_pdmat(self.B, return_inv=True)[-1] chol_Sb = sla.cholesky(Sb, lower=False) diff --git a/hyperion/np/pdfs/plda/plda.py b/hyperion/np/pdfs/plda/plda.py index fd2eb9a9..76299970 100644 --- a/hyperion/np/pdfs/plda/plda.py +++ b/hyperion/np/pdfs/plda/plda.py @@ -7,7 +7,7 @@ from scipy import linalg as sla from ....hyp_defs import float_cpu -from ....utils.math import invert_pdmat, invert_trimat, logdet_pdmat +from ....utils.math_funcs import invert_pdmat, invert_trimat, logdet_pdmat from .plda_base import PLDABase @@ -674,7 +674,7 @@ def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): Generated samples with shape (num_samples, x_dim). """ if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) x_dim = self.mu.shape[0] diff --git a/hyperion/np/pdfs/plda/splda.py b/hyperion/np/pdfs/plda/splda.py index f9322d26..5d397183 100644 --- a/hyperion/np/pdfs/plda/splda.py +++ b/hyperion/np/pdfs/plda/splda.py @@ -6,7 +6,7 @@ from scipy import linalg as sla from ....hyp_defs import float_cpu -from ....utils.math import invert_pdmat, invert_trimat, logdet_pdmat +from ....utils.math_funcs import invert_pdmat, invert_trimat, logdet_pdmat from .plda_base import PLDABase @@ -502,7 +502,7 @@ def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): Generated samples with shape (num_samples, x_dim). """ if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) Sw = invert_pdmat(self.W, return_inv=True)[-1] chol_Sw = sla.cholesky(Sw, lower=False) diff --git a/hyperion/np/transforms/skl_tsne.py b/hyperion/np/transforms/skl_tsne.py index 3f60c4be..ebabc6ec 100644 --- a/hyperion/np/transforms/skl_tsne.py +++ b/hyperion/np/transforms/skl_tsne.py @@ -23,7 +23,7 @@ class SklTSNE(NPModel): metric: the metric to use when calculating distance between instances in ['cosine', 'euclidean', 'l1', 'l2', 'precomputed'] or callable function. init: initialization method in ['random', 'pca'] or embedding matrix of shape (num_samples, num_comp) verbose: verbosity level. 
- rng: RandomState instance + rng: default_rng instance rng_seed: seed for random number generator method: gradient calculation method in [‘barnes_hut’, 'exact'] angle: angle thetha in Barnes-Hut TSNE @@ -53,7 +53,7 @@ def __init__( super().__init__(**kwargs) self.rng_seed = rng_seed if rng is None: - rng = np.random.RandomState(seed=rng_seed) + rng = np.random.default_rng(seed=rng_seed) self._tsne = TSNE( n_components=tsne_dim, diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index fa675fdb..f91d7d96 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -304,6 +304,7 @@ def __getitem__(self, segment): x, fs = self._read_audio(seg_id, start, duration) x, fs = self._resample(x, fs) data = {"seg_id": seg_id, "sample_freq": fs} + if self.augmenters: # augmentations if duration == 0: @@ -324,6 +325,17 @@ def __getitem__(self, segment): seg_info = self._get_segment_info(seg_id) data.update(seg_info) + if np.any(~np.isfinite(data["x"])): + print( + "zzz", + x.max(), + x.min(), + x.mean(), + data["x"].max(), + data["x"].min(), + data["x"].mean(), + flush=True, + ) return data @staticmethod diff --git a/hyperion/torch/layers/audio_feats_factory.py b/hyperion/torch/layers/audio_feats_factory.py index a8398dac..6d0b4df4 100644 --- a/hyperion/torch/layers/audio_feats_factory.py +++ b/hyperion/torch/layers/audio_feats_factory.py @@ -315,7 +315,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--dither", type=float, - default=1, + default=1.0 / 2 ** 15, help="Dithering constant (0.0 means no dither)", ) diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index 06838ddd..29b6cdaa 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -7,11 +7,19 @@ from .transducer import RNNRNNTransducer, RNNTransducer from .vae.vae import VAE from .vae.vq_vae import VQVAE -from .wav2transducer import (HFWav2Vec2ConformerV1RNNTransducer, - HFWav2Vec2RNNRNNTransducer, - HFWav2Vec2RNNTransducer, HFWav2Vec2Transducer) -from .wav2xvectors import (HFHubert2ResNet1dXVector, HFWav2Vec2ResNet1dXVector, - HFWavLM2ResNet1dXVector) +from .wav2transducer import ( + HFWav2Vec2ConformerV1RNNTransducer, + HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, + HFWav2Vec2Transducer, +) +from .wav2xvectors import ( + HFHubert2ResNet1dXVector, + HFWav2Vec2ResNet1dXVector, + HFWavLM2ResNet1dXVector, + Wav2ResNetXVector, + Wav2ResNet1dXVector, +) from .xvectors.efficient_net_xvector import EfficientNetXVector from .xvectors.resnet1d_xvector import ResNet1dXVector from .xvectors.resnet_xvector import ResNetXVector diff --git a/hyperion/torch/models/plda/splda.py b/hyperion/torch/models/plda/splda.py index 2272793e..3a0f1dee 100644 --- a/hyperion/torch/models/plda/splda.py +++ b/hyperion/torch/models/plda/splda.py @@ -8,7 +8,7 @@ import torch import torch.nn as nn -from ...utils.math import invert_trimat +from ...utils.math_funcs import invert_trimat from .plda_base import PLDABase diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index c2bcdf99..24ab5bbb 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -224,7 +224,7 @@ def extract_embed( ): if vad_samples is not None: - x, x_lengths = remove_silence(x, x_lengths) + x, x_lengths = remove_silence(x, vad_samples, x_lengths) feats, _, feat_lengths = self.forward_feats( x, x_lengths, 
chunk_length=hf_chunk_length, detach_chunks=detach_chunks @@ -301,7 +301,7 @@ def set_train_mode(self, mode): logging.info("train mode set to %s", mode) - if "nograd" in mode: + if "nograd" in mode or mode == "ft-embed-affine": logging.info("using torch.no_grad for hf_feats") self._hf_context = torch.no_grad() else: diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py index 0d9f1bc4..0e4faded 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py @@ -52,3 +52,21 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = ResNet1dXVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNet1dXVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py index 1f7283a0..11d643af 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py @@ -52,3 +52,21 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = ResNetXVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNetXVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2xvector.py b/hyperion/torch/models/wav2xvectors/wav2xvector.py index 4c21f478..4bbc0c4c 100644 --- a/hyperion/torch/models/wav2xvectors/wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2xvector.py @@ -2,6 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import contextlib import logging from jsonargparse import ActionParser, ArgumentParser @@ -35,6 +36,23 @@ def __init__(self, feats, xvector): self.feats = feats self.xvector = xvector + self._feats_context = contextlib.nullcontext() + + @property + def sample_frequency(self): + return self.feats.sample_frequency + + def compute_prototype_affinity(self): + return self.xvector.compute_prototype_affinity() + + def update_loss_margin(self, epoch): + """Updates the value of the margin in AAM/AM-softmax losses + given the epoch number + + Args: + epoch: epoch which is about to start + """ + self.xvector.update_loss_margin(epoch) def rebuild_output_layer( self, @@ -58,8 +76,9 @@ def rebuild_output_layer( num_subcenters=num_subcenters, ) - def compute_prototype_affinity(self): - return self.xvector.compute_prototype_affinity() + def change_config(self, xvector): + logging.info("changing 
wav2xvector config") + self.xvector.change_config(**xvector) def forward( self, @@ -73,15 +92,28 @@ def forward( return_output=True, ): - if vad_samples is not None: - x, x_lengths = remove_silence(x, x_lengths) - feats, feat_lengths = self.feats(x, x_lengths) - if vad_feats is not None: - feats, feat_lengths = remove_silence(feats, feat_lengths) - - # feat_lengths = torch.div(x_lengths * feats.size(-1), x.size(-1)) - return self.xvector(feats, feat_lengths, y, enc_layers, classif_layers, - return_output) + with self._feats_context: + if vad_samples is not None: + x, x_lengths = remove_silence(x, vad_samples, x_lengths) + + feats, feat_lengths = self.feats(x, x_lengths) + if vad_feats is not None: + feats, feat_lengths = remove_silence(feats, vad_feats, feat_lengths) + + n = torch.sum(~torch.isfinite(feats)) + if n > 0: + print( + "feats", + n, + torch.sum(torch.isnan(feats)), + torch.sum(torch.any(torch.isnan(x), dim=-1)), + x.dtype, + feats.dtype, + flush=True, + ) + return self.xvector( + feats, feat_lengths, y, enc_layers, classif_layers, return_output + ) def extract_embed( self, @@ -94,18 +126,54 @@ def extract_embed( detach_chunks=False, ): - if vad_samples is not None: - x, x_lengths = remove_silence(x, x_lengths) - feats, feat_lengths = self.feats(x, x_lengths) - if vad_feats is not None: - feats, feat_lengths = remove_silence(feats, feat_lengths) + with self._feats_context: + if vad_samples is not None: + x, x_lengths = remove_silence(x, vad_samples, x_lengths) - feats = feats.transpose(1, 2) - return self.xvector.extract_embed(feats, feat_lengths, chunk_length, - embed_layer, detach_chunks) + feats, feat_lengths = self.feats(x, x_lengths) + if vad_feats is not None: + feats, feat_lengths = remove_silence(feats, vad_feats, feat_lengths) + + chunk_length = int(chunk_length * feats.shape[1] / x.shape[-1]) + + return self.xvector.extract_embed( + feats, feat_lengths, chunk_length, embed_layer, detach_chunks + ) def set_train_mode(self, mode): - self.xvector.set_train_mode(mode) + if mode == self._train_mode: + return + + if mode == "full-feats-grad": + self._feats_context = contextlib.nullcontext() + xvector_mode = "full" + else: + logging.info("using torch.no_grad for feats") + self._feats_context = torch.no_grad() + + self.xvector.set_train_mode(xvector_mode) + self._train_mode = mode + + def _train(self, train_mode: str): + + self.feats.train() + if train_mode in ["frozen"]: + super()._train(train_mode) + elif train_mode in ["full-feats-grad", "full"]: + self.xvector._train("full") + elif train_mode == "ft-embed-affine": + self.xvector._train("ft-embed_affine") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "full-feats-grad", + ] def get_config(self): feat_cfg = self.feats.get_config() @@ -119,7 +187,7 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) @staticmethod - def filter_args(*kwargs): + def filter_args(**kwargs): """Filters Wav2XVector class arguments from arguments dictionary. 
Args: @@ -150,5 +218,4 @@ def add_class_args(parser, prefix=None): AudioFeatsMVN.add_class_args(parser, prefix="feats") if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/audio_feats_mvn.py b/hyperion/torch/narchs/audio_feats_mvn.py index a9ad224e..440c22b6 100644 --- a/hyperion/torch/narchs/audio_feats_mvn.py +++ b/hyperion/torch/narchs/audio_feats_mvn.py @@ -50,6 +50,10 @@ def __init__( self.trans = trans self.aug_after_mvn = aug_after_mvn + @property + def sample_frequency(self): + return self.audio_feats.fs + @property def fs(self): return self.audio_feats.fs diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py index 0cb887ca..e7020e1d 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -2,11 +2,11 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os from collections import OrderedDict as ODict from copy import deepcopy from enum import Enum from typing import Optional +from pathlib import Path import torch import torch.nn as nn @@ -110,13 +110,11 @@ def valid_train_modes(): return ["full", "frozen"] def save(self, file_path): - file_dir = os.path.dirname(file_path) - if not (os.path.isdir(file_dir)): - os.makedirs(file_dir, exist_ok=True) - - config = self.get_config() + file_path = Path(file_path) + file_path.parent.mkdir(parents=True, exist_ok=True) torch.save( - {"model_cfg": self.get_config(), "model_state_dict": self.state_dict()} + {"model_cfg": self.get_config(), "model_state_dict": self.state_dict()}, + file_path, ) @staticmethod @@ -176,7 +174,7 @@ def _fix_cfg_compatibility(class_obj, cfg): Fixed configuration dictionary. 
""" # for compatibility with older x-vector models - XVector = torch_model_registry["xvector"] + XVector = TorchModel.registry["XVector"] if issubclass(class_obj, XVector): # We renamed AM-softmax scale parameer s to cos_scale if "s" in cfg: @@ -195,8 +193,9 @@ def auto_load(file_path, extra_objs={}, map_location=None): cfg = model_data["model_cfg"] class_name = cfg["class_name"] del cfg["class_name"] - if class_name in torch_model_registry: - class_obj = torch_model_registry[class_name] + print(TorchModel.registry) + if class_name in TorchModel.registry: + class_obj = TorchModel.registry[class_name] elif class_name in extra_objs: class_obj = extra_objs[class_name] else: diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py index fe72339f..4d4dd55a 100644 --- a/hyperion/utils/class_info.py +++ b/hyperion/utils/class_info.py @@ -100,3 +100,19 @@ def cat(cls, tables): ) df["class_idx"].drop(columns=["class_idx"], inplace=True) return cls(df) + + def filter( + self, + predicate=None, + items=None, + iindex=None, + columns=None, + by="id", + keep=True, + rebuild_idx=False, + ): + new_class_info = super().filter(predicate, items, iindex, columns, by, keep) + if rebuild_idx: + new_class_info.add_class_idx() + + return new_class_info diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index d1d969fb..dd446576 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -4,13 +4,14 @@ """ import logging from pathlib import Path -from typing import Dict, Optional, Union +from typing import List, Dict, Optional, Union from copy import deepcopy import math import numpy as np import pandas as pd import yaml +from .info_table import InfoTable from .class_info import ClassInfo from .feature_set import FeatureSet from .misc import PathLike @@ -30,7 +31,7 @@ class Dataset: Attributes: segments: SegmentSet object or path to it. 
classes: Dictionary of ClassInfo objects or paths to then - recordings: Dictionary of RecordingSet objects or paths to then + recordings: RecordingSet object or paths to then features: Dictionary of FeatureSet objects or paths to then enrollments: Dictionary of EnrollmentMap objects or paths to then trials: Dictionary of TrialKey/TrialNdx/SparseTrialKey objects @@ -45,7 +46,7 @@ def __init__( self, segments: Union[SegmentSet, PathLike], classes: Optional[Dict[str, Union[ClassInfo, PathLike]]] = None, - recordings: Optional[Dict[str, Union[RecordingSet, PathLike]]] = None, + recordings: Optional[Union[RecordingSet, PathLike]] = None, features: Optional[Dict[str, Union[FeatureSet, PathLike]]] = None, enrollments: Optional[Dict[str, Union[EnrollmentMap, PathLike]]] = None, trials: Optional[ @@ -65,24 +66,65 @@ def __init__( self._classes, self._classes_paths = self._parse_dict_args(classes, ClassInfo) - self._recordings, self._recordings_paths = self._parse_dict_args( - recordings, RecordingSet - ) + if isinstance(recordings, RecordingSet): + self._recordings = recordings + self._recordings_path = None + else: + assert isinstance(recordings, (str, Path)) + self._recordings = None + self._recordings_path = Path(recordings) + + # self._recordings, self._recordings_paths = self._parse_dict_args( + # recordings, RecordingSet + # ) self._features, self._features_paths = self._parse_dict_args( features, FeatureSet ) self._enrollments, self._enrollments_paths = self._parse_dict_args( - enrollments, - EnrollmentMap, + enrollments, EnrollmentMap, ) self._trials, self._trials_paths = self._parse_dict_args( - trials, - (TrialKey, TrialNdx, SparseTrialKey), + trials, (TrialKey, TrialNdx, SparseTrialKey), ) self.sparse_trials = sparse_trials self.table_sep = table_sep + self._files_to_delete = [] + + def get_dataset_files(self): + file_paths = [] + for file_path in [self._segments_path, self._recordings_path]: + if file_path is not None: + file_paths.append(file_path) + + for path_dict in [ + self._features_paths, + self._enrollments_paths, + self._trials_paths, + ]: + if path_dict is None: + continue + for k, v in path_dict.items(): + file_paths.append(v) + + return file_paths + + def _delete_files(self, dataset_dir): + if not self._files_to_delete: + return + + dataset_files = self.get_dataset_files() + for file_path in self._files_to_delete: + file_path = Path(file_path) + # if the file has been added again we don't delete + if file_path in dataset_files: + continue + + # if we are saving the dataset to another location + # we don't delete the one in the original + if file_path.parent == dataset_dir and file_path.is_file(): + file_path.unlink() def _parse_dict_args(self, data, types): if data is None: @@ -109,17 +151,38 @@ def segments(self, keep_loaded: bool = True): return self._segments - def recordings_value(self, key: str, keep_loaded: bool = True): - if self._recordings[key] is None: - assert self._recordings_paths[key] is not None - recordings = RecordingSet.load( - self._recordings_paths[key], sep=self.table_sep - ) + def __len__(self): + return len(self.segments()) + + def recordings(self, keep_loaded: bool = True): + if self._recordings is None: + assert self._recordings_path is not None + recordings = RecordingSet.load(self._recordings_path, sep=self.table_sep) if keep_loaded: - self._recordings[key] = recordings + self._recordings = recordings return recordings - return self._recordings[key] + return self._recordings + + # def recordings_value(self, key: str, keep_loaded: bool = True): + # 
if self._recordings[key] is None: + # assert self._recordings_paths[key] is not None + # recordings = RecordingSet.load( + # self._recordings_paths[key], sep=self.table_sep + # ) + # if keep_loaded: + # self._recordings[key] = recordings + # return recordings + + # return self._recordings[key] + + def features_keys(self): + if self._features is not None: + return self._features.keys() + elif self._features_path is not None: + return self._features_path.keys() + else: + return {} def features_value(self, key: str, keep_loaded: bool = True): if self._features[key] is None: @@ -131,6 +194,14 @@ def features_value(self, key: str, keep_loaded: bool = True): return self._features[key] + def classes_keys(self): + if self._classes is not None: + return self._classes.keys() + elif self._classes_path is not None: + return self._classes_path.keys() + else: + return {} + def classes_value(self, key: str, keep_loaded: bool = True): if self._classes[key] is None: assert self._classes_paths[key] is not None @@ -170,12 +241,12 @@ def trials_value(self, key: str, keep_loaded: bool = True): return self._trials[key] - def recordings(self, keep_loaded: bool = True): - if self._recordings is None: - yield from () - else: - for key in self._recordings.keys(): - yield key, self.recordings_value(key, keep_loaded) + # def recordings(self, keep_loaded: bool = True): + # if self._recordings is None: + # yield from () + # else: + # for key in self._recordings.keys(): + # yield key, self.recordings_value(key, keep_loaded) def features(self, keep_loaded: bool = True): if self._features is None: @@ -299,7 +370,6 @@ def save_changed( dataset_path: PathLike, update_paths: bool = True, table_sep: Optional[str] = None, - force_save_all: bool = False, ): """Saves the tables that change in disk or tables that are not in the ouput directory. 
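After this refactor a Dataset carries a single RecordingSet (or the path to it) rather than a dictionary of named recording sets: recordings() loads and caches it lazily, set_recordings()/remove_recordings() replace the keyed add/remove variants, and removed tables are only unlinked from disk at the next save through _delete_files(). A hedged usage sketch; the file paths are placeholders and only the class and method names come from the patch:

from hyperion.utils.dataset import Dataset  # module path as in this patch

dset = Dataset(
    segments="data/train/segments.csv",        # placeholder paths
    recordings="data/train/recordings.csv",
)
recs = dset.recordings()                        # the single RecordingSet, loaded on first access
dset.set_recordings("data/train/recordings_fixed.csv", update_seg_durs=False)
dset.remove_recordings()                        # file is queued for deletion at the next save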
@@ -330,24 +400,36 @@ def save_changed( if update_paths: self._segments_path = file_path - if self._recordings is not None: - file_names = {} - for k in self._recordings.keys(): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._recordings[k] is not None - or file_path != self._recordings_paths[k] - or not file_path.exists() - ): - v = self.recordings_value(k, keep_loaded=False) - v.save(file_path, sep=table_sep) - if update_paths: - self._recordings_paths[k] = file_path - - if file_names: - dataset["recordings"] = file_names + file_name = f"recordings{table_ext}" + dataset["recordings"] = file_name + file_path = dataset_dir / file_name + if ( + self._recordings is not None + or file_path != self._recordings_path + or not file_path.exists() + ): + self.recordings(keep_loaded=False).save(file_path, sep=table_sep) + if update_paths: + self._recordings_path = file_path + + # if self._recordings is not None: + # file_names = {} + # for k in self._recordings.keys(): + # file_name = k + table_ext + # file_names[k] = file_name + # file_path = dataset_dir / file_name + # if ( + # self._recordings[k] is not None + # or file_path != self._recordings_paths[k] + # or not file_path.exists() + # ): + # v = self.recordings_value(k, keep_loaded=False) + # v.save(file_path, sep=table_sep) + # if update_paths: + # self._recordings_paths[k] = file_path + + # if file_names: + # dataset["recordings"] = file_names if self._features is not None: file_names = {} @@ -428,6 +510,8 @@ def save_changed( with open(dataset_file, "w") as f: yaml.dump(dataset, f) + self._delete_files(dataset_dir) + def save_all( self, dataset_path: PathLike, @@ -457,17 +541,24 @@ def save_all( if update_paths: self._segments_path = file_path - file_names = {} - for k, v in self.recordings(keep_loaded=False): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - v.save(file_path, sep=table_sep) - if update_paths: - self._recordings_paths[k] = file_path + file_name = f"recordings{table_ext}" + dataset["recordings"] = file_name + file_path = dataset_dir / file_name + self.recordings(keep_loaded=False).save(file_path, sep=table_sep) + if update_paths: + self._recordings_path = file_path - if file_names: - dataset["recordings"] = file_names + # file_names = {} + # for k, v in self.recordings(keep_loaded=False): + # file_name = k + table_ext + # file_names[k] = file_name + # file_path = dataset_dir / file_name + # v.save(file_path, sep=table_sep) + # if update_paths: + # self._recordings_paths[k] = file_path + + # if file_names: + # dataset["recordings"] = file_names file_names = {} for k, v in self.features(keep_loaded=False): @@ -520,10 +611,13 @@ def save_all( with open(dataset_file, "w") as f: yaml.dump(dataset, f) + self._delete_files(dataset_dir) + def update_from_disk(self): self.segments() - for k, v in self.recordings(): - pass + self.recordings() + # for k, v in self.recordings(): + # pass for k, v in self.features(): pass @@ -568,9 +662,10 @@ def load( classes[k] = Dataset.resolve_file_path(dataset_dir, v) if "recordings" in dataset: - recordings = {} - for k, v in dataset["recordings"].items(): - recordings[k] = Dataset.resolve_file_path(dataset_dir, v) + recordings = Dataset.resolve_file_path(dataset_dir, dataset["recordings"]) + # recordings = {} + # for k, v in dataset["recordings"].items(): + # recordings[k] = Dataset.resolve_file_path(dataset_dir, v) if "features" in dataset: features = {} @@ -615,32 +710,42 @@ def 
add_features(self, features_name: str, features: Union[PathLike, FeatureSet] else: raise ValueError() - def add_recordings( - self, - recordings_name: str, - recordings: Union[PathLike, RecordingSet], + def set_segments( + self, segments: Union[PathLike, SegmentSet], update_seg_durs: bool, ): - if self._recordings is None: - self._recordings = {} - self._recordings_paths = {} + if isinstance(segments, (str, Path)): + self._segments = None + self._segments_path = segments + elif isinstance(segments, SegmentSet): + self._segments = segments + self._segments_path = None + else: + raise ValueError() - if isinstance(features, (str, Path)): - self._recordings[features_name] = None - self._recordings_paths[recordings_name] = recordings + def set_recordings( + self, recordings: Union[PathLike, RecordingSet], update_seg_durs: bool, + ): + if isinstance(recordings, (str, Path)): + self._recordings = None + self._recordings_path = Path(recordings) elif isinstance(recordings, RecordingSet): - self._recordings[recordings_name] = recordings - self._recordings_paths[recordings_name] = None + self._recordings = recordings + self._recordings_path = None else: raise ValueError() + if update_seg_durs: + rec_ids = self.segments(keep_loaded=True).recordings() + self.segments()["duration"] = self.recordings().loc[rec_ids, "duration"] + def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]): if self._classes is None: self._classes = {} self._classes_paths = {} if isinstance(classes, (str, Path)): - self._classes[features_name] = None - self._classes_paths[classes_name] = classes + self._classes[classes_name] = None + self._classes_paths[classes_name] = Path(classes) elif isinstance(classes, ClassInfo): self._classes[classes_name] = classes self._classes_paths[classes_name] = None @@ -648,9 +753,7 @@ def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]): raise ValueError() def add_enrollments( - self, - enrollments_name: str, - enrollments: Union[PathLike, EnrollmentMap], + self, enrollments_name: str, enrollments: Union[PathLike, EnrollmentMap], ): if self._enrollments is None: self._enrollments = {} @@ -658,7 +761,7 @@ def add_enrollments( if isinstance(enrollments, (str, Path)): self._enrollments[enrollments_name] = None - self._enrollments_paths[enrollments_name] = enrollments + self._enrollments_paths[enrollments_name] = Path(enrollments) elif isinstance(enrollments, EnrollmentMap): self._enrollments[enrollments_name] = enrollments self._enrollments_paths[enrollments_name] = None @@ -675,8 +778,8 @@ def add_trials( self._trials_paths = {} if isinstance(trials, (str, Path)): - self._trials[features_name] = None - self._trials_paths[trials_name] = trials + self._trials[trials_name] = None + self._trials_paths[trials_name] = Path(trials) elif isinstance(trials, (TrialKey, TrialNdx, SparseTrialKey)): self._trials[trials_name] = trials self._trials_paths[trials_name] = None @@ -685,85 +788,104 @@ def add_trials( def remove_features(self, features_name: str): if self._features_paths[features_name] is not None: - file_path = Path(self._features_paths[features_name]) - if file_path.is_file(): - file_path.unlink() + self._files_to_delete.append(self._features_paths[features_name]) del self._features[features_name] del self._features_paths[features_name] - def remove_recordings( - self, - recordings_name: str, - ): - if self._recordingsr_paths[recordings_name] is not None: - file_path = Path(self._recordings_paths[recordings_name]) - if file_path.is_file(): - 
file_path.unlink() + def remove_recordings(self,): + if self._recordings_path is not None: + self._files_to_delete.append(self._recordings_path) - del self._recordings[recordings_name] - del self._recordings_paths[recordings_name] + self._recordings = None + self._recordings_path = None + + # def remove_recordings( + # self, + # recordings_name: str, + # ): + # if self._recordingsr_paths[recordings_name] is not None: + # file_path = Path(self._recordings_paths[recordings_name]) + # if file_path.is_file(): + # file_path.unlink() + + # del self._recordings[recordings_name] + # del self._recordings_paths[recordings_name] def remove_classes(self, classes_name: str): if self._classes_paths[classes_name] is not None: - file_path = Path(self._classes_paths[classes_name]) - if file_path.is_file(): - file_path.unlink() + self._files_to_delete.append(self._class_paths[class_name]) del self._classes[classes_name] del self._classes_paths[classes_name] def remove_enrollments( - self, - enrollments_name: str, + self, enrollments_name: str, ): if self._enrollments_paths[enrollments_name] is not None: - file_path = Path(self._enrollments_paths[enrollments_name]) - if file_path.is_file(): - file_path.unlink() + self._files_to_delete.append(self._enrollments_paths[enrollments_name]) del self._enrollments[enrollments_name] del self._enrollments_paths[enrollments_name] def remove_trials( - self, - trials_name: str, + self, trials_name: str, ): if self._trials_paths[trials_name] is not None: - file_path = Path(self._trials_paths[trials_name]) - if file_path.is_file(): - file_path.unlink() + self._files_to_delete.append(self._trials_paths[trials_name]) del self._trials[trials_name] del self._trials_paths[trials_name] - def set_segments(self, segments: Union[PathLike, SegmentSet]): - if isinstance(segments, SegmentSet): - self._segments = segments - else: - self._segments_path = segments + def add_cols_to_segments( + self, + right_table: Union[InfoTable, pd.DataFrame, PathLike], + column_names: Union[None, str, List[str], np.ndarray] = None, + on: Union[str, List[str], np.ndarray] = "id", + right_on: Union[None, str, List[str], np.ndarray] = None, + ): + if isinstance(right_table, (str, Path)): + file_path = Path(right_table) + if file_path.is_file(): + right_table = InfoTable.load(file_path) + else: + if right_table == "recordings": + right_table = self.recordings() + elif right_table in self.features_keys(): + right_table = self.features_value(right_table) + elif right_table in self.classes_keys(): + right_table = self.classes_value + else: + raise ValueError("%s not found", right_table) + + segments = self.segments(keep_loaded=True) + segments.add_columns(right_table, column_names, on=on, right_on=right_on) - def clean(self): - rec_ids = self.segments().recording_ids() - for k, table in self.recordings(): - table = table.loc[table["id"].isin(rec_ids)].copy() - self._recordings[k] = RecordingSet(table) + def clean(self, rebuild_class_idx=False): + rec_ids = self.segments().recordings() + # for k, table in self.recordings(): + # # table = table.loc[table["id"].isin(rec_ids)].copy() + # # self._recordings[k] = RecordingSet(table) + self._recordings = self.recordings().filter(lambda df: df["id"].isin(rec_ids)) ids = self.segments()["id"].values for k, table in self.features(): - table = table.loc[table["id"].isin(ids)].copy() - self._features[k] = FeatureSet(table) + self._features[k] = table.filter(lambda df: df["id"].isin(ids)) + # table = table.loc[table["id"].isin(ids)].copy() + # self._features[k] = 
FeatureSet(table) for k, table in self.classes(): class_ids = self.segments()[k].unique() - table = table[table["id"].isin(class_ids)].copy() - self._classes[k] = ClassInfo(table) + self._classes[k] = table.filter(lambda df: df["id"].isin(class_ids)) + # table = table[table["id"].isin(class_ids)].copy() + # self._classes[k] = ClassInfo(table) remove_keys = [] for k, table in self.enrollments(): - table = table.loc[table["segmentid"].isin(ids)].copy() + # table = table.loc[table["segmentid"].isin(ids)].copy() + table = table.filter(lambda df: df["segmentid"].isin(ids)) if len(table) > 0: - self._enrollments[k] = EnrollmentMap(table) + self._enrollments[k] = table else: remove_keys.append(k) @@ -790,7 +912,7 @@ def _split_into_trials_and_cohort( seed: int, ): # select test speakers - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) spks = segments["speaker"].unique() trial_spks = rng.choice(spks, size=(num_trial_speakers,), replace=False) @@ -859,20 +981,14 @@ def split_into_trials_and_cohort( segments_male = SegmentSet(segments[segments["gender"] == "m"]) segments_female = SegmentSet(segments[segments["gender"] == "f"]) trials_male, enroll_male, cohort_male = self._split_into_trials_and_cohort( - segments_male, - num_tar_trials, - num_trial_speakers, - seed, + segments_male, num_tar_trials, num_trial_speakers, seed, ) ( trials_female, enroll_female, cohort_female, ) = self._split_into_trials_and_cohort( - segments_female, - num_tar_trials, - num_trial_speakers, - seed, + segments_female, num_tar_trials, num_trial_speakers, seed, ) trials = TrialKey.merge([trials_male, trials_female]) enroll = EnrollmentMap.cat([enroll_male, enroll_female]) @@ -880,10 +996,7 @@ def split_into_trials_and_cohort( else: segments = self.segments() trials, enroll, cohort = self._split_into_trials_and_cohort( - segments, - num_tar_trials, - num_trial_speakers, - seed, + segments, num_tar_trials, num_trial_speakers, seed, ) dataset_trials = self.clone() @@ -899,3 +1012,176 @@ def split_into_trials_and_cohort( dataset_cohort.clean() return dataset_trials, dataset_cohort + + def remove_short_segments(self, min_length: float, length_name: str = "duration"): + segments = self.segments() + self._segments = segments.filter(lambda df: df[length_name] >= min_length) + self.clean() + + def remove_classes_few_segments( + self, class_name: str, min_segs: int, rebuild_idx: bool = False, + ): + segments = self.segments() + classes, counts = np.unique(segments[class_name], return_counts=True) + keep_classes = classes[counts >= min_segs] + self._segments = segments.filter(lambda df: df[class_name].isin(keep_classes)) + self.clean() + if rebuild_idx: + class_info = self.classes_value(class_name) + class_info.add_class_idx() + + def rebuild_class_idx(self, class_name: str): + class_info = self.classes_value(class_name) + class_info.add_class_idx() + + def _segments_split(self, val_prob: float, rng: np.random.Generator): + segments = self.segments() + p = rng.permutation(len(segments)) + num_train = int(round((1 - val_prob) * len(p))) + + train_idx = p[:num_train] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_idx = p[num_train:] + val_segs = segments.filter(iindex=val_idx) + val_segs.sort() + + return train_segs, val_segs + + def _segments_split_joint_classes( + self, + val_prob: float, + joint_classes: List[str], + min_train_samples: int, + rng: np.random.Generator, + ): + segments = self.segments() + classes = segments[joint_classes].apply("-".join, axis=1) + u_classes, 
class_ids = np.unique(classes, return_inverse=True) + train_mask = np.zeros(len(segments), dtype=bool) + kk = 0 + for c_id in range(len(u_classes)): + idx = (class_ids == c_id).nonzero()[0] + count = len(idx) + p = rng.permutation(count) + num_train = max( + int(round((1 - val_prob) * count)), min(min_train_samples, count) + ) + kk += count - num_train + train_idx = idx[p[:num_train]] + train_mask[train_idx] = True + + train_idx = train_mask.nonzero()[0] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_segs = segments.filter(iindex=train_idx, keep=False) + val_segs.sort() + + return train_segs, val_segs + + def _segments_split_disjoint_classes( + self, val_prob: float, disjoint_classes: List[str], rng: np.random.Generator, + ): + segments = self.segments() + classes = segments[disjoint_classes].apply("-".join, axis=1) + u_classes, class_ids = np.unique(classes, return_inverse=True) + p = rng.permutation(len(u_classes)) + class_ids = p[class_ids] + num_train = int(round((1 - val_prob) * len(segments))) + train_mask = np.zeros(len(segments), dtype=bool) + count_acc = 0 + for c_id in range(len(u_classes)): + idx = (class_ids == c_id).nonzero()[0] + train_mask[idx] = True + count = len(idx) + count_acc += count + if count_acc >= num_train: + break + + train_idx = train_mask.nonzero()[0] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_segs = segments.filter(iindex=train_idx, keep=False) + val_segs.sort() + + return train_segs, val_segs + + def _segments_split_joint_and_disjoint_classes( + self, + val_prob: float, + joint_classes: List[str], + disjoint_clases: List[str], + min_train_samples: int, + rng: np.random.Generator, + ): + raise NotImplementedError("I'll implement this when I need it") + segments = self.segments() + j_classes = segments[joint_classes].apply("-".join, axis=1) + ju_classes, j_class_ids = np.unique(j_classes, return_inverse=True) + d_classes = segments[disjoint_classes].apply("-".join, axis=1) + du_classes, d_class_ids = np.unique(d_classes, return_inverse=True) + d_p = rng.permutation(len(du_classes)) + d_class_ids = d_p[d_class_ids] + d_sort_idx = np.argsort(d_class_ids) + d_sort_j_class_ids = j_class_ids[d_sort_idx] + + train_d_classes = set() + for c_id in range(len(ju_classes)): + idx = (j_sort_class_ids == c_id).nonzero()[0] + count = len(idx) + num_train = max( + int(round((1 - val_prob) * count)), min(min_train_samples, count) + ) + sel_d_class_ids = set(d_sort_idx[:num_train]) + train_d_classes = train_d_classes.union(sel_d_class_ids) + + train_mask = np.zeros(len(segments), dtype=bool) + for c_id in train_d_classes: + mask = d_class_ids == c_id + train_mask[mask] = True + + train_idx = train_mask.nonzero()[0] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_segs = segments.filter(iindex=train_idx, keep=False) + val_segs.sort() + + return train_segs, val_segs + + def split_train_val( + self, + val_prob: float, + joint_classes: Optional[List[str]] = None, + disjoint_classes: Optional[List[str]] = None, + min_train_samples: int = 1, + seed: int = 11235813, + ): + rng = np.random.default_rng(seed) + if joint_classes is None and disjoint_classes is None: + train_segs, val_segs = self._segments_split(val_prob, rng) + elif joint_classes is not None and disjoint_classes is None: + train_segs, val_segs = self._segments_split_joint_classes( + val_prob, joint_classes, min_train_samples, rng, + ) + elif joint_classes is None and disjoint_classes is not None: + train_segs, val_segs = 
self._segments_split_disjoint_classes( + val_prob, disjoint_classes, rng, + ) + else: + train_segs, val_segs = self._segments_split_joint_and_disjoint_classes( + val_prob, joint_classes, disjoint_classes, min_train_samples, rng, + ) + + train_ds = self.clone() + train_ds.set_segments(train_segs) + train_ds.clean() + + val_ds = self.clone() + val_ds.set_segments(val_segs) + val_ds.clean() + + return train_ds, val_ds diff --git a/hyperion/utils/fold_list.py b/hyperion/utils/fold_list.py index f22263cf..80b818d6 100644 --- a/hyperion/utils/fold_list.py +++ b/hyperion/utils/fold_list.py @@ -176,7 +176,7 @@ def create( FoldList object. """ if shuffle: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) if group_by_key is None: group_by_key = segment_key diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index 45eab05f..57f3faf2 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -8,6 +8,7 @@ from collections import OrderedDict from copy import deepcopy from pathlib import Path +from typing import Optional, Union, List import numpy as np import pandas as pd @@ -192,14 +193,41 @@ def cat(cls, tables): ].is_unique, """there are duplicated ids in the tables we are concatenating""" return cls(df) - def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): + def filter( + self, predicate=None, items=None, iindex=None, columns=None, by="id", keep=True + ): + """Filters the table and produce a new table with the elements to keep + + Args: + predicate: callable function that defines the filtering criterion e.g.: + lambda df: df["duration"] > 1.0. + items: filters the table based in column value with pandas command: + df.loc[items, by], used only if predicate is None + iindex: filters the table based on integer index with pandas command: + df.iloc[iiindex], used if predicate and items are None + columns: columns to keep of remove. + by: column id to use with itmes criterion + keep: if True, the criterion is used to keep rows, if False it is used + to remove rows + + Returns + InfoTable of the same class as the input. + """ assert ( - items is None or iindex is None - ), "items and iindex cannot be not None at the same time" + predicate is not None + or items is not None + or iindex is not None + or columns is not None + ), "predicate, items, iindex and columns cannot be not None at the same time" df = self.df + if predicate is not None: + mask = predicate(self.df) + if not keep: - if items is not None: + if predicate is not None: + mask = np.logical_not(mask) + elif items is not None: items = np.setdiff1d(df[by], items) elif iindex is not None: iindex = np.setdiff1d(np.arange(len(df)), iindex) @@ -207,7 +235,12 @@ def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): if columns is not None: columns = np.setdiff1d(df.columns, columns) - if items is not None: + if predicate is not None: + if columns is None: + df = df.loc[mask] + else: + df = df.loc[mask, columns] + elif items is not None: if by != "id": missing = [False if v in df[by] else True for v in items] if any(missing): @@ -225,7 +258,7 @@ def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): if columns is not None: df = df[columns] - return self.__class__(df) + return self.__class__(df.copy()) def __eq__(self, other): """Equal operator""" @@ -255,7 +288,7 @@ def shuffle(self, seed=1024, rng=None): Index used to shuffle the list. 
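InfoTable.filter() (and therefore SegmentSet, ClassInfo, FeatureSet and friends) now accepts a callable predicate evaluated on the underlying DataFrame; remove_short_segments(), clean() and the new Dataset.split_train_val() are built on top of it. A small sketch with made-up table contents; the import location is the module path from this patch:

import pandas as pd
from hyperion.utils.segment_set import SegmentSet

segs = SegmentSet(pd.DataFrame({
    "id": ["s1", "s2", "s3"],
    "duration": [0.4, 2.5, 7.1],
}))
long_segs = segs.filter(lambda df: df["duration"] >= 1.0)               # keep rows where the predicate holds
short_segs = segs.filter(lambda df: df["duration"] >= 1.0, keep=False)  # the complement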
""" if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) index = np.arange(len(self.df)) rng.shuffle(index) self.df = self.df.iloc[index] @@ -279,14 +312,33 @@ def get_loc(self, keys): loc = self.df.index.get_loc(keys) if isinstance(loc, int): return loc - elif isinstance(loc, np.ndarray) and loc.dtype == np.bool: + + if isinstance(loc, np.ndarray) and loc.dtype == np.bool: return np.nonzero(loc)[0] - else: - return list(range(loc.start, loc.stop, loc.step)) + + return list(range(loc.start, loc.stop, loc.step)) def get_col_idx(self, keys): return self.df.columns.get_loc(keys) + def add_columns( + self, + right_table, + column_names: Union[None, str, List[str], np.ndarray] = None, + on: Union[str, List[str], np.ndarray] = "id", + right_on: Union[None, str, List[str], np.ndarray] = None, + ): + if isinstance(right_table, InfoTable): + right_table = right_table.df + + if column_names is not None: + right_table = right_table[column_names] + + if right_on is None: + right_on = on + + self.df = self.df.merge(right_table, how="left", left_on=on, right_on=right_on) + # def __len__(self): # """Returns the number of elements in the list.""" diff --git a/hyperion/utils/math.py b/hyperion/utils/math_funcs.py similarity index 93% rename from hyperion/utils/math.py rename to hyperion/utils/math_funcs.py index 84596f7d..5ee510b9 100644 --- a/hyperion/utils/math.py +++ b/hyperion/utils/math_funcs.py @@ -346,10 +346,26 @@ def int2onehot(class_ids, num_classes=None): return p -def cosine_scoring(x1, x2): +def average_vectors(x, ids): + assert x.shape[0] == len(ids) + num_ids = np.max(ids) + 1 + x_avg = np.zeros((num_ids, x.shape[1]), dtype=x.dtype) + for i in range(num_ids): + mask = ids == i + x_avg[i] = np.mean(x[mask], axis=0) - l2_1 = np.sqrt(np.sum(x1 ** 2, axis=-1, keepdims=True)) - l2_2 = np.sqrt(np.sum(x2 ** 2, axis=-1, keepdims=True)) + return x_avg + + +def cosine_scoring(x1, x2, ids1=None, ids2=None): + if ids1 is not None: + x1 = average_vectors(x1, ids1) + + if ids2 is not None: + x2 = average_vectors(x2, ids2) + + l2_1 = np.sqrt(np.sum(x1 ** 2, axis=-1, keepdims=True) + 1e-10) + l2_2 = np.sqrt(np.sum(x2 ** 2, axis=-1, keepdims=True) + 1e-10) x1 = x1 / l2_1 x2 = x2 / l2_2 diff --git a/hyperion/utils/plotting.py b/hyperion/utils/plotting.py index 2341beb4..ec617975 100644 --- a/hyperion/utils/plotting.py +++ b/hyperion/utils/plotting.py @@ -4,6 +4,7 @@ """ import matplotlib + # matplotlib.use('Agg') import matplotlib.pyplot as plt import numpy as np @@ -11,7 +12,7 @@ import scipy.stats as stats from mpl_toolkits.mplot3d import Axes3D as plt3d -from .math import invert_pdmat +from .math_funcs import invert_pdmat def plot_gaussian_1D(mu, C, num_sigmas=3, num_pts=100, weight=1, **kwargs): diff --git a/hyperion/utils/scp_list.py b/hyperion/utils/scp_list.py index 5abf76f2..070e4f53 100644 --- a/hyperion/utils/scp_list.py +++ b/hyperion/utils/scp_list.py @@ -384,7 +384,7 @@ def shuffle(self, seed=1024, rng=None): Index used to shuffle the list. 
""" if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) index = np.arange(len(self.key)) rng.shuffle(index) diff --git a/hyperion/utils/segment_set.py b/hyperion/utils/segment_set.py index 6aef5bb2..a99b4e1e 100644 --- a/hyperion/utils/segment_set.py +++ b/hyperion/utils/segment_set.py @@ -13,42 +13,48 @@ class SegmentSet(InfoTable): def __init__(self, df): super().__init__(df) - if "start" in df and "recording_id" not in df: - df["recording_id"] = df["id"] + if "start" in df and "recordings" not in df: + df["recordings"] = df["id"] - if "start" not in df and "recording_id" in df: + if "start" not in df and "recordings" in df: df["start"] = 0.0 @property def has_time_marks(self): - return ( - "recording_id" in self.df and "start" in self.df and "duration" in self.df - ) + return "recordings" in self.df and "start" in self.df and "duration" in self.df @property def has_recording_ids(self): - return "recording_id" in self.df + return "recordings" in self.df - def recording_ids(self, ids=None): + @property + def has_recordings(self): + return "recordings" in self.df + + def recordings(self, ids=None): if ids is None: - if "recording_id" in self.df: - return self.df["recording_id"] + if "recordings" in self.df: + return self.df["recordings"] else: return self.df["id"] - if "recording_id" in self.df: - return self.df.loc[ids, "recording_id"] + if "recordings" in self.df: + return self.df.loc[ids, "recordings"] return ids - def recording_time_marks(self, ids): - if "recording" in self.df: - rec_col = "recording_id" - else: - rec_col = "id" + def recording_ids(self, ids=None): + return self.recordings(ids) + + def recording_time_marks(self, ids, recordings_name: str = "recordings"): + if recordings_name == "recordings": + if "recordings" in self.df: + recordings_name = "recordings" + else: + recordings_name = "id" assert "duration" in self.df if "start" not in self.df: self.df["start"] = 0.0 - return self.df.loc[ids, [rec_col, "start", "duration"]] + return self.df.loc[ids, [recordings_name, "start", "duration"]] diff --git a/hyperion/utils/sparse_trial_key.py b/hyperion/utils/sparse_trial_key.py index 1bc321a7..62fcd446 100644 --- a/hyperion/utils/sparse_trial_key.py +++ b/hyperion/utils/sparse_trial_key.py @@ -145,7 +145,7 @@ def load_table(cls, file_path, sep=None): file_path: File to read the list. Returns: - TrialKey object. + SparseTrialKey object. 
""" file_path = Path(file_path) ext = file_path.suffix @@ -156,19 +156,15 @@ def load_table(cls, file_path, sep=None): models = df["modelid"].values segments = df["segmentid"].values is_tar = (df["targettype"] == "target").values - model_set, _, model_idx = np.unique( - models, return_index=True, return_inverse=True - ) - seg_set, _, seg_idx = np.unique( - segments, return_index=True, return_inverse=True - ) + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) tar = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool") non = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool") - for item in zip(model_idx, seg_idx, is_tar): - if item[2]: - tar[item[0], item[1]] = True + for i, j, target_type in zip(model_idx, seg_idx, is_tar): + if target_type: + tar[i, j] = True else: - non[item[0], item[1]] = True + non[i, j] = True return cls(model_set, seg_set, tar.tocsr(), non.tocsr()) @classmethod diff --git a/hyperion/utils/sparse_trial_scores.py b/hyperion/utils/sparse_trial_scores.py index 7ed9a1d1..760bd1f1 100644 --- a/hyperion/utils/sparse_trial_scores.py +++ b/hyperion/utils/sparse_trial_scores.py @@ -3,12 +3,12 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - import copy import logging -import os.path as path +from pathlib import Path import numpy as np +import pandas as pd import scipy.sparse as sparse from ..hyp_defs import float_cpu @@ -18,9 +18,6 @@ from .trial_ndx import TrialNdx from .trial_scores import TrialScores -# import h5py - - class SparseTrialScores(TrialScores): @@ -55,6 +52,26 @@ def save_txt(self, file_path): % (self.model_set[r], self.seg_set[c], self.scores[r, c]) ) + def save_table(self, file_path, sep=None): + """Saves object to pandas tabnle file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + self.score_mask.eliminate_zeros() + score_mask = self.score_mask.tocoo() + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}LLR\n") + for i, j in zip(score_mask.row, score_mask.col): + f.write( + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{self.scores[i,j]}\n" + ) + @classmethod def load_h5(cls, file_path): raise NotImplementedError() @@ -90,6 +107,35 @@ def load_txt(cls, file_path): scores[item[0], item[1]] = item[2] return cls(model_set, seg_set, scores.tocsr(), score_mask.tocsr()) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from pandas table file + + Args: + file_path: File to read the list. + + Returns: + TrialScores object. 
+ """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + score_list = df["LLR"].values + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) + scores = sparse.lil_matrix((len(model_set), len(seg_set)), dtype=float_cpu()) + score_mask = sparse.lil_matrix(scores.shape, dtype="bool") + for i, j, score in zip(model_idx, seg_idx, score_list): + score_mask[i, j] = True + scores[i, j] = score + + return cls(model_set, seg_set, scores.tocsr(), score_mask.tocsr()) + @classmethod def merge(cls, scr_list): raise NotImplementedError() @@ -160,9 +206,9 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): if not (np.all(f_mod) and np.all(f_seg)): for i in (f_mod == 0).nonzero()[0]: - logging.info("model %s not found" % model_set[i]) + logging.info("model %s not found", model_set[i]) for i in (f_seg == 0).nonzero()[0]: - logging.info("segment %s not found" % seg_set[i]) + logging.info("segment %s not found", seg_set[i]) if raise_missing: raise Exception("some scores were not computed") @@ -172,18 +218,36 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): scores = self.scores.tocoo() new_data = scores.data new_row = scores.row.copy() + # for i, r in enumerate(mod_idx): + # if f_mod[i] and i != r: + # idx = scores.row == r + # new_row[idx] = i + + # new_col = scores.col.copy() + # for j, c in enumerate(seg_idx): + # if f_seg[j] and j != c: + # idx = scores.col == c + # new_col[idx] = j + + # idx = np.logical_and(new_row < num_mod, new_col < num_seg) + # if not np.all(idx): + # new_data = new_data[idx] + # new_row = new_row[idx] + # new_col = new_col[idx] + + new_row = -1 * np.ones_like(scores.row) for i, r in enumerate(mod_idx): - if f_mod[i] and i != r: + if f_mod[i]: idx = scores.row == r new_row[idx] = i - new_col = scores.col.copy() + new_col = -1 * np.ones_like(scores.col) for j, c in enumerate(seg_idx): - if f_seg[j] and j != c: + if f_seg[j]: idx = scores.col == c new_col[idx] = j - idx = np.logical_and(new_row < num_mod, new_col < num_seg) + idx = np.logical_and(new_row != -1, new_col != -1) if not np.all(idx): new_data = new_data[idx] new_row = new_row[idx] @@ -193,19 +257,37 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): score_mask = self.score_mask.tocoo() new_data = score_mask.data - new_row = score_mask.row.copy() + # new_row = score_mask.row.copy() + # for i, r in enumerate(mod_idx): + # if f_mod[i] and i != r: + # idx = score_mask.row == r + # new_row[idx] = i + + # new_col = score_mask.col.copy() + # for j, c in enumerate(seg_idx): + # if f_seg[j] and j != c: + # idx = score_mask.col == c + # new_col[idx] = j + + # idx = np.logical_and(new_row < num_mod, new_col < num_seg) + # if not np.all(idx): + # new_data = new_data[idx] + # new_row = new_row[idx] + # new_col = new_col[idx] + + new_row = -1 * np.ones_like(score_mask.row) for i, r in enumerate(mod_idx): - if f_mod[i] and i != r: + if f_mod[i]: idx = score_mask.row == r new_row[idx] = i - new_col = score_mask.col.copy() + new_col = -1 * np.ones_like(score_mask.col) for j, c in enumerate(seg_idx): - if f_seg[j] and j != c: + if f_seg[j]: idx = score_mask.col == c new_col[idx] = j - idx = np.logical_and(new_row < num_mod, new_col < num_seg) + idx = np.logical_and(new_row != -1, new_col != -1) if not np.all(idx): new_data = new_data[idx] 
new_row = new_row[idx] @@ -249,7 +331,7 @@ def align_with_ndx(self, ndx, raise_missing=True): if not scr.score_mask[r, c]: missing_scores = True logging.info( - "missing-scores for %s %s" % (scr.model_set[r], scr.seg_set[c]) + "missing-scores for %s %s", scr.model_set[r], scr.seg_set[c] ) if missing_scores and raise_missing: @@ -291,7 +373,7 @@ def set_valid_scores(self, scores, ndx=None): self.scores = scr.scores self.score_mat = scr.score_mat - self.scores[self.score_mask]=scores + self.scores[self.score_mask] = scores @classmethod def from_trial_scores(cls, scr): @@ -302,6 +384,12 @@ def from_trial_scores(cls, scr): score_mask.eliminate_zeros() return cls(scr.model_set, scr.seg_set, scores, score_mask) + def to_trial_scores(self): + scores = self.scores.toarray("C") + score_mask = self.score_mask.toarray("C") + # scores[~score_mask] = 0.0 + return TrialScores(self.model_set, self.seg_set, scores, score_mask) + def set_missing_to_value(self, ndx, val): """Aligns the scores with a TrialNdx and sets the trials with missing scores to the same value. diff --git a/hyperion/utils/train_val_eval_list.py b/hyperion/utils/train_val_eval_list.py index fd17e240..cbccf093 100644 --- a/hyperion/utils/train_val_eval_list.py +++ b/hyperion/utils/train_val_eval_list.py @@ -207,7 +207,7 @@ def create( part_names = ["train", "eval"] if shuffle: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) if group_by_key is None: group_by_key = segment_key diff --git a/hyperion/utils/trial_key.py b/hyperion/utils/trial_key.py index 4a99461b..5d8019b6 100644 --- a/hyperion/utils/trial_key.py +++ b/hyperion/utils/trial_key.py @@ -11,7 +11,8 @@ import numpy as np import pandas as pd -from .list_utils import * +# from .list_utils import * +from .list_utils import sort, intersect, ismember, split_list, list2ndarray from .trial_ndx import TrialNdx @@ -178,7 +179,8 @@ def load(cls, file_path, sep=None): Returns: TrialKey object. """ - _, file_ext = path.splitext(file_path) + file_path = Path(file_path) + file_ext = file_path.suffix if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) elif file_ext in ("", ".txt"): @@ -268,7 +270,7 @@ def load_txt(cls, file_path): @classmethod def load_table(cls, file_path, sep=None): - """Loads object from txt file + """Loads object from pandas table file Args: file_path: File to read the list. 
@@ -285,12 +287,8 @@ def load_table(cls, file_path, sep=None): models = df["modelid"].values segments = df["segmentid"].values is_tar = (df["targettype"] == "target").values - model_set, _, model_idx = np.unique( - models, return_index=True, return_inverse=True - ) - seg_set, _, seg_idx = np.unique( - segments, return_index=True, return_inverse=True - ) + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) tar = np.zeros((len(model_set), len(seg_set)), dtype="bool") non = np.zeros((len(model_set), len(seg_set)), dtype="bool") for i, j, target_type in zip(model_idx, seg_idx, is_tar): diff --git a/hyperion/utils/trial_ndx.py b/hyperion/utils/trial_ndx.py index e26d19e2..b7b873df 100644 --- a/hyperion/utils/trial_ndx.py +++ b/hyperion/utils/trial_ndx.py @@ -4,12 +4,14 @@ """ import copy -import os.path as path +from pathlib import Path import h5py import numpy as np +import pandas as pd -from .list_utils import * +# from .list_utils import * +from .list_utils import sort, intersect, ismember, split_list, list2ndarray class TrialNdx(object): @@ -46,17 +48,20 @@ def sort(self): self.seg_set, s_idx = sort(self.seg_set, return_index=True) self.trial_mask = self.trial_mask[np.ix_(m_idx, s_idx)] - def save(self, file_path): + def save(self, file_path, sep=None): """Saves object to txt/h5 file. Args: file_path: File to write the list. """ - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in [".h5", ".hdf5"]: self.save_h5(file_path) - else: + elif file_ext in [".txt", ""]: self.save_txt(file_path) + else: + self.save_table(file_path, sep=sep) def save_h5(self, file_path): """Saves object to h5 file. @@ -71,15 +76,6 @@ def save_h5(self, file_path): f.create_dataset("ID/column_ids", data=seg_set) f.create_dataset("trial_mask", data=self.trial_mask.astype("uint8")) - # model_set = self.model_set.astype('S') - # f.create_dataset('ID/row_ids', self.model_set.shape, dtype=model_set.dtype) - # f['ID/row_ids'] = model_set - # seg_set = self.seg_set.astype('S') - # f.create_dataset('ID/column_ids', self.seg_set.shape, dtype=seg_set.dtype) - # f['ID/column_ids'] = seg_set - # f.create_dataset('trial_mask', self.trial_mask.shape, dtype='uint8') - # f['trial_mask'] = self.trial_mask.astype('uint8') - def save_txt(self, file_path): """Saves object to txt file. @@ -91,8 +87,25 @@ def save_txt(self, file_path): for item in zip(idx[0], idx[1]): f.write("%s %s\n" % (self.model_set[item[1]], self.seg_set[item[0]])) + def save_table(self, file_path, sep=None): + """Saves object to pandas tabnle file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}\n") + I, J = self.trial_mask.nonzero() + for i, j in zip(I, J): + f.write(f"{self.model_set[i]}{sep}{self.seg_set[j]}\n") + @classmethod - def load(cls, file_path): + def load(cls, file_path, sep=None): """Loads object from txt/h5 file Args: @@ -101,11 +114,14 @@ def load(cls, file_path): Returns: TrialNdx object. 
""" - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) - else: + elif file_ext in ("", ".txt"): return cls.load_txt(file_path) + else: + return cls.load_table(file_path, sep) @classmethod def load_h5(cls, file_path): @@ -148,6 +164,36 @@ def load_txt(cls, file_path): trial_mask[item[0], item[1]] = True return cls(model_set, seg_set, trial_mask) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from pandas table file + + Args: + file_path: File to read the list. + + Returns: + TrialNdx object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + model_set, _, model_idx = np.unique( + models, return_index=True, return_inverse=True + ) + seg_set, _, seg_idx = np.unique( + segments, return_index=True, return_inverse=True + ) + trial_mask = np.zeros((len(model_set), len(seg_set)), dtype="bool") + for i, j in zip(model_idx, seg_idx): + trial_mask[i, j] = True + + return cls(model_set, seg_set, trial_mask) + @classmethod def merge(cls, ndx_list): """Merges several index objects. diff --git a/hyperion/utils/trial_scores.py b/hyperion/utils/trial_scores.py index a486647d..9e7fcd5d 100644 --- a/hyperion/utils/trial_scores.py +++ b/hyperion/utils/trial_scores.py @@ -3,16 +3,18 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - import copy import logging -import os.path as path +from pathlib import Path import h5py import numpy as np +import pandas as pd from ..hyp_defs import float_cpu -from .list_utils import * + +# from .list_utils import * +from .list_utils import sort, intersect, ismember, split_list, list2ndarray from .trial_key import TrialKey from .trial_ndx import TrialNdx @@ -56,17 +58,20 @@ def sort(self): self.scores = self.scores[ix] self.score_mask = self.score_mask[ix] - def save(self, file_path): + def save(self, file_path, sep=None): """Saves object to txt/h5 file. Args: file_path: File to write the list. """ - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in [".h5", ".hdf5"]: self.save_h5(file_path) - else: + elif file_ext in ["", ".txt"]: self.save_txt(file_path) + else: + self.save_table(file_path, sep=sep) def save_h5(self, file_path): """Saves object to h5 file. @@ -100,8 +105,27 @@ def save_txt(self, file_path): ) ) + def save_table(self, file_path, sep=None): + """Saves object to pandas tabnle file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}LLR\n") + I, J = self.score_mask.nonzero() + for i, j in zip(I, J): + f.write( + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{self.scores[i,j]}\n" + ) + @classmethod - def load(cls, file_path): + def load(cls, file_path, sep=None): """Loads object from txt/h5 file Args: @@ -110,11 +134,14 @@ def load(cls, file_path): Returns: TrialScores object. 
""" - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) - else: + elif file_ext in ("", ".txt"): return cls.load_txt(file_path) + else: + return cls.load_table(file_path, sep) @classmethod def load_h5(cls, file_path): @@ -163,6 +190,35 @@ def load_txt(cls, file_path): scores[item[0], item[1]] = item[2] return cls(model_set, seg_set, scores, score_mask) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from pandas table file + + Args: + file_path: File to read the list. + + Returns: + TrialScores object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + score_list = df["LLR"].values + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) + score_mask = np.zeros((len(model_set), len(seg_set)), dtype="bool") + scores = np.zeros((len(model_set), len(seg_set)), dtype=float_cpu()) + for i, j, score in zip(model_idx, seg_idx, score_list): + score_mask[i, j] = True + scores[i, j] = score + + return cls(model_set, seg_set, scores, score_mask) + @classmethod def merge(cls, scr_list): """Merges several score objects. @@ -235,7 +291,7 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): Filtered TrialScores object. """ - if not (keep): + if not keep: model_set = np.setdiff1d(self.model_set, model_set) seg_set = np.setdiff1d(self.model_set, seg_set) @@ -244,15 +300,15 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): if np.all(f_mod) and np.all(f_seg): model_set = self.model_set[mod_idx] - set_set = self.seg_set[seg_idx] + seg_set = self.seg_set[seg_idx] ix = np.ix_(mod_idx, seg_idx) scores = self.scores[ix] score_mask = self.score_mask[ix] else: for i in (f_mod == 0).nonzero()[0]: - logging.info("model %s not found" % model_set[i]) + logging.info("model %s not found", model_set[i]) for i in (f_seg == 0).nonzero()[0]: - logging.info("segment %s not found" % seg_set[i]) + logging.info("segment %s not found", seg_set[i]) if raise_missing: raise Exception("some scores were not computed") diff --git a/hyperion/utils/utt2info.py b/hyperion/utils/utt2info.py index edf2c23a..c1c429f2 100644 --- a/hyperion/utils/utt2info.py +++ b/hyperion/utils/utt2info.py @@ -261,7 +261,7 @@ def shuffle(self, seed=1024, rng=None): Index used to shuffle the list. 
""" if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) index = np.arange(len(self.key)) rng.shuffle(index) self.utt_info = self.utt_info.iloc[index] From 77bbad4c76bf147227cce74cef2c3a8b13e4cf83 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 7 Sep 2023 19:52:13 +0000 Subject: [PATCH 74/89] Add new parameters for feat_fusion_end --- ...c2xlsr300m_ecapatdnn512x3_stage1_v7.0.yaml | 71 +++++++++++++++++++ ...c2xlsr300m_ecapatdnn512x3_stage1_v7.1.yaml | 71 +++++++++++++++++++ .../v1/global_conf/config_lid_v7.0_13langs.sh | 42 +++++++++++ .../v1/global_conf/config_lid_v7.1_13langs.sh | 42 +++++++++++ .../wav2languageid/hf_wav2languageid.py | 29 ++++++-- .../hf_wav2vec2resnet1d_languageid.py | 3 +- 6 files changed, 253 insertions(+), 5 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.0.yaml create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.1.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v7.0_13langs.sh create mode 100644 egs/commonvoice/v1/global_conf/config_lid_v7.1_13langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.0.yaml new file mode 100644 index 00000000..061014e0 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 96 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 96 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_feat6.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + loss: weightedCE + loss_weight_exp: 1.0 # 0~1 + # focal_loss_gamma: 2.0 + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.1.yaml b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.1.yaml new file mode 100644 index 00000000..4bd1ad28 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.1.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 96 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + 
weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - language + target_sample_freq: 16000 + wav_scale: 1 + + sampler: + sampler_type: 'class_weighted_random_seg_chunk_sampler' + min_batch_size: 96 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + # weighted + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_ecapatdnn512x3_1layer_feat12.yaml +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 5000 + update_lr_on_opt_step: true + loss: weightedCE + loss_weight_exp: 1.0 # 0~1 + # focal_loss_gamma: 2.0 + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 1024 + # eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + diff --git a/egs/commonvoice/v1/global_conf/config_lid_v7.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v7.0_13langs.sh new file mode 100644 index 00000000..13ef37b4 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v7.0_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.0.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v7.0_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v7.0.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v7.0_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/commonvoice/v1/global_conf/config_lid_v7.1_13langs.sh b/egs/commonvoice/v1/global_conf/config_lid_v7.1_13langs.sh new file mode 100644 index 00000000..b00c7bb0 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_lid_v7.1_13langs.sh @@ -0,0 +1,42 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="sl_test_proc_audio ga-IE_test_proc_audio cv_test_proc_audio br_test_proc_audio 
tr_test_proc_audio cy_test_proc_audio tt_test_proc_audio ca_test_proc_audio kab_test_proc_audio de_test_proc_audio fr_test_proc_audio it_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs + + +bpe_model=data/13_langs_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v7.1.yaml +nnet_s1_args="" +nnet_name=${hf_model_name}_resnet1d_v7.1_13_langs +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/resnet1d_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v7.1.yaml +nnet_s2_args="" +nnet_s2_name=${hf_model_name}_resnet1d_v7.1_13_langs.s2 +nnet_s2_dir=exp/resnet1d_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/resnet1d_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/torch/models/wav2languageid/hf_wav2languageid.py b/hyperion/torch/models/wav2languageid/hf_wav2languageid.py index 22974afe..ff3a83a7 100644 --- a/hyperion/torch/models/wav2languageid/hf_wav2languageid.py +++ b/hyperion/torch/models/wav2languageid/hf_wav2languageid.py @@ -28,13 +28,16 @@ class HFWav2LanguageID(TorchModel): """ def __init__( - self, hf_feats, languageid, feat_fusion_start=0, feat_fusion_method="weighted-avg" + self, hf_feats, languageid, feat_fusion_start=0, feat_fusion_end=-1, feat_fusion_method="weighted-avg" ): super().__init__() self.hf_feats = hf_feats self.languageid = languageid self.feat_fusion_start = feat_fusion_start + if feat_fusion_end == -1: + feat_fusion_end = self.hf_feats.num_encoder_layers + self.feat_fusion_end = feat_fusion_end self.feat_fusion_method = feat_fusion_method self._hf_context = contextlib.nullcontext() self._make_fuser() @@ -44,7 +47,7 @@ def _make_fuser(self): self.feat_fuser = None return - num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + num_layers = self.feat_fusion_end + 1 - self.feat_fusion_start layer_dim = self.hf_feats.hidden_size if self.feat_fusion_method == "weighted-avg": self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) @@ -67,10 +70,11 @@ def _fuse_hid_feats(self, hid_feats): # There is only one layer of features return hid_feats[0] - hid_feats = hid_feats[self.feat_fusion_start :] + hid_feats = hid_feats[self.feat_fusion_start : self.feat_fusion_end + 1] if self.feat_fusion_method == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + # logging.info(torch.tensor(norm_weights.values).to(device)) feats = torch.sum(hid_feats * norm_weights, dim=-1) elif self.feat_fusion_method == "linear": hid_feats = torch.stack(hid_feats, dim=-1) @@ -140,6 +144,7 @@ def forward_feats( feat_lengths = hf_output["hidden_states_lengths"] if return_hid_states: hid_feats = hf_output["hidden_states"] + assert(len(hid_feats) == self.hf_feats.num_encoder_layers + 1) feats = self._fuse_hid_feats(hid_feats) else: hid_feats = None @@ -331,6 +336,7 @@ def filter_args(**kwargs): "hf_feats", "languageid", "feat_fusion_start", + "feat_fusion_end", "feat_fusion_method", ) args = dict((k, kwargs[k]) for k in valanguageid_args if k in kwargs) @@ -346,6 +352,7 @@ def get_config(self): "hf_feats": hf_cfg, 
"languageid": xvec_cfg, "feat_fusion_start": self.feat_fusion_start, + "feat_fusion_end": self.feat_fusion_end, "feat_fusion_method": self.feat_fusion_method, } @@ -370,9 +377,23 @@ def add_class_args(parser, prefix=None, skip=set()): type=int, help=( "the input to language identification model will fuse the wav2vec layers from feat_fusion_start to" - "the wav2vec num_layers" + "the feat_fusion_end" ), ) + + + parser.add_argument( + "--feat-fusion-end", + default=-1, + type=int, + help=( + "the input to language identification model will fuse the wav2vec layers from feat_fusion_start to" + "the feat_fusion_end" + ), + ) + + + parser.add_argument( "--feat-fusion-method", default="weighted-avg", diff --git a/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py b/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py index d357cd87..fb64f060 100644 --- a/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py +++ b/hyperion/torch/models/wav2languageid/hf_wav2vec2resnet1d_languageid.py @@ -33,6 +33,7 @@ def __init__( hf_feats: Union[Dict, HFWav2Vec2], languageid: Union[Dict, ResNet1dLanguageID], feat_fusion_start: int = 0, + feat_fusion_end: int = -1, feat_fusion_method: str = "weighted-avg", ): @@ -52,7 +53,7 @@ def __init__( assert isinstance(languageid, ResNet1dLanguageID) assert languageid.encoder_net.in_feats == hf_feats.hidden_size - super().__init__(hf_feats, languageid, feat_fusion_start, feat_fusion_method) + super().__init__(hf_feats, languageid, feat_fusion_start, feat_fusion_end, feat_fusion_method) @staticmethod def filter_args(**kwargs): From 89c6e2016b391818c35ab91644bbd091db4f9986 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 8 Sep 2023 11:24:03 -0400 Subject: [PATCH 75/89] finished vox v1.2 except plda --- egs/voxceleb/v1.2/run_007_eval_be.sh | 321 ++++++++++++++++++ .../eval_cosine_scoring_backend_with_qmf.py | 253 +++++++++++--- hyperion/bin/merge_scores.py | 19 +- hyperion/bin/train_qmf.py | 135 ++++++++ .../np/classifiers/logistic_regression.py | 3 +- hyperion/torch/utils/misc.py | 4 +- hyperion/utils/trial_scores.py | 138 +++++++- 7 files changed, 800 insertions(+), 73 deletions(-) create mode 100755 egs/voxceleb/v1.2/run_007_eval_be.sh create mode 100755 hyperion/bin/train_qmf.py diff --git a/egs/voxceleb/v1.2/run_007_eval_be.sh b/egs/voxceleb/v1.2/run_007_eval_be.sh new file mode 100755 index 00000000..9084d35b --- /dev/null +++ b/egs/voxceleb/v1.2/run_007_eval_be.sh @@ -0,0 +1,321 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +plda_label=${plda_type}y${plda_y_dim}_v1 +be_name=lda${lda_dim}_${plda_label}_${plda_data} + +xvector_dir=exp/xvectors/$nnet_name +be_dir=exp/be/$nnet_name/$be_name +score_dir=exp/scores/$nnet_name +score_plda_dir=$score_dir/${be_name}/plda +score_cosine_dir=$score_dir/cosine +score_cosine_snorm_dir=$score_dir/cosine_snorm +score_cosine_qmf_dir=$score_dir/cosine_qmf + +if [ $stage -le 3 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend.py \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_dir/voxceleb1_scores.csv \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + done + wait + merge_scores.py --output-file $score_cosine_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_dir/voxceleb1_results.csv + + cat $score_cosine_dir/voxceleb1_results.csv +fi + +if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring" + $train_cmd $score_cosine_dir/log/voxsrc22_dev.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend.py \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_dir/voxsrc22_dev_scores.csv + + # $train_cmd $score_cosine_dir/log/voxsrc22_eval.log \ + # hyp_utils/conda_env.sh \ + # eval_cosine_scoring_backend.py \ + # --feats-file csv:$xvector_dir/voxsrc22_eval/xvector.csv \ + # --ndx-file data/voxsrc22_eval/trials.csv \ + # --enroll-map-file data/voxsrc22_eval/enrollment.csv \ + # --score-file $score_cosine_dir/voxsrc22_eval_scores.csv + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxsrc22_dev.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_dir/voxsrc22_dev_results.csv + + cat $score_cosine_dir/voxsrc22_dev_results.csv + +fi + +if [ "$do_snorm" == "true" ];then + if [ $stage -le 5 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd --mem 22G $score_cosine_snorm_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend.py \ + --feats-file 
csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + merge_scores.py --output-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_snorm_dir/voxceleb1_results.csv + + cat $score_cosine_snorm_dir/voxceleb1_results.csv + fi + + if [ $stage -le 6 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring + AS-Norm" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_snorm_dir/log/voxsrc22_dev_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend.py \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + sleep 5s + done + sleep 10s + done + wait + merge_scores.py --output-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_snorm_dir/voxsrc22_dev_results.csv + + cat $score_cosine_snorm_dir/voxsrc22_dev_results.csv + + fi + +fi + +if [ "$do_qmf" == "true" ];then + if [ $stage -le 7 ];then + echo "Train QMF in Vox2" + echo "...Calculating quality measures for Vox2" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_qmf_dir/log/voxceleb2_trials_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend_with_qmf.py \ + --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --ndx-file data/voxceleb2cat_train_trials/trials.csv \ + --enroll-map-file data/voxceleb2cat_train_trials/enrollments.csv \ + --score-file $score_cosine_qmf_dir/voxceleb2_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + merge_scores.py --output-file 
$score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + train_qmf.py --score-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --key-file data/voxceleb2cat_train_trials/trials.csv \ + --model-file $score_cosine_qmf_dir/qmf.h5 + + fi + + if [ $stage -le 8 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm + QMF" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd --mem 22G $score_cosine_qmf_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend_with_qmf.py \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_qmf_dir/voxceleb1_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --qmf-file $score_cosine_qmf_dir/qmf.h5 \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + for suffix in "" .snorm .snorm.qmf + do + ( + merge_scores.py --output-file $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1$suffix.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_qmf_dir/voxceleb1_results$suffix.csv + + echo "$score_cosine_qmf_dir/voxceleb1_results$suffix.csv:" + cat $score_cosine_qmf_dir/voxceleb1_results$suffix.csv + ) & + done + wait + fi + + if [ $stage -le 9 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring + QMF" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_qmf_dir/log/voxsrc22_dev_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend_with_qmf.py \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_qmf_dir/voxsrc22_dev_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --qmf-file $score_cosine_qmf_dir/qmf.h5 \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + sleep 5s + done + sleep 10s + done + wait + for suffix in "" .snorm .snorm.qmf + do + ( + merge_scores.py --output-file $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxsrc22_dev$suffix.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv + + echo "$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv:" + cat 
$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv + ) & + done + wait + fi + +fi + diff --git a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py index f567dd81..0333669f 100755 --- a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py +++ b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py @@ -30,6 +30,7 @@ from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.transforms import TransformList from hyperion.np.score_norm import AdaptSNorm +from hyperion.np.classifiers import BinaryLogisticRegression as LR def get_precomp_qm_names(quality_measures): @@ -38,7 +39,6 @@ def get_precomp_qm_names(quality_measures): def normalize_duration(q, min_dur, max_dur, frame_rate): - q = q / frame_rate q = np.log(np.clip(q / frame_rate, a_min=min_dur, a_max=max_dur)) log_min_dur = np.log(min_dur) @@ -99,6 +99,9 @@ def load_trial_data( test_segments.add_columns(test_feats_set) if enroll_feats_set != test_feats_set or enroll_segments != test_segments: enroll_segments.add_columns(enroll_feats_set) + else: + test_segments = test_feats_set + enroll_segments = enroll_feats_set # now we retrive the quality measures q_e = [] @@ -132,7 +135,6 @@ def load_trial_data( def load_cohort_data(segments_file, feats_file): - segments = SegmentSet.load(segments_file) feats_reader = DRF.create(feats_file) x = feats_reader.read(segments["id"], squeeze=True) @@ -160,16 +162,13 @@ def get_score_filepath( test_part_idx, num_test_parts, ): - score_file = Path(score_file) new_suffix = "" if score_name is not None: new_suffix = f".{score_name}" if num_enroll_parts > 1 or num_test_parts > 1: - new_suffix = ( - f"{new_suffix}.{enroll_part_idx}.{test_part_idx}{score_file.suffix}" - ) + new_suffix = f"{new_suffix}.{enroll_part_idx}.{test_part_idx}" if new_suffix: new_suffix = f"{new_suffix}{score_file.suffix}" @@ -177,25 +176,58 @@ def get_score_filepath( return score_file -def save_scores(ndx, scores, score_file, score_name, enroll_part_idx, + +def save_scores( + ndx, + scores, + score_file, + score_name, + q_measures, + enroll_part_idx, num_enroll_parts, test_part_idx, - num_test_parts): + num_test_parts, +): + score_file = get_score_filepath( + score_file, + score_name, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + logging.info("saving scores with to %s", score_file) + scores = TrialScores( + ndx.model_set, ndx.seg_set, scores, ndx.trial_mask, q_measures=q_measures + ) + scores.save(score_file) + -def save_empty_scores(ndx, score_file, score_name, enroll_part_idx, +def save_empty_scores( + ndx, + score_file, + score_name, + q_measures, + enroll_part_idx, num_enroll_parts, test_part_idx, - num_test_parts): + num_test_parts, +): scores = np.zeros(ndx.trial_mask.shape, dtype="float32") - score_file = get_score_filepath(score_file, score_name,enroll_part_idx, - num_enroll_parts, - test_part_idx, - num_test_parts) - - scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) - scores.save(score_file) - + if q_measures is not None: + q_measures = {k: scores for k in q_measures} + save_scores( + ndx, + scores, + score_file, + score_name, + q_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) def segment_to_trial_qm(q_e, q_t): @@ -226,31 +258,29 @@ def align_scores_to_ndx(enroll_set, ndx, scores, scores_norm, q_trial): return scores, scores_norm, q_trial -def make_qm_table(ndx, scores, scores_norm, q_trial): - if scores_norm is None: - scores = scores[ndx.trial_mask] - 
else: - scores = scores_norm[ndx.trial_mask] - - for qm in q_trial: - q_trial[qm] = q_trial[qm][ndx.trial_mask] +# def make_qm_table(ndx, scores, scores_norm, q_trial): +# if scores_norm is None: +# scores = scores[ndx.trial_mask] +# else: +# scores = scores_norm[ndx.trial_mask] - I, J = np.nonzero(ndx.trial_mask) - modelid = ndx.model_set[I] - segmentid = ndx.seg_set[J] - unique_id = [f"{a}-{b}" for a, b in zip(modelid, segmentid)] - - q_dict = { - "id": unique_id, - "modelid": modelid, - "segmentid": segmentid, - "scores": scores, - } - q_dict.update(q_trial) - df = pd.DataFrame(q_dict) - return InfoTable(df) +# for qm in q_trial: +# q_trial[qm] = q_trial[qm][ndx.trial_mask] +# I, J = np.nonzero(ndx.trial_mask) +# modelid = ndx.model_set[I] +# segmentid = ndx.seg_set[J] +# unique_id = [f"{a}-{b}" for a, b in zip(modelid, segmentid)] +# q_dict = { +# "id": unique_id, +# "modelid": modelid, +# "segmentid": segmentid, +# "scores": scores, +# } +# q_dict.update(q_trial) +# df = pd.DataFrame(q_dict) +# return InfoTable(df) def eval_backend( @@ -276,7 +306,6 @@ def eval_backend( test_part_idx, num_test_parts, ): - logging.info("loading data") enroll_map, ndx, x_e, x_t, q_e, q_t = load_trial_data( enroll_map_file, @@ -297,8 +326,43 @@ def eval_backend( if not np.any(ndx.trial_mask): # this part doesn't have any trials, save empty files - - + if qmf_file is not None: + quality_measures = None + save_empty_scores( + ndx, + score_file, + "snorm.qmf" if cohort_segments_file is not None else "qmf", + quality_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + save_empty_scores( + ndx, + score_file, + None, + quality_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + if cohort_segments_file is not None: + save_empty_scores( + ndx, + score_file, + "snorm", + quality_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + return + enroll_set, enroll_ids = np.unique(enroll_map["id"], return_inverse=True) q_e = average_qm(q_e, enroll_set, enroll_ids) @@ -362,46 +426,123 @@ def eval_backend( enroll_set, ndx, scores, scores_norm, q_trial ) if qmf_file is None: - qm_table = make_qm_table(ndx, scores, scores_norm, q_trial) - qm_file = get_score_filepath( + save_scores( + ndx, + scores, score_file, - "qm", + None, + q_trial, enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts, ) - qm_table.save(qm_file) + + if scores_norm is not None: + save_scores( + ndx, + scores_norm, + score_file, + "snorm", + q_trial, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + # qm_table = make_qm_table(ndx, scores, scores_norm, q_trial) + # qm_file = get_score_filepath( + # score_file, + # "qm", + # enroll_part_idx, + # num_enroll_parts, + # test_part_idx, + # num_test_parts, + # ) + # qm_table.save(qm_file) return - score_file_nonorm = get_score_filepath( + save_scores( + ndx, + scores, score_file, None, + None, enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts, ) - logging.info("saving scores to %s", score_file_nonorm) - scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) - scores.save(score_file_nonorm) if scores_norm is not None: - score_file_snorm = get_score_filepath( + save_scores( + ndx, + scores_norm, score_file, "snorm", + None, enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts, ) - logging.info("saving scores with AS-Norm to %s", score_file_snorm) - scores.scores = scores_norm - scores.save(score_file_snorm) + 
logging.info("applying qmf") + if scores_norm is None: + score_name = "qmf" + scores_fus = [scores.ravel()] + else: + score_name = "snorm.qmf" + scores_fus = [scores_norm.ravel()] + + q_names = list(q_trial.keys()) + q_names.sort() + for q_name in q_names: + scores_fus.append(q_trial[q_name].ravel()) + + scores_fus = np.vstack(scores_fus).T + lr = LR.load(qmf_file) + scores_fus = lr.predict(scores_fus) + scores_fus = np.reshape(scores_fus, (ndx.num_models, ndx.num_tests)) + save_scores( + ndx, + scores_fus, + score_file, + score_name, + None, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) -if __name__ == "__main__": + # score_file_nonorm = get_score_filepath( + # score_file, + # None, + # enroll_part_idx, + # num_enroll_parts, + # test_part_idx, + # num_test_parts, + # ) + # logging.info("saving scores to %s", score_file_nonorm) + # scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) + # scores.save(score_file_nonorm) + + # if scores_norm is not None: + # score_file_snorm = get_score_filepath( + # score_file, + # "snorm", + # enroll_part_idx, + # num_enroll_parts, + # test_part_idx, + # num_test_parts, + # ) + # logging.info("saving scores with AS-Norm to %s", score_file_snorm) + # scores.scores = scores_norm + # scores.save(score_file_snorm) + +if __name__ == "__main__": parser = ArgumentParser( description="Eval cosine-scoring with optional AS-Norm and QMF" ) diff --git a/hyperion/bin/merge_scores.py b/hyperion/bin/merge_scores.py index 6a275f5c..cb8524b7 100755 --- a/hyperion/bin/merge_scores.py +++ b/hyperion/bin/merge_scores.py @@ -18,14 +18,19 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, base_idx): - output_file = Path(output_file) output_file.parent.mkdir(exist_ok=True, parents=True) ext = output_file.suffix if input_files is None: - input_file_base = output_file.with_suffix("") + if ext in [".h5", ".csv", ".tsv"]: + input_file_base = output_file + else: + input_file_base = output_file.parent / (output_file.name + ".txt") + ext = "" + + logging.info("merging %s* -> %s", input_file_base.with_suffix(""), output_file) input_files = [] for i in range(num_enroll_parts): idx_i = base_idx + i @@ -33,6 +38,8 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, bas idx_j = base_idx + j input_file_i = input_file_base.with_suffix(f".{idx_i}.{idx_j}{ext}") input_files.append(input_file_i) + else: + logging.info("merging %s -> %s", " + ".join(input_files), output_file) if ext == ".h5": # if files are h5 we need to load everything in RAM @@ -57,7 +64,6 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, bas if __name__ == "__main__": - parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") parser.add_argument("--cfg", action=ActionConfigFile) parser.add_argument( @@ -88,7 +94,12 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, bas help="""index of the first job, typically 0 or 1""", ) parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, ) args = parser.parse_args() diff --git a/hyperion/bin/train_qmf.py b/hyperion/bin/train_qmf.py new file mode 100755 index 00000000..a97e8a5f --- /dev/null +++ b/hyperion/bin/train_qmf.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) + + Trains calibration for SRE18 tel condition +""" + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.utils.trial_scores import TrialScores +from hyperion.utils.trial_key import TrialKey +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR + + +def print_q_stats(scr, q_names): + for k in q_names: + q_vec = scr.q_measures[k][scr.score_mask] + s = f"{k} stats mean={np.mean(q_vec)} min={np.min(q_vec)} max={np.max(q_vec)} median={np.median(q_vec)}" + logging.info(s) + + +def train_qmf( + score_file, key_file, model_file, prior, lambda_reg, quality_measures, verbose +): + logging.info("load key: %s", key_file) + key = TrialKey.load(key_file) + logging.info("load scores: %s", score_file) + scr = TrialScores.load(score_file) + tar, non = scr.get_tar_non(key) + ntar = len(tar) + nnon = len(non) + + if quality_measures is None: + quality_measures = list(scr.q_measures.keys()) + quality_measures.sort() + + print_q_stats(scr, quality_measures) + q_tar, q_non = scr.get_tar_non_q_measures(key, quality_measures) + + min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f", + min_dcf, + p_miss * 100, + p_fa * 100, + n_miss, + n_fa, + ) + + logging.info("train calibration") + # tar = np.vstack((tar, maxnf_tar, minnf_tar, maxcohmu_tar, mincohmu_tar)).T + # non = np.vstack((non, maxnf_non, minnf_non, maxcohmu_non, mincohmu_non)).T + tar = np.hstack((tar[:, None], q_tar)) + non = np.hstack((non[:, None], q_non)) + + x = np.vstack((tar, non)) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) + lr = LR( + prior=prior, + lambda_reg=lambda_reg, + bias_scaling=1, + solver="liblinear", + verbose=verbose, + ) + lr.fit(x, y) + logging.info(f"A={lr.A} b={lr.b}") + logging.info("save calibration at %s", model_file) + lr.save(model_file) + + logging.info("calibrate scores") + tar_cal = lr.predict(tar) + non_cal = lr.predict(non) + act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f", + act_dcf, + p_miss * 100, + p_fa * 100, + n_miss, + n_fa, + ) + + score_file = Path(score_file) + output_file = score_file.with_suffix(f".qmf{score_file.suffix}") + scr_out = TrialScores(key.model_set, key.seg_set) + scr_out.scores[key.tar] = tar_cal + scr_out.scores[key.non] = non_cal + scr_out.score_mask = np.logical_or(key.tar, key.non) + scr_out.save(output_file) + + +if __name__ == "__main__": + parser = ArgumentParser(description="Trains QMF calibration") + + parser.add_argument("--score-file", required=True) + parser.add_argument("--key-file", required=True) + parser.add_argument("--model-file", required=True) + parser.add_argument("--prior", type=float, default=0.01) + parser.add_argument("--lambda-reg", type=float, default=1e-5) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument( + "--quality-measures", + default=None, + nargs="+", + choices=["snorm-mu/s", "snorm-mu", "speech_duration", 
"num_speech_frames"], + ) + + args = parser.parse_args() + config_logger(args.verbose) + logging.debug(args) + + train_qmf(**namespace_to_dict(args)) diff --git a/hyperion/np/classifiers/logistic_regression.py b/hyperion/np/classifiers/logistic_regression.py index 4c4c0cfc..03d9fd13 100644 --- a/hyperion/np/classifiers/logistic_regression.py +++ b/hyperion/np/classifiers/logistic_regression.py @@ -93,7 +93,8 @@ def __init__( super().__init__(**kwargs) if random_state is None: - random_state = np.random.default_rng(seed=lr_seed) + # random_state = np.random.default_rng(seed=lr_seed) + random_state = np.random.RandomState(seed=lr_seed) if bias_scaling is None: if use_bias and solver == "liblinear": diff --git a/hyperion/torch/utils/misc.py b/hyperion/torch/utils/misc.py index b2a3810f..46c09080 100644 --- a/hyperion/torch/utils/misc.py +++ b/hyperion/torch/utils/misc.py @@ -4,8 +4,8 @@ """ import torch -import torch.cuda.amp as amp import torch.nn as nn +import torch.cuda.amp as amp def l2_norm(x, dim=1, axis=None): @@ -104,3 +104,5 @@ def get_selfsim_tarnon(y, return_mask=False): mask = torch.triu(torch.ones_like(y_bin, dtype=torch.bool), diagonal=1) return y_bin, mask + + diff --git a/hyperion/utils/trial_scores.py b/hyperion/utils/trial_scores.py index 9e7fcd5d..4a5e59da 100644 --- a/hyperion/utils/trial_scores.py +++ b/hyperion/utils/trial_scores.py @@ -14,7 +14,7 @@ from ..hyp_defs import float_cpu # from .list_utils import * -from .list_utils import sort, intersect, ismember, split_list, list2ndarray +from .list_utils import intersect, ismember, list2ndarray, sort, split_list from .trial_key import TrialKey from .trial_ndx import TrialNdx @@ -28,13 +28,22 @@ class TrialScores(object): seg_set: List of test segment names. scores: Matrix with the scores (num_models x num_segments). score_mask: Boolean matrix with the trials with valid scores to True (num_models x num_segments). + q_measures: optional dictionary of quality measure matrices """ - def __init__(self, model_set=None, seg_set=None, scores=None, score_mask=None): + def __init__( + self, + model_set=None, + seg_set=None, + scores=None, + score_mask=None, + q_measures=None, + ): self.model_set = model_set self.seg_set = seg_set self.scores = scores self.score_mask = score_mask + self.q_measures = q_measures if (model_set is not None) and (seg_set is not None): self.validate() @@ -57,6 +66,9 @@ def sort(self): ix = np.ix_(m_idx, s_idx) self.scores = self.scores[ix] self.score_mask = self.score_mask[ix] + if self.q_measures is not None: + for k in self.q_measures.keys(): + self.q_measures[k] = self.q_measures[k][ix] def save(self, file_path, sep=None): """Saves object to txt/h5 file. @@ -86,6 +98,10 @@ def save_h5(self, file_path): f.create_dataset("ID/column_ids", data=seg_set) f.create_dataset("scores", data=self.scores) f.create_dataset("score_mask", data=self.score_mask.astype("uint8")) + if self.q_measures is not None: + q_grp = f.create_group("q_measures") + for k, v in self.q_measures.items(): + q_grp.create_dataset(k, data=v) def save_txt(self, file_path): """Saves object to txt file. @@ -105,6 +121,9 @@ def save_txt(self, file_path): ) ) + if self.q_measures is not None: + logging.warning("q_measures cannot be saved to txt file") + def save_table(self, file_path, sep=None): """Saves object to pandas tabnle file. 
@@ -116,12 +135,20 @@ def save_table(self, file_path, sep=None): if sep is None: sep = "\t" if ".tsv" in ext else "," + q_str = "" + if self.q_measures is not None: + q_str = sep + sep.join(self.q_measures.keys()) + with open(file_path, "w", encoding="utf-8") as f: - f.write(f"modelid{sep}segmentid{sep}LLR\n") + f.write(f"modelid{sep}segmentid{sep}LLR{q_str}\n") I, J = self.score_mask.nonzero() for i, j in zip(I, J): + if self.q_measures is not None: + q_str = sep + sep.join( + [str(v[i, j]) for k, v in self.q_measures.items()] + ) f.write( - f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{self.scores[i,j]}\n" + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{self.scores[i,j]}{q_str}\n" ) @classmethod @@ -158,7 +185,12 @@ def load_h5(cls, file_path): seg_set = [t.decode("utf-8") for t in f["ID/column_ids"]] scores = np.asarray(f["scores"], dtype=float_cpu()) score_mask = np.asarray(f["score_mask"], dtype="bool") - return cls(model_set, seg_set, scores, score_mask) + if "q_measures" in f: + q_grp = f["q_measures"] + q_measures = {k: q_grp[k] for k in q_grp} + else: + q_measures = None + return cls(model_set, seg_set, scores, score_mask, q_measures) @classmethod def load_txt(cls, file_path): @@ -217,7 +249,21 @@ def load_table(cls, file_path, sep=None): score_mask[i, j] = True scores[i, j] = score - return cls(model_set, seg_set, scores, score_mask) + if len(df.columns) > 3: + q_names = df.columns[3:] + q_vals = df.iloc[:, 3:].values + q_measures = {} + for q_name in q_names: + q_measures[q_name] = np.zeros(scores.shape, dtype=float_cpu()) + + for i, j, q_row in zip(model_idx, seg_idx, q_vals): + for col, q_name in enumerate(q_names): + q_measures[q_name][i, j] = q_row[col] + + else: + q_measures = None + + return cls(model_set, seg_set, scores, score_mask, q_measures) @classmethod def merge(cls, scr_list): @@ -234,6 +280,7 @@ def merge(cls, scr_list): seg_set = scr_list[0].seg_set scores = scr_list[0].scores score_mask = scr_list[0].score_mask + q_measures = scr_list[0].q_measures for i in range(1, num_scr): scr_i = scr_list[i] new_model_set = np.union1d(model_set, scr_i.model_set) @@ -252,6 +299,10 @@ def merge(cls, scr_list): scores_1[ix_a] = scores[ix_b] score_mask_1 = np.zeros(shape, dtype="bool") score_mask_1[ix_a] = score_mask[ix_b] + if q_measures is not None: + q_measures_1 = {k: np.zeros(shape) for k in q_measures.keys()} + for k in q_measures.keys(): + q_measures_1[k][ix_a] = q_measures[k][ix_b] trial_mask_2 = np.zeros( (len(new_model_set), len(new_seg_set)), dtype="bool" @@ -268,14 +319,21 @@ def merge(cls, scr_list): scores_2[ix_a] = scr_i.scores[ix_b] score_mask_2 = np.zeros(shape, dtype="bool") score_mask_2[ix_a] = scr_i.score_mask[ix_b] + if q_measures is not None: + q_measures_2 = {k: np.zeros(shape) for k in q_measures.keys()} + for k in q_measures.keys(): + q_measures_2[k][ix_a] = scr_i.q_measures[k][ix_b] model_set = new_model_set seg_set = new_seg_set scores = scores_1 + scores_2 assert not (np.any(np.logical_and(score_mask_1, score_mask_2))) score_mask = np.logical_or(score_mask_1, score_mask_2) + if q_measures is not None: + for k in q_measures.keys(): + q_measures[k] = q_measures_1[k] + q_measures_2[k] - return cls(model_set, seg_set, scores, score_mask) + return cls(model_set, seg_set, scores, score_mask, q_measures) def filter(self, model_set, seg_set, keep=True, raise_missing=True): """Removes elements from TrialScores object. 
@@ -297,13 +355,17 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): f_mod, mod_idx = ismember(model_set, self.model_set) f_seg, seg_idx = ismember(seg_set, self.seg_set) - + q_measures = None if np.all(f_mod) and np.all(f_seg): model_set = self.model_set[mod_idx] seg_set = self.seg_set[seg_idx] ix = np.ix_(mod_idx, seg_idx) scores = self.scores[ix] score_mask = self.score_mask[ix] + if self.q_measures is not None: + q_measures = {} + for k in self.q_measures.keys(): + q_measures[k] = self.q_measures[k][ix] else: for i in (f_mod == 0).nonzero()[0]: logging.info("model %s not found", model_set[i]) @@ -318,8 +380,13 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): ix2 = np.ix_(mod_idx[f_mod], seg_idx[f_seg]) scores[ix1] = self.scores[ix2] score_mask[ix1] = self.score_mask[ix2] + if self.q_measures is not None: + q_measures = {} + for k in self.q_measures.keys(): + q_measures[k] = np.zeros(scores.shape, dtype=float_cpu()) + q_measures[k][ix1] = self.q_measures[k][ix2] - return TrialScores(model_set, seg_set, scores, score_mask) + return TrialScores(model_set, seg_set, scores, score_mask, q_measures) def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts): """Splits the TrialScores into num_model_parts x num_seg_parts and returns part @@ -340,7 +407,13 @@ def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts): ix = np.ix_(model_idx1, seg_idx1) scores = self.scores[ix] score_mask = self.score_mask[ix] - return TrialScores(model_set, seg_set, scores, score_mask) + q_measures = None + if self.q_measures is not None: + q_measures = {} + for k in self.q_measures.keys(): + q_measures[k] = self.q_measures[k][ix] + + return TrialScores(model_set, seg_set, scores, score_mask, q_measures) def validate(self): """Validates the attributes of the TrialScores object.""" @@ -362,6 +435,10 @@ def validate(self): else: assert self.score_mask.shape == (len(self.model_set), len(self.seg_set)) + if self.q_measures is not None: + for k in self.q_measures.keys(): + assert self.q_measures[k].shape == self.scores.shape + def align_with_ndx(self, ndx, raise_missing=True): """Aligns scores, model_set and seg_set with TrialNdx or TrialKey. @@ -412,6 +489,34 @@ def get_tar_non(self, key): non = scr.scores[non_mask] return tar, non + def get_tar_non_q_measures(self, key, q_names=None, return_dict=False): + """Returns target and non target scores. + + Args: + key: TrialKey object. + q_names: names of quality measures to return, if None it will return all + + Returns: + Numpy array with target scores. + Numpy array with non-target scores. + """ + scr = self.align_with_ndx(key) + tar_mask = np.logical_and(scr.score_mask, key.tar) + if q_names is None: + q_names = self.q_measures.keys() + tar = {} + for k in q_names: + tar[k] = self.q_measures[k][tar_mask] + non_mask = np.logical_and(scr.score_mask, key.non) + non = {} + for k in q_names: + non[k] = self.q_measures[k][non_mask] + + if not return_dict: + tar = np.vstack(tuple(tar[k] for k in q_names)).T + non = np.vstack(tuple(non[k] for k in q_names)).T + return tar, non + def set_missing_to_value(self, ndx, val): """Aligns the scores with a TrialNdx and sets the trials with missing scores to the same value. 
@@ -450,6 +555,18 @@ def __eq__(self, other): eq = eq and np.all(self.seg_set == other.seg_set) eq = eq and np.all(np.isclose(self.scores, other.scores, atol=1e-5)) eq = eq and np.all(self.score_mask == other.score_mask) + if self.q_measures is not None: + eq = eq and other.q_measures is not None + if eq: + eq = self.q_measures.keys() == other.q_measures.keys() + if eq: + for k in self.q_measures.keys(): + eq = eq and np.all( + np.isclose( + self.q_measures[k], other.q_measures[k], atol=1e-5 + ) + ) + return eq def __ne__(self, other): @@ -463,7 +580,6 @@ def __cmp__(self, other): return 1 def test(key_file="core-core_det5_key.h5"): - key = TrialKey.load(key_file) mask = np.logical_or(key.tar, key.non) From 44f085a86b8c6e9206431cdfbb4f26954dfb4672 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Sun, 10 Sep 2023 11:16:43 -0400 Subject: [PATCH 76/89] introduce entry points --- README.md | 4 +- egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml | 34 +++ egs/voxceleb/v1.2/run_001_prepare_data.sh | 26 +- egs/voxceleb/v1.2/run_002_compute_evad.sh | 16 +- .../v1.2/run_003_prepare_noises_rirs.sh | 102 +++---- .../v1.2/run_004_prepare_xvec_train_data.sh | 46 +-- egs/voxceleb/v1.2/run_005_train_xvector.sh | 4 +- egs/voxceleb/v1.2/run_006_extract_xvectors.sh | 16 +- egs/voxceleb/v1.2/run_007_eval_be.sh | 80 ++--- hyperion/bin/__init__.py | 0 hyperion/bin/adv_finetune_xvector_from_wav.py | 18 +- hyperion/bin/apply_mvn_select_frames.py | 36 ++- hyperion/bin/audio_to_duration.py | 17 +- hyperion/bin/compute_energy_vad.py | 17 +- hyperion/bin/compute_mfcc_feats.py | 21 +- hyperion/bin/copy_feats.py | 7 +- hyperion/bin/decode_wav2transducer.py | 20 +- hyperion/bin/decode_wav2vec2rnn_transducer.py | 92 +++--- hyperion/bin/eval_cosine_scoring_backend.py | 27 +- .../eval_cosine_scoring_backend_with_qmf.py | 38 +-- hyperion/bin/eval_verification_metrics.py | 25 +- ...l_xvec_cosine_scoring_from_adv_test_wav.py | 23 +- ...osine_scoring_from_adv_test_wav_wavegan.py | 26 +- ...l_xvec_cosine_scoring_from_art_test_wav.py | 27 +- .../eval_xvec_cosine_scoring_from_test_wav.py | 22 +- ...sine_scoring_from_transfer_adv_test_wav.py | 20 +- ...sine_scoring_from_transfer_art_test_wav.py | 27 +- hyperion/bin/eval_xvec_logits_from_wav.py | 28 +- hyperion/bin/extract_wav2vec2xvectors.py | 28 +- hyperion/bin/extract_wav2xvectors.py | 23 +- hyperion/bin/extract_xvectors_from_feats.py | 22 +- hyperion/bin/extract_xvectors_from_wav.py | 23 +- .../extract_xvectors_slidwin_from_feats.py | 34 ++- .../bin/extract_xvectors_slidwin_from_wav.py | 36 ++- hyperion/bin/finetune_wav2vec2transducer.py | 52 ++-- hyperion/bin/finetune_wav2vec2xvector.py | 26 +- hyperion/bin/finetune_wav2xvector.py | 22 +- .../bin/finetune_xvector_dfr_from_feats.py | 17 +- hyperion/bin/finetune_xvector_dfr_from_wav.py | 20 +- hyperion/bin/finetune_xvector_from_feats.py | 16 +- hyperion/bin/finetune_xvector_from_wav.py | 18 +- .../generate_adv_attacks_xvector_classif.py | 31 +- .../bin/generate_adv_attacks_xvector_verif.py | 18 +- hyperion/bin/hyperion_dataset.py | 62 ++-- hyperion/bin/hyperion_tables.py | 21 +- hyperion/bin/make_babble_noise_audio_files.py | 20 +- hyperion/bin/make_wav2xvector.py | 21 +- hyperion/bin/merge_scores.py | 12 +- hyperion/bin/pack_wav_rirs.py | 15 +- hyperion/bin/plot_embedding_tsne.py | 17 +- hyperion/bin/plot_embedding_tsne_per_class.py | 23 +- hyperion/bin/prepare_data.py | 11 +- hyperion/bin/preprocess_audio_files.py | 20 +- .../split_dataset_into_trials_and_cohort.py | 11 +- hyperion/bin/train_qmf.py | 28 +- 
hyperion/bin/train_wav2rnn_transducer.py | 77 ++--- hyperion/bin/train_wav2vec2rnn_transducer.py | 90 +++--- hyperion/bin/train_wav2vec2transducer.py | 79 ++--- hyperion/bin/train_wav2vec2xvector.py | 28 +- hyperion/bin/train_wav2xvector.py | 28 +- hyperion/bin/train_xvector_from_feats.py | 18 +- hyperion/bin/train_xvector_from_wav.py | 18 +- hyperion/io/__init__.py | 8 +- .../np/pdfs/mixtures/exp_family_mixture.py | 165 ---------- .../torch/lr_schedulers/red_lr_on_plateau.py | 6 +- hyperion/utils/queues.py | 287 ------------------ setup.py | 33 +- 67 files changed, 1110 insertions(+), 1193 deletions(-) create mode 100644 egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml create mode 100644 hyperion/bin/__init__.py delete mode 100644 hyperion/utils/queues.py diff --git a/README.md b/README.md index 7132a031..4838157b 100644 --- a/README.md +++ b/README.md @@ -28,11 +28,11 @@ The full API is described in the documentation page [https://hyperion-ml.readthe We use anaconda or miniconda, though you should be able to make it work in other python distributions To start, you should create a new enviroment and install PyTorch>=1.9, (older versions are not supported any longer) e.g.: ``` -conda create --name ${your_env} python=3.8 +conda create --name ${your_env} python=3.11 conda activate ${your_env} conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=10.2 -c pytorch +conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia ``` -In next Hyperion versions, we will upgrade to Pytorch>=1.9 and drop compatibility with older PyTorch versions. ### Installing Hyperion diff --git a/egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml b/egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..86f55073 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml @@ -0,0 +1,34 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: csv:data/rirs_smallroom/rirs.csv + rir_norm: max + mediumroom: + weight: 1 + rir_path: csv:data/rirs_mediumroom/rirs.csv + rir_norm: max + realroom: + weight: 1 + rir_path: csv:data/rirs_real/rirs.csv + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/recordings.csv + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/recordings.csv + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/recordings.csv + min_snr: 3 + max_snr: 18 diff --git a/egs/voxceleb/v1.2/run_001_prepare_data.sh b/egs/voxceleb/v1.2/run_001_prepare_data.sh index aef70e96..563d3c2d 100755 --- a/egs/voxceleb/v1.2/run_001_prepare_data.sh +++ b/egs/voxceleb/v1.2/run_001_prepare_data.sh @@ -16,31 +16,31 @@ config_file=default_config.sh if [ $stage -le 1 ];then # Prepare the VoxCeleb2 dataset for training. 
- prepare_data.py voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ - --cat-videos --use-kaldi-ids \ - --output-dir data/voxceleb2cat_train + hyperion-prepare-data voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ + --cat-videos --use-kaldi-ids \ + --output-dir data/voxceleb2cat_train fi if [ $stage -le 2 ];then # prepare voxceleb1 for test - prepare_data.py voxceleb1 --task test --corpus-dir $voxceleb1_root \ - --use-kaldi-ids \ - --output-dir data/voxceleb1_test + hyperion-prepare-data voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test fi if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then - prepare_data.py voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ - --vox1-corpus-dir $voxceleb1_root \ - --output-dir data/voxsrc22_dev + hyperion-prepare-data voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_dev fi # if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then -# prepare_data.py voxsrc22 --subset test --corpus-dir $voxsrc22_root \ -# --vox1-corpus-dir $voxceleb1_root \ -# --output-dir data/voxsrc22_test + # hyperion-prepare-data voxsrc22 --subset test --corpus-dir $voxsrc22_root \ + # --vox1-corpus-dir $voxceleb1_root \ + # --output-dir data/voxsrc22_test # fi if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then # split vox2 into 2 parts, for cohort and qmf training - split_dataset_into_trials_and_cohort.py --data-dir data/voxceleb2cat_train + hyperion-split-dataset-into-trials-and-cohort --data-dir data/voxceleb2cat_train fi diff --git a/egs/voxceleb/v1.2/run_002_compute_evad.sh b/egs/voxceleb/v1.2/run_002_compute_evad.sh index e7593df2..acccace3 100755 --- a/egs/voxceleb/v1.2/run_002_compute_evad.sh +++ b/egs/voxceleb/v1.2/run_002_compute_evad.sh @@ -48,18 +48,18 @@ if [ $stage -le 2 ];then echo "compute vad for $name" $train_cmd JOB=1:$nj $vad_dir/$name/log/vad.JOB.log \ hyp_utils/conda_env.sh \ - compute_energy_vad.py --cfg $vad_config \ + hyperion-compute-energy-vad --cfg $vad_config \ --recordings-file data/$name/recordings.csv \ --output-spec ark,csv:$vad_dir/$name/vad.JOB.ark,$vad_dir/$name/vad.JOB.csv \ --part-idx JOB --num-parts $nj || exit 1 - hyperion_tables.py cat \ - --table-type features \ - --output-file $vad_dir/$name/vad.csv --num-tables $nj - hyperion_dataset.py add_features \ - --dataset data/$name \ - --features-name vad \ - --features-file $vad_dir/$name/vad.csv + hyperion-tables cat \ + --table-type features \ + --output-file $vad_dir/$name/vad.csv --num-tables $nj + hyperion-dataset add_features \ + --dataset data/$name \ + --features-name vad \ + --features-file $vad_dir/$name/vad.csv done fi diff --git a/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh index aed1dae4..73c7ed82 100755 --- a/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh +++ b/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh @@ -18,10 +18,10 @@ config_file=default_config.sh if [ $stage -le 1 ]; then for name in noise music speech do - prepare_data.py musan \ - --corpus-dir $musan_root \ - --subset $name \ - --output-dir data/musan_$name + hyperion-prepare-data musan \ + --corpus-dir $musan_root \ + --subset $name \ + --output-dir data/musan_$name done fi @@ -37,66 +37,66 @@ if [ $stage -le 2 ]; then output_dir=exp/proc_audio/$name $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \ hyp_utils/conda_env.sh \ - preprocess_audio_files.py \ + hyperion-preprocess-audio-files \ --audio-format 
flac \ --part-idx JOB --num-parts $nj \ --recordings-file $input_data_dir/recordings.csv \ --output-path $output_dir \ --output-recordings-file $output_dir/recordings.JOB.csv - - hyperion_tables.py cat \ - --table-type recordings \ - --output-file $output_dir/recordings.csv --num-tables $nj - hyperion_dataset.py set_recordings \ - --dataset $input_data_dir \ - --recordings-file $output_dir/recordings.csv \ - --output-dataset $output_data_dir - + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + hyperion-dataset set_recordings \ + --dataset $input_data_dir \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset $output_data_dir + done fi if [ $stage -le 3 ]; then - # Create Babble noise from MUSAN speech files - for name in musan_speech - do - input_data_dir=data/$name - output_data_dir=data/${name}_babble - output_dir=exp/proc_audio/${name}_babble - $train_cmd $output_dir/log/make_babble_noise_${name}.log \ - hyp_utils/conda_env.sh \ - make_babble_noise_audio_files.py \ - --audio-format flac \ - --min-spks 3 --max-spks 10 --num-reuses 5 \ - --recordings-file $input_data_dir/recordings.csv \ - --output-path $output_dir \ - --output-recordings-file $output_data_dir/recordings.csv - hyperion_dataset.py make_from_recordings \ - --dataset $output_data_dir \ - --recordings-file $output_data_dir/recordings.csv - done + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + input_data_dir=data/$name + output_data_dir=data/${name}_babble + output_dir=exp/proc_audio/${name}_babble + $train_cmd $output_dir/log/make_babble_noise_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-make-babble-noise-audio-files \ + --audio-format flac \ + --min-spks 3 --max-spks 10 --num-reuses 5 \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_data_dir/recordings.csv + hyperion-dataset make_from_recordings \ + --dataset $output_data_dir \ + --recordings-file $output_data_dir/recordings.csv + done fi if [ $stage -le 4 ]; then - if [ ! -d "RIRS_NOISES" ]; then - # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises - wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip - unzip rirs_noises.zip - fi - prepare_data.py rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom - prepare_data.py rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom - prepare_data.py rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real - for rirs in rirs_smallroom rirs_mediumroom rirs_real - do - output_dir=exp/rirs/$rirs - data_dir=data/$rirs - $train_cmd $output_dir/log/pack_rirs_${name}.log \ - hyp_utils/conda_env.sh \ - pack_wav_rirs.py ${args} --input $data_dir/recordings.csv \ - --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1; - hyperion_dataset.py add_features --dataset $data_dir \ - --features-name rirs --features-file $output_dir/rirs.csv + if [ ! 
-d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + output_dir=exp/rirs/$rirs + data_dir=data/$rirs + $train_cmd $output_dir/log/pack_rirs_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-pack-wav-rirs ${args} --input $data_dir/recordings.csv \ + --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1; + hyperion-dataset add_features --dataset $data_dir \ + --features-name rirs --features-file $output_dir/rirs.csv - done + done fi diff --git a/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh b/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh index 7649ff22..4e0c5b19 100755 --- a/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh +++ b/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh @@ -35,42 +35,42 @@ if [ $stage -le 2 ];then $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${nnet_data}.JOB.log \ hyp_utils/conda_env.sh \ - preprocess_audio_files.py \ + hyperion-preprocess-audio-files \ --audio-format flac --remove-dc-offset $vad_args \ --part-idx JOB --num-parts $nj \ --recordings-file data/$nnet_data/recordings.csv \ --output-path $output_dir \ --output-recordings-file $output_dir/recordings.JOB.csv - hyperion_tables.py cat \ - --table-type recordings \ - --output-file $output_dir/recordings.csv --num-tables $nj + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj - hyperion_dataset.py set_recordings $update_durs \ - --dataset data/$nnet_data \ - --recordings-file $output_dir/recordings.csv \ - --output-dataset data/${nnet_data}_proc_audio \ - --remove-features vad + hyperion-dataset set_recordings $update_durs \ + --dataset data/$nnet_data \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset data/${nnet_data}_proc_audio \ + --remove-features vad fi if [ $stage -le 3 ];then - hyperion_dataset.py remove_short_segments \ - --dataset data/${nnet_data}_proc_audio \ - --output-dataset data/${nnet_data}_filtered \ - --length-name duration --min-length 2.0 + hyperion-dataset remove_short_segments \ + --dataset data/${nnet_data}_proc_audio \ + --output-dataset data/${nnet_data}_filtered \ + --length-name duration --min-length 2.0 - hyperion_dataset.py remove_classes_few_segments \ - --dataset data/${nnet_data}_filtered \ - --class-name speaker --min-segs 4 + hyperion-dataset remove_classes_few_segments \ + --dataset data/${nnet_data}_filtered \ + --class-name speaker --min-segs 4 fi if [ $stage -le 4 ];then - hyperion_dataset.py split_train_val \ - --dataset data/${nnet_data}_filtered \ - --val-prob 0.03 \ - --joint-classes speaker --min-train-samples 1 \ - --seed 1123581321 \ - --train-dataset data/${nnet_data}_xvector_train \ - --val-dataset data/${nnet_data}_xvector_val + hyperion-dataset split_train_val \ + --dataset data/${nnet_data}_filtered \ + --val-prob 0.03 \ + --joint-classes speaker --min-train-samples 1 \ + --seed 1123581321 \ + --train-dataset data/${nnet_data}_xvector_train \ + --val-dataset 
data/${nnet_data}_xvector_val fi diff --git a/egs/voxceleb/v1.2/run_005_train_xvector.sh b/egs/voxceleb/v1.2/run_005_train_xvector.sh index d2f31ea1..2479d565 100755 --- a/egs/voxceleb/v1.2/run_005_train_xvector.sh +++ b/egs/voxceleb/v1.2/run_005_train_xvector.sh @@ -44,7 +44,7 @@ if [ $stage -le 1 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - train_wav2xvector.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + hyperion-train-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ --data.train.dataset.segments-file $train_data_dir/segments.csv \ --data.train.dataset.class-files $train_data_dir/speaker.csv \ @@ -65,7 +65,7 @@ if [ $stage -le 2 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - finetune_wav2xvector.py $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + hyperion-finetune-wav2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ --data.train.dataset.segments-file $train_data_dir/segments.csv \ --data.train.dataset.class-files $train_data_dir/speaker.csv \ diff --git a/egs/voxceleb/v1.2/run_006_extract_xvectors.sh b/egs/voxceleb/v1.2/run_006_extract_xvectors.sh index 09b8c8e9..0dc58048 100755 --- a/egs/voxceleb/v1.2/run_006_extract_xvectors.sh +++ b/egs/voxceleb/v1.2/run_006_extract_xvectors.sh @@ -58,15 +58,15 @@ if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qm echo "Extracting x-vectors for $name" $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - extract_wav2xvectors.py ${xvec_args} ${vad_args} \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ --part-idx JOB --num-parts $nj \ --recordings-file data/$name/recordings.csv \ --random-utt-length --min-utt-length 2 --max-utt-length 30 \ --model-path $nnet \ --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv - hyperion_tables.py cat \ - --table-type features \ - --output-file $output_dir/xvector.csv --num-tables $nj + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj done fi @@ -88,14 +88,14 @@ if [ $stage -le 2 ]; then echo "Extracting x-vectors for $name" $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - extract_wav2xvectors.py ${xvec_args} ${vad_args} \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ --part-idx JOB --num-parts $nj \ --recordings-file data/$name/recordings.csv \ --model-path $nnet \ --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv - hyperion_tables.py cat \ - --table-type features \ - --output-file $output_dir/xvector.csv --num-tables $nj + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj done fi diff --git a/egs/voxceleb/v1.2/run_007_eval_be.sh b/egs/voxceleb/v1.2/run_007_eval_be.sh index 9084d35b..53621488 100755 --- a/egs/voxceleb/v1.2/run_007_eval_be.sh +++ b/egs/voxceleb/v1.2/run_007_eval_be.sh @@ -56,7 +56,7 @@ if [ $stage -le 3 ];then do $train_cmd $score_cosine_dir/log/voxceleb1_${i}_${j}.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend.py \ + hyperion-eval-cosine-scoring-backend \ --feats-file 
csv:$xvector_dir/voxceleb1_test/xvector.csv \ --ndx-file data/voxceleb1_test/trials.csv \ --enroll-map-file data/voxceleb1_test/enrollment.csv \ @@ -66,11 +66,11 @@ if [ $stage -le 3 ];then done done wait - merge_scores.py --output-file $score_cosine_dir/voxceleb1_scores.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ - eval_verification_metrics.py \ + hyperion-eval-verification-metrics \ --score-files $score_cosine_dir/voxceleb1_scores.csv \ --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ --score-names voxceleb1 \ @@ -85,22 +85,22 @@ if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then echo "Eval voxsrc2 with Cosine scoring" $train_cmd $score_cosine_dir/log/voxsrc22_dev.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend.py \ + hyperion-eval-cosine-scoring-backend \ --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ --ndx-file data/voxsrc22_dev/trials.csv \ --enroll-map-file data/voxsrc22_dev/enrollment.csv \ --score-file $score_cosine_dir/voxsrc22_dev_scores.csv # $train_cmd $score_cosine_dir/log/voxsrc22_eval.log \ - # hyp_utils/conda_env.sh \ - # eval_cosine_scoring_backend.py \ - # --feats-file csv:$xvector_dir/voxsrc22_eval/xvector.csv \ - # --ndx-file data/voxsrc22_eval/trials.csv \ - # --enroll-map-file data/voxsrc22_eval/enrollment.csv \ - # --score-file $score_cosine_dir/voxsrc22_eval_scores.csv + # hyp_utils/conda_env.sh \ + # hyperion-eval-cosine-scoring-backend \ + # --feats-file csv:$xvector_dir/voxsrc22_eval/xvector.csv \ + # --ndx-file data/voxsrc22_eval/trials.csv \ + # --enroll-map-file data/voxsrc22_eval/enrollment.csv \ + # --score-file $score_cosine_dir/voxsrc22_eval_scores.csv $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxsrc22_dev.log \ - eval_verification_metrics.py \ + hyperion-eval-verification-metrics \ --score-files $score_cosine_dir/voxsrc22_dev_scores.csv \ --key-files data/voxsrc22_dev/trials.csv \ --score-names voxsrc22_dev \ @@ -121,7 +121,7 @@ if [ "$do_snorm" == "true" ];then do $train_cmd --mem 22G $score_cosine_snorm_dir/log/voxceleb1_${i}_${j}.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend.py \ + hyperion-eval-cosine-scoring-backend \ --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ --ndx-file data/voxceleb1_test/trials.csv \ --enroll-map-file data/voxceleb1_test/enrollment.csv \ @@ -135,11 +135,11 @@ if [ "$do_snorm" == "true" ];then sleep 5s done wait - merge_scores.py --output-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ - eval_verification_metrics.py \ + hyperion-eval-verification-metrics \ --score-files $score_cosine_snorm_dir/voxceleb1_scores.csv \ --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ --score-names voxceleb1 \ @@ -159,7 +159,7 @@ if [ "$do_snorm" == "true" ];then do $train_cmd $score_cosine_snorm_dir/log/voxsrc22_dev_${i}_${j}.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend.py \ + hyperion-eval-cosine-scoring-backend \ --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ --ndx-file data/voxsrc22_dev/trials.csv \ 
--enroll-map-file data/voxsrc22_dev/enrollment.csv \ @@ -174,16 +174,16 @@ if [ "$do_snorm" == "true" ];then sleep 10s done wait - merge_scores.py --output-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ - eval_verification_metrics.py \ - --score-files $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ - --key-files data/voxsrc22_dev/trials.csv \ - --score-names voxsrc22_dev \ - --key-names all \ - --output-file $score_cosine_snorm_dir/voxsrc22_dev_results.csv + hyperion-eval-verification-metrics \ + --score-files $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_snorm_dir/voxsrc22_dev_results.csv cat $score_cosine_snorm_dir/voxsrc22_dev_results.csv @@ -202,7 +202,7 @@ if [ "$do_qmf" == "true" ];then do $train_cmd $score_cosine_qmf_dir/log/voxceleb2_trials_${i}_${j}.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend_with_qmf.py \ + hyperion-eval-cosine-scoring-backend-with-qmf \ --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ --ndx-file data/voxceleb2cat_train_trials/trials.csv \ --enroll-map-file data/voxceleb2cat_train_trials/enrollments.csv \ @@ -216,13 +216,13 @@ if [ "$do_qmf" == "true" ];then sleep 5s done wait - merge_scores.py --output-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts - train_qmf.py --score-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ - --key-file data/voxceleb2cat_train_trials/trials.csv \ - --model-file $score_cosine_qmf_dir/qmf.h5 - + hyperion-train-qmf --score-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --key-file data/voxceleb2cat_train_trials/trials.csv \ + --model-file $score_cosine_qmf_dir/qmf.h5 + fi if [ $stage -le 8 ];then @@ -234,7 +234,7 @@ if [ "$do_qmf" == "true" ];then do $train_cmd --mem 22G $score_cosine_qmf_dir/log/voxceleb1_${i}_${j}.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend_with_qmf.py \ + hyperion-eval-cosine-scoring-backend-with-qmf \ --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ --ndx-file data/voxceleb1_test/trials.csv \ --enroll-map-file data/voxceleb1_test/enrollment.csv \ @@ -252,11 +252,11 @@ if [ "$do_qmf" == "true" ];then for suffix in "" .snorm .snorm.qmf do ( - merge_scores.py --output-file $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1$suffix.log \ - eval_verification_metrics.py \ + hyperion-eval-verification-metrics \ --score-files $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ --score-names voxceleb1 \ @@ -280,7 +280,7 @@ if [ "$do_qmf" == "true" ];then do $train_cmd $score_cosine_qmf_dir/log/voxsrc22_dev_${i}_${j}.log \ hyp_utils/conda_env.sh \ - 
eval_cosine_scoring_backend_with_qmf.py \ + hyperion-eval-cosine-scoring-backend-with-qmf \ --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ --ndx-file data/voxsrc22_dev/trials.csv \ --enroll-map-file data/voxsrc22_dev/enrollment.csv \ @@ -299,11 +299,11 @@ if [ "$do_qmf" == "true" ];then for suffix in "" .snorm .snorm.qmf do ( - merge_scores.py --output-file $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxsrc22_dev$suffix.log \ - eval_verification_metrics.py \ + hyperion-eval-verification-metrics \ --score-files $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ --key-files data/voxsrc22_dev/trials.csv \ --score-names voxsrc22_dev \ diff --git a/hyperion/bin/__init__.py b/hyperion/bin/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hyperion/bin/adv_finetune_xvector_from_wav.py b/hyperion/bin/adv_finetune_xvector_from_wav.py index f45b84a0..ea3d3b80 100755 --- a/hyperion/bin/adv_finetune_xvector_from_wav.py +++ b/hyperion/bin/adv_finetune_xvector_from_wav.py @@ -13,6 +13,13 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.adv_attacks import AttackFactory @@ -29,8 +36,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorAdvTrainerFromWav as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) xvec_dict = { "resnet": RXVec, @@ -43,7 +48,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -138,7 +142,6 @@ def init_attack(feat_extractor, model, wav_scale, **kwargs): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -231,8 +234,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="""Fine-tune x-vector model from audio files with adversarial training""" @@ -266,6 +268,10 @@ def make_parser(xvec_class): train_xvec(gpu_id, args_sc) +if __name__ == "__main__": + main() + + # def init_data( # audio_path, # train_list, diff --git a/hyperion/bin/apply_mvn_select_frames.py b/hyperion/bin/apply_mvn_select_frames.py index bdf53786..f8299edc 100755 --- a/hyperion/bin/apply_mvn_select_frames.py +++ b/hyperion/bin/apply_mvn_select_frames.py @@ -10,6 +10,13 @@ import time import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import RandomAccessDataReaderFactory as RDRF @@ -18,8 +25,6 @@ from hyperion.np.feats import MeanVarianceNorm as MVN from hyperion.utils import Utt2Info from hyperion.utils.kaldi_matrix import compression_methods -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def process_feats( @@ -35,7 +40,6 @@ def process_feats( 
compression_method, **kwargs ): - logging.info("initializing") mvn_args = MVN.filter_args(**kwargs) mvn = MVN(**mvn_args) @@ -49,16 +53,23 @@ def process_feats( logging.info("opening output stream: %s" % (output_spec)) with DWF.create( - output_spec, compress=compress, compression_method=compression_method, + output_spec, + compress=compress, + compression_method=compression_method, ) as writer: - logging.info("opening input stream: %s" % (output_spec)) with DRF.create( - input_spec, path_prefix=path_prefix, part_idx=part_idx, num_parts=num_parts, + input_spec, + path_prefix=path_prefix, + part_idx=part_idx, + num_parts=num_parts, ) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = RDRF.create(vad_spec, path_prefix=vad_path_prefix,) + v_reader = RDRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) while not reader.eof(): key, data = reader.read(1) @@ -91,8 +102,7 @@ def process_feats( u2nf.save(write_num_frames_spec) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Apply CMVN and remove silence") parser.add_argument("--input", dest="input_spec", required=True) @@ -105,7 +115,9 @@ def process_feats( "--path-prefix", dest="path_prefix", default=None, help=("scp file_path prefix") ) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument( "--part-idx", @@ -150,3 +162,7 @@ def process_feats( logging.debug(args) process_feats(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/audio_to_duration.py b/hyperion/bin/audio_to_duration.py index ac8852a4..8ef6b5c1 100755 --- a/hyperion/bin/audio_to_duration.py +++ b/hyperion/bin/audio_to_duration.py @@ -9,15 +9,19 @@ import time import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger from hyperion.io import SequentialAudioReader as AR from hyperion.utils import SegmentSet -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def audio_to_duration(audio_file, output_file, **kwargs): - input_args = AR.filter_args(**kwargs) logging.info(f"input_args={input_args}") @@ -36,8 +40,7 @@ def audio_to_duration(audio_file, output_file, **kwargs): seg_set.save(output_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Writes audio file durations to table") parser.add_argument("--cfg", action=ActionConfigFile) @@ -59,3 +62,7 @@ def audio_to_duration(audio_file, output_file, **kwargs): logging.debug(args) audio_to_duration(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py index 9d50388c..fe0b1d8e 100755 --- a/hyperion/bin/compute_energy_vad.py +++ b/hyperion/bin/compute_energy_vad.py @@ -9,10 +9,6 @@ import time import numpy as np -from hyperion.hyp_defs import config_logger -from hyperion.io import DataWriterFactory as DWF -from hyperion.io import SequentialAudioReader as AR -from hyperion.np.feats import EnergyVAD from jsonargparse import ( ActionConfigFile, ActionParser, @@ -20,9 +16,13 @@ namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.np.feats import EnergyVAD -def 
compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): +def compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): vad_args = EnergyVAD.filter_args(**kwargs) vad = EnergyVAD(**vad_args) @@ -78,8 +78,7 @@ def compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): f_num_frames.close() -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Compute Kaldi Energy VAD") parser.add_argument("--cfg", action=ActionConfigFile) @@ -105,3 +104,7 @@ def compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): logging.debug(args) compute_vad(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/compute_mfcc_feats.py b/hyperion/bin/compute_mfcc_feats.py index 442e4141..f42f260d 100755 --- a/hyperion/bin/compute_mfcc_feats.py +++ b/hyperion/bin/compute_mfcc_feats.py @@ -9,20 +9,24 @@ import time import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import compression_methods from hyperion.np.feats import MFCC -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def compute_mfcc_feats( input_path, output_path, compress, compression_method, write_num_frames, **kwargs ): - mfcc_args = MFCC.filter_args(**kwargs) mfcc = MFCC(**mfcc_args) @@ -34,7 +38,9 @@ def compute_mfcc_feats( reader = DRF.create(input_path, **input_args) writer = DWF.create( - output_path, compress=compress, compression_method=compression_method, + output_path, + compress=compress, + compression_method=compression_method, ) if write_num_frames is not None: @@ -68,8 +74,7 @@ def compute_mfcc_feats( f_num_frames.close() -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Compute MFCC features") parser.add_argument("--cfg", action=ActionConfigFile) @@ -109,3 +114,7 @@ def compute_mfcc_feats( logging.debug(args) compute_mfcc_feats(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/copy_feats.py b/hyperion/bin/copy_feats.py index 4549caec..4ffc1a58 100755 --- a/hyperion/bin/copy_feats.py +++ b/hyperion/bin/copy_feats.py @@ -12,11 +12,12 @@ import time import numpy as np + from hyperion.hyp_defs import config_logger from hyperion.io import CopyFeats as CF -if __name__ == "__main__": +def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, fromfile_prefix_chars="@", @@ -37,3 +38,7 @@ logging.debug(args) CF(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/decode_wav2transducer.py b/hyperion/bin/decode_wav2transducer.py index 972b247c..bcf9e05c 100755 --- a/hyperion/bin/decode_wav2transducer.py +++ b/hyperion/bin/decode_wav2transducer.py @@ -15,18 +15,22 @@ import sentencepiece as spm import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.np.augment import SpeechAugment from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.models.wav2transducer.beam_search import 
(beam_search, - greedy_search) +from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_device(use_gpu): @@ -118,7 +122,6 @@ def decode_one_batch( def decode_transducer( input_spec, output_spec, model_path, bpe_model, use_gpu, **kwargs ): - device = init_device(use_gpu) model = load_model(model_path, device) @@ -202,8 +205,7 @@ def decode_transducer( ) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Extracts x-vectors from waveform computing " "acoustic features on the fly" @@ -235,3 +237,7 @@ def decode_transducer( logging.debug(args) decode_transducer(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/decode_wav2vec2rnn_transducer.py b/hyperion/bin/decode_wav2vec2rnn_transducer.py index 4fdc3140..33aea8c3 100755 --- a/hyperion/bin/decode_wav2vec2rnn_transducer.py +++ b/hyperion/bin/decode_wav2vec2rnn_transducer.py @@ -15,19 +15,23 @@ import sentencepiece as spm import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.np.augment import SpeechAugment from hyperion.torch import TorchModelLoader as TML from hyperion.torch.models import HFWav2Vec2RNNTransducer -from hyperion.torch.models.wav2transducer.beam_search import (beam_search, - greedy_search) +from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_device(use_gpu): @@ -48,10 +52,11 @@ def load_model(model_path, device): def decode_one_batch( - model: nn.Module, - sp: spm.SentencePieceProcessor, - x: torch.Tensor, - decoding_method="beam_search") -> Dict[str, List[List[str]]]: + model: nn.Module, + sp: spm.SentencePieceProcessor, + x: torch.Tensor, + decoding_method="beam_search", +) -> Dict[str, List[List[str]]]: """Decode one batch and return the result in a dict. The dict has the following format: - key: It indicates the setting used for decoding. For example, @@ -77,7 +82,7 @@ def decode_one_batch( the returned dict. 
""" device = model.device - feature = x #batch["inputs"] + feature = x # batch["inputs"] assert x.shape[0] == 1 assert feature.ndim == 2 @@ -87,7 +92,8 @@ def decode_one_batch( feature_lens = torch.Tensor([x.shape[1]]).int() encoder_out, hid_feats, encoder_out_lens = model.forward_feats( - x=feature, x_lengths=feature_lens) + x=feature, x_lengths=feature_lens + ) hyps = [] batch_size = encoder_out.size(0) @@ -114,9 +120,16 @@ def decode_one_batch( return hyps[0] -def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, - infer_args, use_gpu, **kwargs): - +def decode_transducer( + input_spec, + output_spec, + scp_sep, + model_path, + bpe_model, + infer_args, + use_gpu, + **kwargs, +): device = init_device(use_gpu) model = load_model(model_path, device) @@ -142,8 +155,9 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, t2 = time.time() logging.info("processing utt %s", key) with torch.no_grad(): - x = torch.tensor( - x[None, :], dtype=torch.get_default_dtype()).to(device) + x = torch.tensor(x[None, :], dtype=torch.get_default_dtype()).to( + device + ) tot_frames = x.shape[1] logging.info( @@ -157,10 +171,10 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, if x.shape[1] == 0: y = [""] else: - #y = decode_one_batch(model=model, sp=sp, x=x) - x_lengths = torch.tensor((x.shape[1], ), - dtype=torch.long, - device=device) + # y = decode_one_batch(model=model, sp=sp, x=x) + x_lengths = torch.tensor( + (x.shape[1],), dtype=torch.long, device=device + ) y = model.infer(x, x_lengths, **infer_args) y = sp.decode(y[0]) @@ -172,10 +186,12 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, tot_time = t4 - t1 infer_time = t3 - t2 logging.info( - ("utt %s total-time=%.3f read-time=%.3f " - "infer-time=%.3f " - "write-time=%.3f " - "infer-rt-factor=%.2f tot-rt-factor=%.2f"), + ( + "utt %s total-time=%.3f read-time=%.3f " + "infer-time=%.3f " + "write-time=%.3f " + "infer-rt-factor=%.2f tot-rt-factor=%.2f" + ), key, tot_time, t2 - t1, @@ -186,16 +202,14 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, ) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( - description=("ASR decoding for RNN-T with Wav2vec features")) + description=("ASR decoding for RNN-T with Wav2vec features") + ) parser.add_argument("--cfg", action=ActionConfigFile) parser.add_argument("--input", dest="input_spec", required=True) - parser.add_argument("--scp-sep", - default=" ", - help=("scp file field separator")) + parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) AR.add_class_args(parser) parser.add_argument("--model-path", required=True) @@ -203,16 +217,12 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, HFWav2Vec2RNNTransducer.add_infer_args(parser, "infer-args") parser.add_argument("--output", dest="output_spec", required=True) - parser.add_argument("--use-gpu", - default=False, - action="store_true", - help="extract xvectors in gpu") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -220,3 +230,7 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, 
logging.debug(args) decode_transducer(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_cosine_scoring_backend.py b/hyperion/bin/eval_cosine_scoring_backend.py index 1a740024..835cae0b 100755 --- a/hyperion/bin/eval_cosine_scoring_backend.py +++ b/hyperion/bin/eval_cosine_scoring_backend.py @@ -4,24 +4,24 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging +import time +from pathlib import Path + +import numpy as np from jsonargparse import ( - ArgumentParser, ActionConfigFile, ActionParser, + ArgumentParser, namespace_to_dict, ) -import time -import logging -from pathlib import Path - -import numpy as np from hyperion.hyp_defs import config_logger -from hyperion.utils import TrialNdx, TrialKey, TrialScores, EnrollmentMap, SegmentSet -from hyperion.utils.math_funcs import cosine_scoring from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.np.transforms import TransformList from hyperion.np.score_norm import AdaptSNorm +from hyperion.np.transforms import TransformList +from hyperion.utils import EnrollmentMap, SegmentSet, TrialKey, TrialNdx, TrialScores +from hyperion.utils.math_funcs import cosine_scoring def load_trial_data( @@ -58,7 +58,6 @@ def load_trial_data( def load_cohort_data(segments_file, feats_file): - segments = SegmentSet.load(segments_file) feats_reader = DRF.create(feats_file) x = feats_reader.read(segments["id"], squeeze=True) @@ -81,7 +80,6 @@ def eval_backend( cohort_nbest, avg_cohort_by, ): - logging.info("loading data") enroll_map, ndx, x_e, x_t = load_trial_data( enroll_map_file, @@ -151,8 +149,7 @@ def eval_backend( scores.save(score_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Eval cosine-scoring with optional AS-Norm") parser.add_argument("--enroll-feats-file", default=None) @@ -198,3 +195,7 @@ def eval_backend( logging.debug(args) eval_backend(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py index 0333669f..4fecf2f3 100755 --- a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py +++ b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py @@ -4,33 +4,33 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging +import time from pathlib import Path import numpy as np import pandas as pd +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.np.score_norm import AdaptSNorm +from hyperion.np.transforms import TransformList from hyperion.utils import ( - TrialNdx, - TrialKey, - TrialScores, EnrollmentMap, - SegmentSet, InfoTable, + SegmentSet, + TrialKey, + TrialNdx, + TrialScores, ) -from hyperion.utils.math_funcs import cosine_scoring, average_vectors -from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.np.transforms import TransformList -from hyperion.np.score_norm import AdaptSNorm -from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.utils.math_funcs import average_vectors, cosine_scoring def get_precomp_qm_names(quality_measures): @@ -542,7 +542,7 @@ def 
eval_backend( # scores.save(score_file_snorm) -if __name__ == "__main__": +def main(): parser = ArgumentParser( description="Eval cosine-scoring with optional AS-Norm and QMF" ) @@ -611,3 +611,7 @@ def eval_backend( logging.debug(args) eval_backend(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_verification_metrics.py b/hyperion/bin/eval_verification_metrics.py index 83227558..98fd37e2 100755 --- a/hyperion/bin/eval_verification_metrics.py +++ b/hyperion/bin/eval_verification_metrics.py @@ -5,19 +5,19 @@ """ import logging from pathlib import Path -import pandas as pd - -from hyperion.hyp_defs import config_logger -from hyperion.np.metrics import VerificationEvaluator as VE +import pandas as pd from jsonargparse import ( ActionConfigFile, - ActionYesNo, ActionParser, + ActionYesNo, ArgumentParser, namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.np.metrics import VerificationEvaluator as VE + def eval_verification_metrics( key_files, @@ -30,7 +30,6 @@ def eval_verification_metrics( sparse, output_file, ): - assert len(key_files) == len(key_names) assert len(score_files) == len(score_names) dfs = [] @@ -61,8 +60,7 @@ def eval_verification_metrics( print(df.to_string(), flush=True) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Evaluate speaker verification metrics") parser.add_argument("--cfg", action=ActionConfigFile) parser.add_argument("--key-files", required=True, nargs="+") @@ -85,7 +83,12 @@ def eval_verification_metrics( parser.add_argument("--sparse", default=False, action=ActionYesNo) parser.add_argument("--output-file", required=True) parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, ) args = parser.parse_args() @@ -94,3 +97,7 @@ def eval_verification_metrics( del kwargs["verbose"] del kwargs["cfg"] eval_verification_metrics(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py index 7c9d4104..1baad913 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py @@ -12,6 +12,13 @@ import pandas as pd import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -26,8 +33,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) class MyModel(nn.Module): @@ -44,7 +49,6 @@ def __init__( self.sigma = sigma def forward(self, s_t): - if self.sigma > 0: s_t = s_t + self.sigma * torch.randn_like(s_t) @@ -107,7 +111,6 @@ def load_calibrator(cal_file, threshold): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -143,7 +146,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device = init_device(use_gpu) feat_extractor = 
init_feats(**kwargs) xvector_model = load_model(model_path) @@ -319,8 +321,7 @@ def eval_cosine_scoring( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Eval cosine-scoring given enroll x-vector and test wave" ) @@ -336,7 +337,9 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -415,3 +418,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py index fb0d402c..3e4e9229 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py @@ -7,6 +7,7 @@ import os import sys import time + # [Added Sonal May21] from pathlib import Path @@ -14,6 +15,13 @@ import pandas as pd import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -29,8 +37,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) torch.backends.cudnn.enabled = False @@ -45,7 +51,7 @@ def __init__( sigma=0, smoothing_after_wavegan=None, wave_gan_defender=None, - wav_scale=2 ** 15 - 1, + wav_scale=2**15 - 1, ): super().__init__() self.feat_extractor = feat_extractor @@ -61,7 +67,6 @@ def __init__( self.apply_wavegan = False if wave_gan_defender is None else True def forward(self, s_t): - # Pre-proceessing defense, wavegan + smoothing [Added Sonal May21] s_t = s_t / self.wav_scale if self.smoothing_after_wavegan: @@ -149,7 +154,6 @@ def load_calibrator(cal_file, threshold): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -188,7 +192,6 @@ def eval_cosine_scoring_wavegan( wave_gan_model_ckpt, **kwargs ): - device = init_device(use_gpu) feat_extractor = init_feats(**kwargs) @@ -374,8 +377,7 @@ def eval_cosine_scoring_wavegan( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Eval cosine-scoring given enroll x-vector and test wave" ) @@ -391,7 +393,9 @@ def eval_cosine_scoring_wavegan( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -488,3 +492,7 @@ def eval_cosine_scoring_wavegan( logging.debug(args) eval_cosine_scoring_wavegan(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py 
b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py index 2d5baa17..781cdbdf 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py @@ -15,6 +15,13 @@ import torch.nn as nn from art.classifiers import PyTorchClassifier from art.estimators.classification import PyTorchClassifier +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -22,16 +29,15 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import \ - ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_device(use_gpu): @@ -69,7 +75,6 @@ def load_calibrator(cal_file): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -156,7 +161,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device_type = "gpu" if use_gpu else "cpu" device = init_device(use_gpu) feat_extractor = init_feats(**kwargs) @@ -343,8 +347,7 @@ def eval_cosine_scoring( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Eval cosine-scoring given enroll x-vector " @@ -363,7 +366,9 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -431,3 +436,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py index 76af5d75..2ebb7e3d 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py @@ -12,6 +12,13 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import RandomAccessAudioReader as AR from hyperion.io import RandomAccessDataReaderFactory as DRF @@ -24,8 +31,6 @@ from hyperion.torch.utils.misc import l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_device(use_gpu): @@ -66,7 +71,6 
@@ def load_calibrator(cal_file, device): def read_data(v_file, ndx_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) try: @@ -104,7 +108,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) @@ -199,8 +202,7 @@ def eval_cosine_scoring( s.save_txt(score_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Eval cosine-scoring given enroll x-vector and test wave" ) @@ -216,7 +218,9 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -266,3 +270,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py index f33402a1..a6f8efa4 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py @@ -12,6 +12,13 @@ import pandas as pd import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -26,8 +33,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) class MyModel(nn.Module): @@ -104,7 +109,6 @@ def load_calibrator(cal_file, threshold): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -146,7 +150,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device = init_device(use_gpu) # load victim model feat_extractor = init_feats(**kwargs["feats"]) @@ -204,7 +207,7 @@ def eval_cosine_scoring( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -337,8 +340,7 @@ def eval_cosine_scoring( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Eval cosine-scoring given enroll x-vector and " @@ -435,3 +437,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py index f94dc497..7b8bc245 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py @@ -15,6 +15,13 @@ import torch.nn as nn from 
art.classifiers import PyTorchClassifier from art.estimators.classification import PyTorchClassifier +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -22,16 +29,15 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import \ - ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) class MyModel(nn.Module): @@ -113,7 +119,6 @@ def load_calibrator(cal_file): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -155,7 +160,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device_type = "gpu" if use_gpu else "cpu" device = init_device(use_gpu) # load victim model @@ -361,8 +365,7 @@ def eval_cosine_scoring( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Eval cosine-scoring given enroll x-vector and " @@ -384,7 +387,9 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -456,3 +461,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py index f60c7508..b2e6a665 100755 --- a/hyperion/bin/eval_xvec_logits_from_wav.py +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -12,6 +12,13 @@ import numpy as np import pandas as pd import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -21,12 +28,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_device(use_gpu): @@ -110,7 +111,6 @@ def eval_xvec( use_gpu, **kwargs ): - rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) @@ -131,15 +131,16 @@ def eval_xvec( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) with DWF.create(output_spec) as writer: - 
logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) ) with AR(input_spec, **ar_args) as reader: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) + v_reader = VRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) while not reader.eof(): t1 = time.time() @@ -224,8 +225,7 @@ def eval_xvec( aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Evaluates x-vectors logits from waveform computing " @@ -299,3 +299,7 @@ def eval_xvec( logging.debug(args) eval_xvec(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index 5eba1b99..f2df9581 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -13,6 +13,13 @@ import pandas as pd import torch import torchaudio.transforms as tat +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -21,12 +28,6 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) resamplers = {} @@ -122,7 +123,6 @@ def extract_xvectors( use_gpu, **kwargs, ): - rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) model = load_model(model_path, device) @@ -143,13 +143,14 @@ def extract_xvectors( ar_args["wav_scale"] = 1.0 logging.info("opening output stream: %s", output_spec) with DWF.create(output_spec) as writer: - logging.info(f"opening input stream: {recordings_file} with args={ar_args}") with AR(recordings_file, **ar_args) as reader: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) + v_reader = VRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) while not reader.eof(): t1 = time.time() @@ -252,8 +253,7 @@ def extract_xvectors( aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Extracts x-vectors from waveform computing " "acoustic features on the fly" @@ -340,3 +340,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_wav2xvectors.py b/hyperion/bin/extract_wav2xvectors.py index 7b04fcc8..763df3fc 100755 --- a/hyperion/bin/extract_wav2xvectors.py +++ b/hyperion/bin/extract_wav2xvectors.py @@ -13,6 +13,13 @@ import pandas as pd import torch import torchaudio.transforms as tat +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -21,12 +28,6 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - 
ArgumentParser, - namespace_to_dict, -) resamplers = {} @@ -121,7 +122,6 @@ def extract_xvectors( use_gpu, **kwargs, ): - rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) model = load_model(model_path, device) @@ -143,10 +143,8 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s with args=%s", output_spec, str(ar_args)) with DWF.create(output_spec, metadata_columns=metadata_columns) as writer: - logging.info(f"opening input stream: {recordings_file} with args={ar_args}") with AR(recordings_file, **ar_args) as reader: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) @@ -255,8 +253,7 @@ def extract_xvectors( aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="""Extracts x-vectors from waveform computing acoustic features on the fly""" ) @@ -331,3 +328,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_from_feats.py b/hyperion/bin/extract_xvectors_from_feats.py index b02db70c..e70225c2 100755 --- a/hyperion/bin/extract_xvectors_from_feats.py +++ b/hyperion/bin/extract_xvectors_from_feats.py @@ -11,6 +11,13 @@ import numpy as np import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF @@ -19,12 +26,6 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_device(use_gpu): @@ -82,7 +83,6 @@ def extract_xvectors( use_gpu, **kwargs ): - logging.info("initializing") rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) @@ -96,7 +96,6 @@ def extract_xvectors( dr_args = DRF.filter_args(**kwargs) logging.info("opening output stream: %s" % (output_spec)) with DWF.create(output_spec) as writer: - logging.info("opening input stream: %s" % (input_spec)) with DRF.create(input_spec, **dr_args) as reader: if vad_spec is not None: @@ -174,8 +173,7 @@ def extract_xvectors( u2nf.save(write_num_frames_spec) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Extracts x-vectors from features") parser.add_argument("--cfg", action=ActionConfigFile) @@ -244,3 +242,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index 6a8130d3..71a24bd4 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -12,6 +12,13 @@ import numpy as np import pandas as pd import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -21,12 +28,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF 
from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_device(use_gpu): @@ -111,7 +112,6 @@ def extract_xvectors( use_gpu, **kwargs ): - rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) @@ -132,12 +132,10 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) with DWF.create(output_spec) as writer: - logging.info( "opening input stream: {} with args={}".format(recordings_file, ar_args) ) with AR(recordings_file, **ar_args) as reader: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) @@ -235,8 +233,7 @@ def extract_xvectors( aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Extracts x-vectors from waveform computing acoustic features on the fly" @@ -317,3 +314,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py index bcec5133..a1186ed2 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_feats.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py @@ -12,6 +12,13 @@ import numpy as np import torch import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF @@ -20,12 +27,6 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_device(use_gpu): @@ -73,7 +74,6 @@ def extract_xvectors( use_gpu, **kwargs ): - logging.info("initializing") rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) @@ -86,7 +86,6 @@ def extract_xvectors( dr_args = DRF.filter_args(**kwargs) logging.info("opening output stream: %s" % (output_spec)) with DWF.create(output_spec) as writer: - logging.info("opening input stream: %s" % (output_spec)) with DRF.create(input_spec, **dr_args) as reader: if vad_spec is not None: @@ -118,7 +117,13 @@ def extract_xvectors( t4 = time.time() if x.shape[0] == 0: - y = np.zeros((1, model.embed_dim,), dtype=float_cpu(),) + y = np.zeros( + ( + 1, + model.embed_dim, + ), + dtype=float_cpu(), + ) else: xx = torch.tensor(x.T[None, :], dtype=torch.get_default_dtype()) with torch.no_grad(): @@ -195,8 +200,7 @@ def extract_xvectors( yaml.dump(params, f) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Extract x-vectors over a sliding window") parser.add_argument("--cfg", action=ActionConfigFile) @@ -208,7 +212,9 @@ def extract_xvectors( ) parser.add_argument("--slidwin-params-path", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) MVN.add_class_args(parser, prefix="mvn") @@ -298,3 +304,7 @@ 
def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index f1a64e1b..f973b566 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -13,6 +13,13 @@ import pandas as pd import torch import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -22,12 +29,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_device(use_gpu): @@ -99,7 +100,6 @@ def extract_xvectors( use_gpu, **kwargs ): - rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) @@ -124,15 +124,16 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) with DWF.create(output_spec) as writer: - logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) ) with AR(input_spec, **ar_args) as reader: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) + v_reader = VRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) while not reader.eof(): t1 = time.time() @@ -172,7 +173,13 @@ def extract_xvectors( t6 = time.time() if x.shape[1] == 0: - y = np.zeros((1, model.embed_dim,), dtype=float_cpu(),) + y = np.zeros( + ( + 1, + model.embed_dim, + ), + dtype=float_cpu(), + ) else: x = x.transpose(1, 2).contiguous() y = ( @@ -255,8 +262,7 @@ def extract_xvectors( yaml.dump(params, f) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Extract x-vectors over a sliding window" @@ -347,3 +353,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_wav2vec2transducer.py b/hyperion/bin/finetune_wav2vec2transducer.py index 6f17f800..138f18f7 100755 --- a/hyperion/bin/finetune_wav2vec2transducer.py +++ b/hyperion/bin/finetune_wav2vec2transducer.py @@ -14,6 +14,14 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD @@ -22,9 +30,6 @@ from hyperion.torch.models import HFWav2Vec2Transducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from torch.nn.utils.rnn import pad_sequence model_dict = { "hf_wav2vec2transducer": HFWav2Vec2Transducer, @@ -43,8 +48,7 @@ def transducer_collate(batch): audio = pad_sequence(audio) audio_length = torch.as_tensor(audio_length) target = k2.RaggedTensor(target) - return 
torch.transpose(audio,0,1), audio_length, target - + return torch.transpose(audio, 0, 1), audio_length, target def init_data(partition, rank, num_gpus, **kwargs): @@ -73,7 +77,9 @@ def init_data(partition, rank, num_gpus, **kwargs): largs = ( {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} ) - data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) return data_loader @@ -89,11 +95,7 @@ def init_model(in_model_file, rank, model_class, **kwargs): return model - - - def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -119,7 +121,7 @@ def train_model(gpu_id, args): trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {} + metrics = {} trainer = Trainer( model, device=device, @@ -135,7 +137,7 @@ def train_model(gpu_id, args): def make_parser(model_class): parser = ArgumentParser() - + parser.add_argument("--cfg", action=ActionConfigFile) train_parser = ArgumentParser(prog="") AD.add_class_args(train_parser, prefix="dataset", skip={}) @@ -161,27 +163,23 @@ def make_parser(model_class): data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) - parser.add_argument( "--data.train.dataset.text_file", - type=str, + type=str, ) - - parser.add_argument("--data.val.dataset.text_file", type=str) - + + parser.add_argument("--data.val.dataset.text_file", type=str) + parser.add_argument( "--data.train.dataset.bpe_model", - type=str, + type=str, ) parser.link_arguments( "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" ) - parser.link_arguments( - "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" - ) - + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") parser.add_argument("--in-model-file", required=True) model_class.add_finetune_args(parser, prefix="model") @@ -198,8 +196,10 @@ def make_parser(model_class): return parser -if __name__ == "__main__": - parser = ArgumentParser(description="Fine-tune Wav2Vec2Transducer model from audio files") +def main(): + parser = ArgumentParser( + description="Fine-tune Wav2Vec2Transducer model from audio files" + ) parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() @@ -228,3 +228,7 @@ def make_parser(model_class): # torch docs recommend using forkserver # multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py index fc3c7084..7020e32f 100755 --- a/hyperion/bin/finetune_wav2vec2xvector.py +++ b/hyperion/bin/finetune_wav2vec2xvector.py @@ -13,18 +13,25 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.models import (HFHubert2ResNet1dXVector, - HFWav2Vec2ResNet1dXVector, - HFWavLM2ResNet1dXVector) 
+from hyperion.torch.models import ( + HFHubert2ResNet1dXVector, + HFWav2Vec2ResNet1dXVector, + HFWavLM2ResNet1dXVector, +) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, @@ -34,7 +41,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -99,7 +105,6 @@ def init_hard_prototype_mining(model, train_loader, val_loader, rank): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -182,8 +187,7 @@ def make_parser(model_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Finetunes Wav2Vec2XVector model from audio files" ) @@ -215,3 +219,7 @@ def make_parser(model_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_wav2xvector.py b/hyperion/bin/finetune_wav2xvector.py index b100b544..97356c01 100755 --- a/hyperion/bin/finetune_wav2xvector.py +++ b/hyperion/bin/finetune_wav2xvector.py @@ -11,6 +11,13 @@ from pathlib import Path import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD @@ -27,12 +34,6 @@ # from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) xvec_dict = { "resnet": RXVec, @@ -45,7 +46,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -115,7 +115,6 @@ def init_hard_prototype_mining(model, train_loader, val_loader, rank): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -196,8 +195,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Fine-tune x-vector model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -226,3 +224,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_dfr_from_feats.py b/hyperion/bin/finetune_xvector_dfr_from_feats.py index 17cafb85..140cc3a2 100755 --- a/hyperion/bin/finetune_xvector_dfr_from_feats.py +++ b/hyperion/bin/finetune_xvector_dfr_from_feats.py @@ -14,6 +14,13 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import ClassWeightedSeqSampler as Sampler @@ -22,8 +29,6 @@ from hyperion.torch.models import XVector as XVec from hyperion.torch.trainers import 
XVectorTrainerDeepFeatReg as Trainer from hyperion.torch.utils import ddp, open_device -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): @@ -60,7 +65,6 @@ def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **k def init_xvector( num_classes, in_model_path, prior_model_path, rank, train_mode, **kwargs ): - xvec_args = XVec.filter_finetune_args(**kwargs) if rank == 0: logging.info("xvector network ft args={}".format(xvec_args)) @@ -194,8 +198,7 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Fine-tune x-vector model with deep feature loss regularization" ) @@ -278,3 +281,7 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_dfr_from_wav.py b/hyperion/bin/finetune_xvector_dfr_from_wav.py index f7832a47..9d745e67 100755 --- a/hyperion/bin/finetune_xvector_dfr_from_wav.py +++ b/hyperion/bin/finetune_xvector_dfr_from_wav.py @@ -8,10 +8,18 @@ import os import sys import time +from pathlib import Path import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD @@ -21,8 +29,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerDeepFeatRegFromWav as Trainer from hyperion.torch.utils import ddp, open_device -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_data( @@ -36,7 +42,6 @@ def init_data( rank, **kwargs ): - ad_args = AD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: @@ -82,7 +87,6 @@ def init_feats(rank, **kwargs): def init_xvector( num_classes, in_model_path, prior_model_path, rank, train_mode, **kwargs ): - xvec_args = XVec.filter_finetune_args(**kwargs) if rank == 0: logging.info("xvector network ft args={}".format(xvec_args)) @@ -103,7 +107,6 @@ def init_xvector( def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -231,8 +234,7 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Fine-tune x-vector model with deep feature loss " @@ -327,3 +329,7 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_from_feats.py b/hyperion/bin/finetune_xvector_from_feats.py index ac9c2d0b..01e0c778 100755 --- a/hyperion/bin/finetune_xvector_from_feats.py +++ b/hyperion/bin/finetune_xvector_from_feats.py @@ -12,6 +12,13 @@ import numpy as np import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import ClassWeightedSeqSampler as Sampler @@ -20,8 +27,6 @@ from hyperion.torch.models import XVector as XVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils 
import ddp, open_device -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): @@ -161,8 +166,7 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Fine-tune x-vector model") parser.add_argument("--cfg", action=ActionConfigFile) @@ -230,3 +234,7 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py index 1c7cbe58..2c884d0b 100755 --- a/hyperion/bin/finetune_xvector_from_wav.py +++ b/hyperion/bin/finetune_xvector_from_wav.py @@ -11,6 +11,13 @@ from pathlib import Path import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD @@ -25,8 +32,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) xvec_dict = { "resnet": RXVec, @@ -39,7 +44,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -120,7 +124,6 @@ def init_hard_prototype_mining(model, train_loader, val_loader, rank): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -208,8 +211,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Fine-tune x-vector model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -238,3 +240,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index 4336b7b9..00452695 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -14,6 +14,13 @@ import torch import torch.nn as nn import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -24,12 +31,6 @@ from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialNdx, Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def read_utt_list(list_file, class2int_file, part_idx, num_parts): @@ -156,14 +157,13 @@ def generate_attacks( num_parts, **kwargs ): - device = init_device(use_gpu) model = init_model(model_path, **kwargs) model.to(device) logging.info("opening audio read stream: %s" % (wav_file)) audio_args = AR.filter_args(**kwargs) - audio_reader = AR(wav_file ** audio_args) + audio_reader = 
AR(wav_file, **audio_args) wav_scale = audio_reader.wav_scale logging.info("opening audio write stream: %s" % (output_wav_dir)) @@ -207,7 +207,7 @@ def generate_attacks( s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) target = torch.as_tensor([class_id], dtype=torch.long).to(device) if vad_spec is not None: - vad = v_reader.read([key.seg_set[j]])[0] + vad = v_reader.read([key])[0] tot_frames = len(vad) speech_frames = np.sum(vad) vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( @@ -217,7 +217,7 @@ def generate_attacks( logging.info( "utt %s detected %d/%d (%.2f %%) speech frames" % ( - key.seg_set[j], + key, speech_frames, tot_frames, speech_frames / tot_frames * 100, @@ -315,8 +315,7 @@ def generate_attacks( yaml.dump(attacks_info, f, sort_keys=True) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Generate Attacks for speaker classification with x-vectors" ) @@ -332,7 +331,9 @@ def generate_attacks( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -413,3 +414,7 @@ def generate_attacks( logging.debug(args) generate_attacks(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/generate_adv_attacks_xvector_verif.py b/hyperion/bin/generate_adv_attacks_xvector_verif.py index 363e3afc..ab7d907b 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_verif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_verif.py @@ -14,6 +14,13 @@ import torch import torch.nn as nn import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -28,8 +35,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) class MyModel(nn.Module): @@ -73,7 +78,6 @@ def forward(self, s_t): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -173,7 +177,6 @@ def generate_attacks( random_seed, **kwargs ): - device = init_device(use_gpu) model = init_model(model_path, embed_layer, cal_file, threshold, **kwargs) model.to(device) @@ -346,8 +349,7 @@ def generate_attacks( yaml.dump(attacks_info, f, sort_keys=True) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Generate Attacks for speaker verification with x-vectors+cos+calibration" ) @@ -442,3 +444,7 @@ def generate_attacks( logging.debug(args) generate_attacks(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py index 2e3a35ec..17fff2ba 100755 --- a/hyperion/bin/hyperion_dataset.py +++ b/hyperion/bin/hyperion_dataset.py @@ -7,6 +7,14 @@ from pathlib import Path from typing import List, Optional, Union +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) +
from hyperion.hyp_defs import config_logger from hyperion.utils import ( ClassInfo, @@ -18,13 +26,6 @@ RecordingSet, SegmentSet, ) -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, - ActionYesNo, -) subcommand_list = [ "add_features", @@ -41,7 +42,12 @@ def add_common_args(parser): parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, ) @@ -145,7 +151,8 @@ def make_make_from_recordings_parser(): def make_from_recordings( - dataset: PathLike, recordings_file: PathLike, + dataset: PathLike, + recordings_file: PathLike, ): output_dataset = dataset import pandas as pd @@ -186,7 +193,10 @@ def make_remove_short_segments_parser(): def remove_short_segments( - dataset: PathLike, min_length: float, length_name: str, output_dataset: PathLike, + dataset: PathLike, + min_length: float, + length_name: str, + output_dataset: PathLike, ): if output_dataset is None: output_dataset = dataset @@ -216,7 +226,9 @@ def make_rebuild_class_idx_parser(): def rebuild_class_idx( - dataset: PathLike, class_name: str, output_dataset: PathLike, + dataset: PathLike, + class_name: str, + output_dataset: PathLike, ): if output_dataset is None: output_dataset = dataset @@ -301,14 +313,21 @@ def make_split_train_val_parser(): help="""types of classes that need to have different classes in train and val""", ) parser.add_argument( - "--seed", default=11235813, type=int, help="""random seed""", + "--seed", + default=11235813, + type=int, + help="""random seed""", ) parser.add_argument( - "--train-dataset", required=True, help="""output train dataset dir""", + "--train-dataset", + required=True, + help="""output train dataset dir""", ) parser.add_argument( - "--val-dataset", required=True, help="""output val dataset dir""", + "--val-dataset", + required=True, + help="""output val dataset dir""", ) add_common_args(parser) @@ -361,7 +380,8 @@ def make_copy_parser(): def copy( - dataset: PathLike, output_dataset: PathLike, + dataset: PathLike, + output_dataset: PathLike, ): dataset = Dataset.load(dataset, lazy=True) dataset.save(output_dataset) @@ -383,7 +403,10 @@ def make_add_cols_to_segments_parser(): help="""columns to copy to segments table""", ) parser.add_argument( - "--on", default=["id"], nargs="+", help="""columns to match both tables rows""", + "--on", + default=["id"], + nargs="+", + help="""columns to match both tables rows""", ) parser.add_argument( "--right-on", @@ -418,8 +441,7 @@ def add_cols_to_segments( dataset.save(output_dataset) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Tool to manipulates the Hyperion dataset") parser.add_argument("--cfg", action=ActionConfigFile) @@ -436,3 +458,7 @@ def add_cols_to_segments( del kwargs["verbose"] del kwargs["cfg"] globals()[subcommand](**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/hyperion_tables.py b/hyperion/bin/hyperion_tables.py index 7f61b35a..59472d83 100755 --- a/hyperion/bin/hyperion_tables.py +++ b/hyperion/bin/hyperion_tables.py @@ -7,6 +7,13 @@ from pathlib import Path from typing import List, Optional, Union +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger from hyperion.utils import ( ClassInfo, @@ -17,12 +24,6 @@ RecordingSet, SegmentSet, ) -from jsonargparse import ( - ActionConfigFile, - ActionParser, 
- ArgumentParser, - namespace_to_dict, -) subcommand_list = ["cat"] table_dict = { @@ -87,7 +88,6 @@ def cat( num_tables: int, base_idx: int = 1, ): - assert input_files is not None or num_tables != 0 output_file = Path(output_file) if input_files is None: @@ -108,8 +108,7 @@ def cat( output_table.save(output_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") parser.add_argument("--cfg", action=ActionConfigFile) @@ -126,3 +125,7 @@ def cat( del kwargs["verbose"] del kwargs["cfg"] globals()[subcommand](**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/make_babble_noise_audio_files.py b/hyperion/bin/make_babble_noise_audio_files.py index 68e5b22b..43d6ab91 100755 --- a/hyperion/bin/make_babble_noise_audio_files.py +++ b/hyperion/bin/make_babble_noise_audio_files.py @@ -10,11 +10,6 @@ import time import numpy as np -from hyperion.hyp_defs import config_logger -from hyperion.io import AudioWriter as Writer -from hyperion.io import RandomAccessAudioReader as AR -from hyperion.io import VADReaderFactory as VRF -from hyperion.utils import Utt2Info from jsonargparse import ( ActionConfigFile, ActionParser, @@ -22,9 +17,14 @@ namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.io import AudioWriter as Writer +from hyperion.io import RandomAccessAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.utils import Utt2Info + def make_noise(xs, max_value): - lens = np.array([x.shape[0] for x in xs]) max_len = np.max(lens) num_tiles = np.ceil(max_len / lens) @@ -53,7 +53,6 @@ def make_babble_noise_audio_files( random_seed=112358, **kwargs, ): - input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) logging.info(f"input_args={input_args}") @@ -105,8 +104,7 @@ def make_babble_noise_audio_files( logging.info("finished making babble files, elapsed-time=%f", time.time() - t1) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Creates babble noise by adding speech files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -137,3 +135,7 @@ def make_babble_noise_audio_files( logging.debug(args) make_babble_noise_audio_files(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/make_wav2xvector.py b/hyperion/bin/make_wav2xvector.py index b5972d1b..b3a1a2d5 100755 --- a/hyperion/bin/make_wav2xvector.py +++ b/hyperion/bin/make_wav2xvector.py @@ -12,6 +12,13 @@ import numpy as np import pandas as pd import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger # from hyperion.torch import TorchModelLoader as TML @@ -26,12 +33,6 @@ from hyperion.torch.models import Wav2ResNet1dXVector as W2R1dXVec from hyperion.torch.models import Wav2ResNetXVector as W2RXVec from hyperion.torch.narchs import AudioFeatsMVN as AF -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_feats(feats): @@ -51,7 +52,6 @@ def load_model(model_path): def make_wav2xvector(feats, xvector_path, output_path): - feats = init_feats(feats) xvector_model = load_model(xvector_path) if isinstance(xvector_model, RXVec): @@ -67,8 +67,7 @@ def make_wav2xvector(feats, xvector_path, output_path): model.save(output_path) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="""Combines the feature extractor 
config with XVector model to produce a Wav2XVector model with integrated feature extraction""" @@ -89,3 +88,7 @@ def make_wav2xvector(feats, xvector_path, output_path): logging.debug(args) make_wav2xvector(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/merge_scores.py b/hyperion/bin/merge_scores.py index cb8524b7..72ab6010 100755 --- a/hyperion/bin/merge_scores.py +++ b/hyperion/bin/merge_scores.py @@ -6,9 +6,6 @@ import logging from pathlib import Path -from hyperion.hyp_defs import config_logger - -from hyperion.utils import TrialScores from jsonargparse import ( ActionConfigFile, ActionParser, @@ -16,6 +13,9 @@ namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.utils import TrialScores + def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, base_idx): output_file = Path(output_file) @@ -63,7 +63,7 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, bas write_header = False -if __name__ == "__main__": +def main(): parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") parser.add_argument("--cfg", action=ActionConfigFile) parser.add_argument( @@ -108,3 +108,7 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, bas del kwargs["verbose"] del kwargs["cfg"] merge_scores(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py index b2a1bc2b..bf88d674 100755 --- a/hyperion/bin/pack_wav_rirs.py +++ b/hyperion/bin/pack_wav_rirs.py @@ -10,9 +10,6 @@ import time import numpy as np -from hyperion.hyp_defs import config_logger -from hyperion.io import DataWriterFactory as DWF -from hyperion.io import SequentialAudioReader as AR from jsonargparse import ( ActionConfigFile, ActionParser, @@ -20,9 +17,12 @@ namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR -def pack_wav_rirs(input_path, output_spec, **kwargs): +def pack_wav_rirs(input_path, output_spec, **kwargs): writer = DWF.create(output_spec, compress=False) t1 = time.time() with AR(input_path, wav_scale=1) as reader: @@ -47,8 +47,7 @@ def pack_wav_rirs(input_path, output_spec, **kwargs): logging.info("Packed RIRS elapsed-time=%.f", time.time() - t1) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Packs RIRs in wave format to h5/ark files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -69,3 +68,7 @@ def pack_wav_rirs(input_path, output_spec, **kwargs): logging.debug(args) pack_wav_rirs(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/plot_embedding_tsne.py b/hyperion/bin/plot_embedding_tsne.py index e2157e3e..60d7ac5c 100755 --- a/hyperion/bin/plot_embedding_tsne.py +++ b/hyperion/bin/plot_embedding_tsne.py @@ -13,12 +13,18 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.transforms import PCA, LNorm, SklTSNE from hyperion.utils import SegmentSet -from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, - ArgumentParser, namespace_to_dict) matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] @@ -40,7 
+46,6 @@ def plot_embedding_tsne( output_dir, **kwargs, ): - output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) logging.info("loading data") @@ -126,8 +131,7 @@ def plot_embedding_tsne( # plt.clf() -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Projects embeddings using TSNE") parser.add_argument("--train-v-file", required=True) @@ -162,6 +166,9 @@ def plot_embedding_tsne( plot_embedding_tsne(**namespace_to_dict(args)) +if __name__ == "__main__": + main() + # #!/usr/bin/env python # """ # Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) diff --git a/hyperion/bin/plot_embedding_tsne_per_class.py b/hyperion/bin/plot_embedding_tsne_per_class.py index 14da4d07..08e4ef70 100755 --- a/hyperion/bin/plot_embedding_tsne_per_class.py +++ b/hyperion/bin/plot_embedding_tsne_per_class.py @@ -13,12 +13,6 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from hyperion.hyp_defs import config_logger -from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.np.clustering import AHC -from hyperion.np.transforms import PCA, LNorm, SklTSNE -from hyperion.utils import SegmentSet -from hyperion.utils.math_funcs import cosine_scoring from jsonargparse import ( ActionConfigFile, ActionParser, @@ -27,6 +21,13 @@ namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.clustering import AHC +from hyperion.np.transforms import PCA, LNorm, SklTSNE +from hyperion.utils import SegmentSet +from hyperion.utils.math_funcs import cosine_scoring + matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] @@ -50,7 +51,6 @@ def plot_embedding_tsne( output_dir, **kwargs, ): - output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) logging.info("loading data") @@ -92,7 +92,7 @@ def plot_embedding_tsne( if do_ahc: if cluster_tsne: # in the low dim space, we cannot use cosine scoring - x2 = np.sum(x_tsne ** 2, axis=1)[:, None] + x2 = np.sum(x_tsne**2, axis=1)[:, None] d2 = x2 - 2 * np.dot(x_tsne, x_tsne.T) + x2.T d2 = np.clip(d2, a_min=0, a_max=None) scores = -np.sqrt(d2) @@ -140,8 +140,7 @@ def plot_embedding_tsne( train_segs.save(output_dir / "segments.csv") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Projects embeddings using TSNE, " @@ -194,3 +193,7 @@ def plot_embedding_tsne( logging.debug(args) plot_embedding_tsne(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/prepare_data.py b/hyperion/bin/prepare_data.py index f6723c7d..dd1bde27 100755 --- a/hyperion/bin/prepare_data.py +++ b/hyperion/bin/prepare_data.py @@ -6,8 +6,6 @@ import logging from pathlib import Path -from hyperion.data_prep import DataPrep -from hyperion.hyp_defs import config_logger from jsonargparse import ( ActionConfigFile, ActionParser, @@ -15,6 +13,9 @@ namespace_to_dict, ) +from hyperion.data_prep import DataPrep +from hyperion.hyp_defs import config_logger + def make_parser(data_prep_class): parser = ArgumentParser() @@ -22,7 +23,7 @@ def make_parser(data_prep_class): return parser -if __name__ == "__main__": +def main(): parser = ArgumentParser( description="""Prepares a dataset into relational database tables""" ) @@ -39,3 +40,7 @@ def make_parser(data_prep_class): args = namespace_to_dict(args)[args.subcommand] data_prep = data_prep_class(**args) data_prep.prepare() + + 
+if __name__ == "__main__": + main() diff --git a/hyperion/bin/preprocess_audio_files.py b/hyperion/bin/preprocess_audio_files.py index bda9a503..5e98a477 100755 --- a/hyperion/bin/preprocess_audio_files.py +++ b/hyperion/bin/preprocess_audio_files.py @@ -10,11 +10,6 @@ import time import numpy as np -from hyperion.hyp_defs import config_logger -from hyperion.io import AudioWriter as Writer -from hyperion.io import SequentialAudioReader as AR -from hyperion.io import VADReaderFactory as VRF -from hyperion.utils import Utt2Info from jsonargparse import ( ActionConfigFile, ActionParser, @@ -23,6 +18,12 @@ ) from scipy import ndimage, signal +from hyperion.hyp_defs import config_logger +from hyperion.io import AudioWriter as Writer +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.utils import Utt2Info + def resample_vad(vad, length): step = (len(vad) - 1) / length @@ -59,7 +60,6 @@ def process_audio_files( remove_dc_offset=False, **kwargs, ): - input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) logging.info(f"input_args={input_args}") @@ -72,7 +72,6 @@ def process_audio_files( with AR(recordings_file, **input_args) as reader, Writer( output_path, output_recordings_file, **output_args ) as writer: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) @@ -147,8 +146,7 @@ def process_audio_files( u2td.save(write_time_durs_spec) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Process pipes in wav.scp file, optionally applies vad and save all audios in the same format" ) @@ -204,3 +202,7 @@ def process_audio_files( logging.debug(args) process_audio_files(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/split_dataset_into_trials_and_cohort.py b/hyperion/bin/split_dataset_into_trials_and_cohort.py index 24ec10bf..50c2f1f2 100755 --- a/hyperion/bin/split_dataset_into_trials_and_cohort.py +++ b/hyperion/bin/split_dataset_into_trials_and_cohort.py @@ -6,8 +6,6 @@ import logging from pathlib import Path -from hyperion.hyp_defs import config_logger -from hyperion.utils import Dataset from jsonargparse import ( ActionConfigFile, ActionParser, @@ -16,8 +14,11 @@ namespace_to_dict, ) -if __name__ == "__main__": +from hyperion.hyp_defs import config_logger +from hyperion.utils import Dataset + +def main(): parser = ArgumentParser( description=( """Split speakers in dataset into test speaker to create ASV trials and @@ -66,3 +67,7 @@ trials_dataset, cohort_dataset = dataset.split_into_trials_and_cohort(**args) trials_dataset.save(trials_dir) cohort_dataset.save(cohort_dir) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_qmf.py b/hyperion/bin/train_qmf.py index a97e8a5f..42aabe0c 100755 --- a/hyperion/bin/train_qmf.py +++ b/hyperion/bin/train_qmf.py @@ -6,25 +6,25 @@ Trains calibration for SRE18 tel condition """ -import sys +import logging import os +import sys +import time +from pathlib import Path + +import numpy as np from jsonargparse import ( - ArgumentParser, ActionConfigFile, ActionParser, + ArgumentParser, namespace_to_dict, ) -import time -import logging -from pathlib import Path -import numpy as np - -from hyperion.hyp_defs import float_cpu, config_logger -from hyperion.utils.trial_scores import TrialScores -from hyperion.utils.trial_key import TrialKey -from hyperion.np.metrics import compute_act_dcf, compute_min_dcf 
+from hyperion.hyp_defs import config_logger, float_cpu from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.utils.trial_key import TrialKey +from hyperion.utils.trial_scores import TrialScores def print_q_stats(scr, q_names): @@ -110,7 +110,7 @@ def train_qmf( scr_out.save(output_file) -if __name__ == "__main__": +def main(): parser = ArgumentParser(description="Trains QMF calibration") parser.add_argument("--score-file", required=True) @@ -133,3 +133,7 @@ def train_qmf( logging.debug(args) train_qmf(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2rnn_transducer.py b/hyperion/bin/train_wav2rnn_transducer.py index 8930b299..c00c4633 100755 --- a/hyperion/bin/train_wav2rnn_transducer.py +++ b/hyperion/bin/train_wav2rnn_transducer.py @@ -14,15 +14,20 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory from hyperion.torch.models import Wav2RNNRNNTransducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from torch.nn.utils.rnn import pad_sequence model_dict = { "rnn_rnn_transducer": Wav2RNNRNNTransducer, @@ -72,14 +77,12 @@ def init_data(partition, rank, num_gpus, **kwargs): num_workers = data_kwargs["data_loader"]["num_workers"] num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ({ - "num_workers": num_workers_per_gpu, - "pin_memory": True - } if num_gpus > 0 else {}) - data_loader = torch.utils.data.DataLoader(dataset, - batch_sampler=sampler, - **largs, - collate_fn=transducer_collate) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) return data_loader @@ -97,7 +100,6 @@ def init_model(blank_id, vocab_size, rank, model_class, **kwargs): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -105,8 +107,8 @@ def train_model(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) set_float_cpu("float32") - #torch.backends.cudnn.deterministic = True - #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = False torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) @@ -115,8 +117,11 @@ def train_model(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(train_loader.dataset.sp.piece_to_id(""), - train_loader.dataset.sp.get_piece_size(), **kwargs) + model = init_model( + train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + **kwargs, + ) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: @@ -159,8 +164,7 @@ def make_parser(model_class): help="num_workers of data loader", ) data_parser = ArgumentParser(prog="") - data_parser.add_argument("--train", - action=ActionParser(parser=train_parser)) + 
data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) @@ -176,34 +180,27 @@ def make_parser(model_class): type=str, ) - parser.link_arguments("data.train.data_loader.num_workers", - "data.val.data_loader.num_workers") + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) - parser.link_arguments("data.train.dataset.bpe_model", - "data.val.dataset.bpe_model") + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") model_class.add_class_args(parser, prefix="model") - Trainer.add_class_args(parser, - prefix="trainer", - train_modes=model_class.valid_train_modes()) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) ddp.add_ddp_args(parser) - parser.add_argument("--seed", - type=int, - default=1123581321, - help="random seed") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) return parser -if __name__ == "__main__": - parser = ArgumentParser( - description="Train RNN Transducer model from audio files") +def main(): + parser = ArgumentParser(description="Train RNN Transducer model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() @@ -232,3 +229,7 @@ def make_parser(model_class): # torch docs recommend using forkserver # multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2vec2rnn_transducer.py b/hyperion/bin/train_wav2vec2rnn_transducer.py index 7018c406..5b802454 100755 --- a/hyperion/bin/train_wav2vec2rnn_transducer.py +++ b/hyperion/bin/train_wav2vec2rnn_transducer.py @@ -14,23 +14,29 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory -from hyperion.torch.models import (HFWav2Vec2ConformerV1RNNTransducer, - HFWav2Vec2RNNRNNTransducer, - HFWav2Vec2RNNTransducer) +from hyperion.torch.models import ( + HFWav2Vec2ConformerV1RNNTransducer, + HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, +) from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from torch.nn.utils.rnn import pad_sequence model_dict = { "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, - "hf_wav2vec2conformer_v1_rnn_transducer": - HFWav2Vec2ConformerV1RNNTransducer, + "hf_wav2vec2conformer_v1_rnn_transducer": HFWav2Vec2ConformerV1RNNTransducer, # "hf_hubert2rnn_transducer": HFWav2Vec2RNNTransducer, # "hf_hubert2rnn_rnn_transducer": Hubert2RNNRNNTransducer, # "hf_wavlm2rnn_transducer": HFHubert2RNNTransducer, @@ -88,14 +94,12 @@ def init_data(partition, rank, num_gpus, **kwargs): num_workers = 
data_kwargs["data_loader"]["num_workers"] num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ({ - "num_workers": num_workers_per_gpu, - "pin_memory": True - } if num_gpus > 0 else {}) - data_loader = torch.utils.data.DataLoader(dataset, - batch_sampler=sampler, - **largs, - collate_fn=transducer_collate) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) return data_loader @@ -113,7 +117,6 @@ def init_model(blank_id, vocab_size, rank, model_class, **kwargs): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -121,8 +124,8 @@ def train_model(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) set_float_cpu("float32") - #torch.backends.cudnn.deterministic = True - #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = False torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) @@ -137,13 +140,16 @@ def train_model(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(train_loader.dataset.sp.piece_to_id(""), - train_loader.dataset.sp.get_piece_size(), **kwargs) + model = init_model( + train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + **kwargs, + ) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {} #{"acc": CategoricalAccuracy()} + metrics = {} # {"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, @@ -181,8 +187,7 @@ def make_parser(model_class): help="num_workers of data loader", ) data_parser = ArgumentParser(prog="") - data_parser.add_argument("--train", - action=ActionParser(parser=train_parser)) + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) @@ -198,34 +203,29 @@ def make_parser(model_class): type=str, ) - parser.link_arguments("data.train.data_loader.num_workers", - "data.val.data_loader.num_workers") + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) - parser.link_arguments("data.train.dataset.bpe_model", - "data.val.dataset.bpe_model") + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") model_class.add_class_args(parser, prefix="model") - Trainer.add_class_args(parser, - prefix="trainer", - train_modes=model_class.valid_train_modes()) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) ddp.add_ddp_args(parser) - parser.add_argument("--seed", - type=int, - default=1123581321, - help="random seed") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) return parser -if __name__ == "__main__": +def main(): parser = ArgumentParser( - description="Train Wav2Vec2Transducer model from audio files") + description="Train Wav2Vec2Transducer model from audio files" + ) 
parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() @@ -254,3 +254,7 @@ def make_parser(model_class): # torch docs recommend using forkserver # multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index 55f3b996..77a22bb8 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -14,6 +14,14 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory @@ -21,9 +29,6 @@ from hyperion.torch.models import HFWav2Vec2Transducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from torch.nn.utils.rnn import pad_sequence model_dict = { "hf_wav2vec2transducer": HFWav2Vec2Transducer, @@ -73,14 +78,12 @@ def init_data(partition, rank, num_gpus, **kwargs): num_workers = data_kwargs["data_loader"]["num_workers"] num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ({ - "num_workers": num_workers_per_gpu, - "pin_memory": True - } if num_gpus > 0 else {}) - data_loader = torch.utils.data.DataLoader(dataset, - batch_sampler=sampler, - **largs, - collate_fn=transducer_collate) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) return data_loader @@ -98,7 +101,6 @@ def init_model(blank_id, vocab_size, rank, model_class, **kwargs): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -106,8 +108,8 @@ def train_model(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) set_float_cpu("float32") - #torch.backends.cudnn.deterministic = True - #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = False torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) @@ -122,13 +124,16 @@ def train_model(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(train_loader.dataset.sp.piece_to_id(""), - train_loader.dataset.sp.get_piece_size(), **kwargs) + model = init_model( + train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + **kwargs, + ) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {} #{"acc": CategoricalAccuracy()} + metrics = {} # {"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, @@ -166,8 +171,7 @@ def make_parser(model_class): help="num_workers of data loader", ) data_parser = ArgumentParser(prog="") - data_parser.add_argument("--train", - action=ActionParser(parser=train_parser)) + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) 
parser.add_argument("--data", action=ActionParser(parser=data_parser)) @@ -183,34 +187,29 @@ def make_parser(model_class): type=str, ) - parser.link_arguments("data.train.data_loader.num_workers", - "data.val.data_loader.num_workers") + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) - parser.link_arguments("data.train.dataset.bpe_model", - "data.val.dataset.bpe_model") + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") model_class.add_class_args(parser, prefix="model") - Trainer.add_class_args(parser, - prefix="trainer", - train_modes=model_class.valid_train_modes()) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) ddp.add_ddp_args(parser) - parser.add_argument("--seed", - type=int, - default=1123581321, - help="random seed") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) return parser -if __name__ == "__main__": +def main(): parser = ArgumentParser( - description="Train Wav2Vec2Transducer model from audio files") + description="Train Wav2Vec2Transducer model from audio files" + ) parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() @@ -239,3 +238,7 @@ def make_parser(model_class): # torch docs recommend using forkserver # multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index f132a35c..e6dd3d3e 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -14,6 +14,13 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory @@ -25,12 +32,6 @@ ) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, @@ -40,7 +41,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -83,7 +83,6 @@ def init_model(num_classes, rank, model_class, **kwargs): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -105,7 +104,11 @@ def train_model(gpu_id, args): logging.info(f"trainer args={trn_args}") metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( - model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, ) trainer.load_last_checkpoint() trainer.fit(train_loader, val_loader) @@ -162,8 +165,7 @@ def make_parser(model_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Train Wav2Vec2XVector model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -193,3 +195,7 @@ def 
make_parser(model_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2xvector.py b/hyperion/bin/train_wav2xvector.py index ddf292b8..7373a338 100755 --- a/hyperion/bin/train_wav2xvector.py +++ b/hyperion/bin/train_wav2xvector.py @@ -9,6 +9,13 @@ from pathlib import Path import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory @@ -23,12 +30,6 @@ # from hyperion.torch.models import TransformerXVectorV1 as TFXVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) xvec_dict = { "resnet": RXVec, @@ -41,7 +42,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -84,7 +84,6 @@ def init_xvector(num_classes, rank, xvec_class, **kwargs): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -107,7 +106,11 @@ def train_xvec(gpu_id, args): logging.info("trainer args={}".format(trn_args)) metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( - model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, ) trainer.load_last_checkpoint() trainer.fit(train_loader, val_loader) @@ -164,8 +167,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Train Wav2XVector from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -194,3 +196,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_xvector_from_feats.py b/hyperion/bin/train_xvector_from_feats.py index 71bba080..a2acdf4c 100755 --- a/hyperion/bin/train_xvector_from_feats.py +++ b/hyperion/bin/train_xvector_from_feats.py @@ -13,6 +13,13 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.data import FeatSeqDataset as SD @@ -25,8 +32,6 @@ from hyperion.torch.models import TransformerXVectorV1 as TFXVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) xvec_dict = { "resnet": RXVec, @@ -39,7 +44,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] sd_args = SD.filter_args(**kwargs["dataset"]) sampler_args = Sampler.filter_args(**kwargs["sampler"]) @@ -80,7 +84,6 @@ def init_xvector(num_classes, rank, xvec_class, **kwargs): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -164,8 +167,7 @@ def make_parser(xvec_class): return parser -if 
__name__ == "__main__": - +def main(): parser = ArgumentParser(description="Train XVector from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -196,3 +198,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py index b2e36cac..c3f6170d 100755 --- a/hyperion/bin/train_xvector_from_wav.py +++ b/hyperion/bin/train_xvector_from_wav.py @@ -9,6 +9,13 @@ from pathlib import Path import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory @@ -22,8 +29,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) xvec_dict = { "resnet": RXVec, @@ -36,7 +41,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -90,7 +94,6 @@ def init_xvector(num_classes, rank, xvec_class, **kwargs): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -176,8 +179,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Train XVector from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -206,3 +208,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/io/__init__.py b/hyperion/io/__init__.py index 14b1b35f..aa5ac653 100644 --- a/hyperion/io/__init__.py +++ b/hyperion/io/__init__.py @@ -16,10 +16,10 @@ from .hyp_data_reader import * from .hyp_data_writer import * from .kaldi_data_reader import * -from .packed_audio_reader import (RandomAccessPackedAudioReader, - SequentialPackedAudioReader) +from .packed_audio_reader import ( + RandomAccessPackedAudioReader, + SequentialPackedAudioReader, +) from .packed_audio_writer import PackedAudioWriter from .segment_vad_reader import SegmentVADReader from .vad_rw_factory import VADReaderFactory - -# from .queues import * diff --git a/hyperion/np/pdfs/mixtures/exp_family_mixture.py b/hyperion/np/pdfs/mixtures/exp_family_mixture.py index 2186522e..d1cf7f68 100644 --- a/hyperion/np/pdfs/mixtures/exp_family_mixture.py +++ b/hyperion/np/pdfs/mixtures/exp_family_mixture.py @@ -8,7 +8,6 @@ from ....hyp_defs import float_cpu from ....utils.math_funcs import logsumexp, softmax -from ....utils.queues import GeneratorQueue from ..core import PDF @@ -110,86 +109,6 @@ def fit( else: return elbo, elbo / x.shape[0], elbo_val, elbo_val / x.shape[0] - def fit_generator( - self, - generator, - train_steps, - epochs=10, - val_data=None, - val_steps=0, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - ): - """Trains the model from data read by a generator function. - This function is deprecated. 
- - Args: - generator: train data generator function returning a tuple - (x, u_x, sample_weight), (x, u_x), (x, sample_weight) or x. - train_steps: number of training steps / epoch - epochs: number of epochs. - val_data: val. data generator function returning a tuple - (x, u_x, sample_weight), (x, u_x), (x, sample_weight) or x. - val_steps: number of validation steps / epoch - max_queue_size: max. size of the generator queue. - workers: number of workers in the generator. - use_multiprocessing: use multi-processing in the generator queue. - - Returns: - log p(X) of the training data. - log p(x) per sample. - log p(X) of the val. data, if present. - log p(x) of the val. data per sample, if present. - """ - - do_validation = bool(val_data) - val_gen = hasattr(val_data, "next") or hasattr(val_data, "__next__") - if val_gen and not val_steps: - raise ValueError( - "When using a generator for validation data, " - "you must specify a value for " - "`val_steps`." - ) - - if do_validation and not val_gen: - x, u_x_val, sample_weight_val = self.tuple2data(val_data) - log_h_val = self.accum_log_h(x, sample_weight_val) - - elbo = np.zeros((epochs,), dtype=float_cpu()) - elbo_val = np.zeros((epochs,), dtype=float_cpu()) - for epoch in range(epochs): - N, u_x, log_h = self.Estep_generator( - generator, - train_steps, - return_log_h=True, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - ) - - self.Mstep(N, u_x) - elbo[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h) - - if val_data is not None: - if val_gen: - N, u_x, log_h_val = self.Estep_generator( - val_data, - train_steps, - return_log_h=True, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - ) - else: - N, u_x = self.Estep(val_data, u_x_val, sample_weight_val) - elbo_val[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h_val) - - if val_data is None: - return elbo, elbo / x.shape[0] - else: - return elbo, elbo / x.shape[0], elbo_val, elbo_val / x.shape[0] - def log_h(self, x): """Computes log h(x) of the exp. family.""" return 0 @@ -404,7 +323,6 @@ def _accum_suff_stats_segments_prob_1batch( def _accum_suff_stats_segments_prob_nbatches( self, x, prob, sample_weight, batch_size ): - sw_i = None for i1 in range(0, x.shape[0], batch_size): i2 = np.minimum(i1 + batch_size, x.shape[0]) @@ -458,7 +376,6 @@ def accum_suff_stats_sorttime( def _accum_suff_stats_sorttime_1batch( self, x, frame_length, frame_shift, u_x=None, sample_weight=None ): - K = len(self.pi) num_frames = x.shape[0] num_segments = int(np.floor((num_frames - frame_length) / frame_shift + 1)) @@ -494,7 +411,6 @@ def _accum_suff_stats_sorttime_1batch( def _accum_suff_stats_sorttime_nbatches( self, x, frame_length, frame_shift, sample_weight, batch_size ): - K = len(self.pi) num_frames = x.shape[0] num_segments = int(np.floor((num_frames - frame_length) / frame_shift + 1)) @@ -539,65 +455,6 @@ def Estep(self, x, u_x=None, sample_weight=None, batch_size=None): """ return self.accum_suff_stats(x, u_x, sample_weight, batch_size) - def Estep_generator( - self, - generator, - num_steps, - return_log_h, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - ): - """Expectation step, where data is read from a generator function. - - Args: - generator: data generator function returning a tuple - (x, u_x, sample_weight), (x, u_x), (x, sample_weight) or x. - num_steps: number of steps / epoch - return_log_h: returns accumlated log h(x). - max_queue_size: max. size of the generator queue. 
- workers: number of workers in the generator. - use_multiprocessing: use multi-processing in the generator queue. - - Returns: - N zero order sufficient statistics (number of samples). - Accumlated sufficient statistics \sum u(x). - Accumlated log h(x) (optional). - """ - wait_time = 0.01 # in secs - queue = None - N = None - acc_u_x = None - log_h = 0 - try: - queue = GeneratorQueue( - generator, use_multiprocessing=use_multiprocessing, wait_time=wait_time - ) - queue.start(workers=workers, max_queue_size=max_queue_size) - queue_generator = queue.get() - - cur_step = 0 - for cur_step in range(num_steps): - data = next(queue_generator) - x, u_x, sample_weight = self.tuple2data(data) - N_i, u_x_i = self.Estep(x, u_x, sample_weight) - if return_log_h: - log_h += self.accum_log_h(x) - if cur_step == 0: - N = N_i - acc_u_x = u_x_i - else: - N += N_i - acc_u_x += u_x_i - finally: - if queue is not None: - queue.stop() - - if return_log_h: - return N, acc_u_x, log_h - else: - return N, acc_u_x - def sum_suff_stats(self, N, u_x): """Sums suff. stats from muttiple sub-processes. @@ -754,28 +611,6 @@ def get_config(self): base_config = super(ExpFamilyMixture, self).get_config() return dict(list(base_config.items()) + list(config.items())) - @staticmethod - def tuple2data(data): - if isinstance(data, tuple): - if len(data) == 2: - x, u_x = data - if u_x.ndim == 2: - sample_weight = None - elif u_x.ndim == 1: - sample_weight = u_x - u_x = None - else: - raise ValueError("Generator output: " + str(data)) - elif len(data) == 3: - x, u_x, sample_weight = data - else: - raise ValueError("Generator output: " + str(data)) - else: - x = data - u_x = None - sample_weight = None - return x, u_x, sample_weight - @staticmethod def compute_A_nat(eta): """Computes A_theta from the natural param.""" diff --git a/hyperion/torch/lr_schedulers/red_lr_on_plateau.py b/hyperion/torch/lr_schedulers/red_lr_on_plateau.py index 7a2e82f8..3f7b2ec7 100644 --- a/hyperion/torch/lr_schedulers/red_lr_on_plateau.py +++ b/hyperion/torch/lr_schedulers/red_lr_on_plateau.py @@ -7,7 +7,11 @@ from functools import partial import torch -from torch._six import inf + +try: + from torch import inf +except: + from torch._six import inf from .lr_scheduler import LRScheduler diff --git a/hyperion/utils/queues.py b/hyperion/utils/queues.py deleted file mode 100644 index 8bfd0166..00000000 --- a/hyperion/utils/queues.py +++ /dev/null @@ -1,287 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import copy -import multiprocessing -import threading -import time -import warnings -from abc import abstractmethod - -import numpy as np -import six - -try: - import queue -except ImportError: - import Queue as queue - - -class SequenceQueue(object): - """Base class to enqueue inputs. - - The task of an Queue is to use parallelism to speed up preprocessing. - This is done with processes or threads. - - # Examples - - ```python - enqueuer = SequenceQueue(...) - enqueuer.start() - datas = enqueuer.get() - for data in datas: - # Use the inputs; training, evaluating, predicting. - # ... stop sometime. - enqueuer.close() - ``` - - The `enqueuer.get()` should be an infinite stream of datas. - - """ - - @abstractmethod - def is_running(self): - raise NotImplemented - - @abstractmethod - def start(self, workers=1, max_queue_size=10): - """Starts the handler's workers. 
- - # Arguments - workers: number of worker threads - max_queue_size: queue size - (when full, threads could block on `put()`). - """ - raise NotImplemented - - @abstractmethod - def stop(self, timeout=None): - """Stop running threads and wait for them to exit, if necessary. - - Should be called by the same thread which called start(). - - # Arguments - timeout: maximum time to wait on thread.join() - """ - raise NotImplemented - - @abstractmethod - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. - - # Returns - Generator yielding tuples `(inputs, targets)` - or `(inputs, targets, sample_weights)`. - """ - raise NotImplemented - - -class OrderedQueue(SequenceQueue): - """Builds a Queue from a Sequence. - - Used in `fit_generator`, `evaluate_generator`, `predict_generator`. - - # Arguments - sequence: A `keras.utils.data_utils.Sequence` object. - use_multiprocessing: use multiprocessing if True, otherwise threading - scheduling: Sequential querying of datas if 'sequential', random otherwise. - """ - - def __init__(self, sequence, use_multiprocessing=False, scheduling="sequential"): - self.sequence = sequence - self.use_multiprocessing = use_multiprocessing - self.scheduling = scheduling - self.workers = 0 - self.executor = None - self.queue = None - self.run_thread = None - self.stop_signal = None - - def is_running(self): - return self.stop_signal is not None and not self.stop_signal.is_set() - - def start(self, workers=1, max_queue_size=10): - """Start the handler's workers. - - # Arguments - workers: number of worker threads - max_queue_size: queue size - (when full, workers could block on `put()`) - """ - if self.use_multiprocessing: - self.executor = multiprocessing.Pool(workers) - else: - self.executor = ThreadPool(workers) - self.queue = queue.Queue(max_queue_size) - self.stop_signal = threading.Event() - self.run_thread = threading.Thread(target=self._run) - self.run_thread.daemon = True - self.run_thread.start() - - def _run(self): - """Function to submit request to the executor and queue the `Future` objects.""" - sequence = list(range(len(self.sequence))) - while True: - if self.scheduling is not "sequential": - random.shuffle(sequence) - for i in sequence: - if self.stop_signal.is_set(): - return - self.queue.put( - self.executor.apply_async(get_index, (self.sequence, i)), block=True - ) - - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. - - # Returns - Generator yielding tuples (inputs, targets) - or (inputs, targets, sample_weights) - """ - try: - while self.is_running(): - inputs = self.queue.get(block=True).get() - if inputs is not None: - yield inputs - except Exception as e: - self.stop() - raise StopIteration(e) - - def stop(self, timeout=None): - """Stops running threads and wait for them to exit, if necessary. - - Should be called by the same thread which called `start()`. - - # Arguments - timeout: maximum time to wait on `thread.join()` - """ - self.stop_signal.set() - with self.queue.mutex: - self.queue.queue.clear() - self.queue.unfinished_tasks = 0 - self.queue.not_full.notify() - self.executor.close() - self.executor.join() - self.run_thread.join(timeout) - - -class GeneratorQueue(SequenceQueue): - """Builds a queue out of a data generator. - - Used in `fit_generator`, `evaluate_generator`, `predict_generator`. 
- - # Arguments - generator: a generator function which endlessly yields data - use_multiprocessing: use multiprocessing if True, otherwise threading - wait_time: time to sleep in-between calls to `put()` - random_seed: Initial seed for workers, - will be incremented by one for each workers. - """ - - def __init__( - self, generator, use_multiprocessing=False, wait_time=0.05, random_seed=None - ): - self.wait_time = wait_time - self._generator = generator - self._use_multiprocessing = use_multiprocessing - self._threads = [] - self._stop_event = None - self.queue = None - self.random_seed = random_seed - - def start(self, workers=1, max_queue_size=10): - """Kicks off threads which add data from the generator into the queue. - - # Arguments - workers: number of worker threads - max_queue_size: queue size - (when full, threads could block on `put()`) - """ - - def data_generator_task(): - while not self._stop_event.is_set(): - try: - if self._use_multiprocessing or self.queue.qsize() < max_queue_size: - generator_output = next(self._generator) - self.queue.put(generator_output) - else: - time.sleep(self.wait_time) - except Exception: - self._stop_event.set() - raise - - try: - if self._use_multiprocessing: - self.queue = multiprocessing.Queue(maxsize=max_queue_size) - self._stop_event = multiprocessing.Event() - else: - self.queue = queue.Queue() - self._stop_event = threading.Event() - - for _ in range(workers): - if self._use_multiprocessing: - # Reset random seed else all children processes - # share the same seed - np.random.seed(self.random_seed) - thread = multiprocessing.Process(target=data_generator_task) - thread.daemon = True - if self.random_seed is not None: - self.random_seed += 1 - else: - thread = threading.Thread(target=data_generator_task) - self._threads.append(thread) - thread.start() - except: - self.stop() - raise - - def is_running(self): - return self._stop_event is not None and not self._stop_event.is_set() - - def stop(self, timeout=None): - """Stops running threads and wait for them to exit, if necessary. - - Should be called by the same thread which called `start()`. - - # Arguments - timeout: maximum time to wait on `thread.join()`. - """ - if self.is_running(): - self._stop_event.set() - - for thread in self._threads: - if thread.is_alive(): - if self._use_multiprocessing: - thread.terminate() - else: - thread.join(timeout) - - if self._use_multiprocessing: - if self.queue is not None: - self.queue.close() - - self._threads = [] - self._stop_event = None - self.queue = None - - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. - - # Returns - A generator - """ - while self.is_running(): - if not self.queue.empty(): - inputs = self.queue.get() - if inputs is not None: - yield inputs - else: - time.sleep(self.wait_time) diff --git a/setup.py b/setup.py index 9780586d..e1fb35cc 100644 --- a/setup.py +++ b/setup.py @@ -15,15 +15,26 @@ # limitations under the License. 
# -import setuptools from pathlib import Path +import setuptools + project_root = Path(__file__).parent -with open(project_root / "apps.txt") as f: - apps = f.read().splitlines() +# with open(project_root / "apps.txt") as f: +# apps = f.read().splitlines() -apps = [str(project_root / "hyperion" / "bin" / app) for app in apps] +# apps = [str(project_root / "hyperion" / "bin" / app) for app in apps] +binaries = (project_root / "hyperion" / "bin").glob("*.py") +console_scripts = [] +for binary in binaries: + stem = binary.stem + script_name = stem.replace("hyperion_", "").replace("_", "-") + if script_name[0] == "-": + continue + module = f"hyperion.bin.{stem}:main" + console_script = f"hyperion-{script_name} = {module}" + console_scripts.append(console_script) with open(project_root / "requirements.txt") as f: requirements = f.read().splitlines() @@ -77,10 +88,22 @@ def get_version(): "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ], python_requires=">=3.7", install_requires=requirements, - scripts=apps, + entry_points={ + "console_scripts": console_scripts, + } + # entry_points={ + # "console_scripts": [ + # "hyperion-prepare-data = hyperion.bin.prepare_data:main", + # "hyperion-train-wav2xvector = hyperion.bin.train_wav2xvector:main", + # ] + # }, + # scripts=apps, ) From 610547682764789844af201c1a16bccc6b8d34ab Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Sun, 10 Sep 2023 20:01:36 -0400 Subject: [PATCH 77/89] make it work with cuda 11 --- README.md | 11 +++++++++-- hyp_utils/conda_env.sh | 32 +++++++++++++++++--------------- hyperion/torch/utils/ddp.py | 13 +++++-------- 3 files changed, 31 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 4838157b..d56406d7 100644 --- a/README.md +++ b/README.md @@ -26,14 +26,21 @@ The full API is described in the documentation page [https://hyperion-ml.readthe ### Prerequisites We use anaconda or miniconda, though you should be able to make it work in other python distributions - To start, you should create a new enviroment and install PyTorch>=1.9, (older versions are not supported any longer) e.g.: + To start, you should create a new environment and install PyTorch: ``` conda create --name ${your_env} python=3.11 conda activate ${your_env} -conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=10.2 -c pytorch conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia ``` +For systems with cuda 10.2 driver: +``` +conda create --name ${your_env} python=3.10 +conda activate ${your_env} +conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=10.2 -c pytorch +``` + + ### Installing Hyperion - First, clone the repo: diff --git a/hyp_utils/conda_env.sh b/hyp_utils/conda_env.sh index 8d5c67c1..90ffa369 100755 --- a/hyp_utils/conda_env.sh +++ b/hyp_utils/conda_env.sh @@ -52,22 +52,24 @@ fi # echo "LRU_CACHE_CAPACITY=$LRU_CACHE_CAPACITY" conda activate $conda_env -command="python" +command="" if [ $num_gpus -gt 0 ];then - # set CUDA_VISIBLE_DEVICES - if [ !
-z "$SGE_HGR_gpu" ]; then - echo "SGE_HGR_gpu=$SGE_HGR_gpu" - export CUDA_VISIBLE_DEVICES=$(echo $SGE_HGR_gpu | sed 's@ @,@g') - else - # seach location of free-gpu program in the PATH or hyp_utils directory - free_gpu=$(which free-gpu) - if [ -z "$free_gpu" ];then - free_gpu=$(which hyp_utils/free-gpu) - fi - - if [ ! -z "$free_gpu" ];then - # if free-gpu found set env var, otherwise we assume that you can use any gpu - export CUDA_VISIBLE_DEVICES=$($free_gpu -n $num_gpus) + if [ -z "$CUDA_VISIBLE_DEVICES" ];then + # set CUDA_VISIBLE_DEVICES + if [ ! -z "$SGE_HGR_gpu" ]; then + echo "SGE_HGR_gpu=$SGE_HGR_gpu" + export CUDA_VISIBLE_DEVICES=$(echo $SGE_HGR_gpu | sed 's@ @,@g') + else + # seach location of free-gpu program in the PATH or hyp_utils directory + free_gpu=$(which free-gpu) + if [ -z "$free_gpu" ];then + free_gpu=$(which hyp_utils/free-gpu) + fi + + if [ ! -z "$free_gpu" ];then + # if free-gpu found set env var, otherwise we assume that you can use any gpu + export CUDA_VISIBLE_DEVICES=$($free_gpu -n $num_gpus) + fi fi fi echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" diff --git a/hyperion/torch/utils/ddp.py b/hyperion/torch/utils/ddp.py index 1aefb3d4..4f006c0a 100644 --- a/hyperion/torch/utils/ddp.py +++ b/hyperion/torch/utils/ddp.py @@ -6,19 +6,16 @@ import logging import os -from fairscale.nn.data_parallel import \ - FullyShardedDataParallel as FullyShardedDDP -from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP - import torch import torch.distributed as dist import torch.nn as nn +from fairscale.nn.data_parallel import FullyShardedDataParallel as FullyShardedDDP +from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP from .devices import open_device def add_ddp_args(parser): - parser.add_argument( "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" ) @@ -50,7 +47,6 @@ def filter_ddp_args(**kwargs): def ddp_init( gpu_id, num_gpus, node_id=0, num_nodes=1, master_addr="localhost", master_port=None ): - rank = node_id * num_gpus + gpu_id world_size = num_nodes * num_gpus @@ -62,15 +58,16 @@ def ddp_init( os.environ["MASTER_PORT"] = master_port logging.info( - f"init ddp rank={rank} world_size={world_size} master={master_addr}:{master_port} gpu_id={gpu_id}" + f"init ddp rank={rank} world_size={world_size} master={master_addr}:{master_port} gpu_id={gpu_id}" ) dist.init_process_group( "nccl", rank=rank, world_size=world_size, ) + torch.cuda.set_device(rank) torch.tensor([0]).to(gpu_id) - device = torch.device('cuda', gpu_id) + device = torch.device("cuda", gpu_id) return device, rank, world_size # return gpu_id, rank, world_size From 392cd30f6bae594e9121bde48379aae787d16e6f Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 11 Sep 2023 11:41:35 -0400 Subject: [PATCH 78/89] started vox/v2.1 recipe and fix some readmes --- egs/voxceleb/v1.1/README.md | 2 + egs/voxceleb/v1.2/README.md | 249 ++++++-------- .../train_cfwseresnet34_xvec_stage1_v3.0.yaml | 72 ++++ .../train_cfwseresnet34_xvec_stage2_v3.0.yaml | 69 ++++ .../train_cwseresnet34_xvec_stage1_v3.0.yaml | 72 ++++ .../train_cwseresnet34_xvec_stage2_v3.0.yaml | 69 ++++ .../train_fwseresnet34_xvec_stage1_v3.0.yaml | 72 ++++ .../train_fwseresnet34_xvec_stage2_v3.0.yaml | 69 ++++ ...rain_idrnd_resnet100_xvec_stage1_v3.0.yaml | 73 ++++ ...rain_idrnd_resnet100_xvec_stage2_v3.0.yaml | 69 ++++ .../conf/train_resnet34_xvec_stage1_v3.0.yaml | 71 ++++ .../conf/train_resnet34_xvec_stage2_v3.0.yaml | 69 ++++ .../train_tseresnet34_xvec_stage1_v3.0.yaml | 72 ++++ 
.../train_tseresnet34_xvec_stage2_v3.0.yaml | 69 ++++ .../config_fbank80_stmn_cfwseresnet34.v3.0.sh | 44 +++ .../config_fbank80_stmn_cwseresnet34.v3.0.sh | 45 +++ .../config_fbank80_stmn_fwseresnet34.v3.0.sh | 44 +++ ...onfig_fbank80_stmn_idrnd_resnet100.v3.0.sh | 44 +++ .../config_fbank80_stmn_resnet34.v3.0.sh | 44 +++ .../config_fbank80_stmn_tseresnet34.v3.0.sh | 44 +++ egs/voxceleb/v2.1/cmd.sh | 28 ++ egs/voxceleb/v2.1/conf/clsp.conf | 11 + egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf | 11 + egs/voxceleb/v2.1/conf/coe_gpu_long.conf | 13 + egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf | 11 + egs/voxceleb/v2.1/conf/coe_gpu_short.conf | 11 + egs/voxceleb/v2.1/conf/coe_gpu_v100.conf | 11 + egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml | 35 ++ ...lsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml | 59 ++++ ...c2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...baseplus9l_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...lmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...lmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml | 63 ++++ ...lmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml | 73 ++++ ...lmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...lmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml | 63 ++++ ...lmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml | 73 ++++ ...avlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml | 63 ++++ ...avlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml | 73 ++++ ...wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml | 63 ++++ ...wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml | 73 ++++ egs/voxceleb/v2.1/conf/vad_16k.yaml | 8 + ...v2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml | 45 +++ ...wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml | 44 +++ .../wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml | 44 +++ .../wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml | 45 +++ .../wavlmbaseplus_ecapatdnn512x3_v2.0.yaml | 44 +++ .../wavlmlarge12l_ecapatdnn512x3_v2.0.yaml | 45 +++ .../conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml | 44 +++ egs/voxceleb/v2.1/datapath.sh | 23 ++ egs/voxceleb/v2.1/default_config.sh | 1 + ...wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | 54 +++ ...g_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh | 54 +++ ...ig_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | 54 +++ ...fig_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | 54 +++ ...onfig_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | 54 +++ ...onfig_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | 54 +++ .../config_wavlmlarge_ecapatdnn512x3_v2.0.sh | 54 +++ egs/voxceleb/v2.1/hyp_utils | 1 + egs/voxceleb/v2.1/path.sh | 5 + egs/voxceleb/v2.1/run_001_prepare_data.sh | 46 +++ egs/voxceleb/v2.1/run_002_compute_evad.sh | 66 ++++ .../v2.1/run_003_prepare_noises_rirs.sh | 102 ++++++ .../v2.1/run_004_prepare_xvec_train_data.sh | 76 +++++ egs/voxceleb/v2.1/run_005_train_xvector.sh | 78 +++++ egs/voxceleb/v2.1/run_006_extract_xvectors.sh | 103 ++++++ egs/voxceleb/v2.1/run_007_eval_be.sh | 321 ++++++++++++++++++ egs/voxceleb/v2/README.md | 10 +- egs/voxceleb/v2/default_config.sh | 2 +- 71 files changed, 3829 insertions(+), 152 deletions(-) create mode 100644 egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml create 
mode 100644 egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh create mode 100755 egs/voxceleb/v2.1/cmd.sh create mode 100644 egs/voxceleb/v2.1/conf/clsp.conf create mode 100644 egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf create mode 100644 egs/voxceleb/v2.1/conf/coe_gpu_long.conf create mode 100644 egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf create mode 100644 egs/voxceleb/v2.1/conf/coe_gpu_short.conf create mode 100644 egs/voxceleb/v2.1/conf/coe_gpu_v100.conf create mode 100644 egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/vad_16k.yaml create mode 100644 egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml create mode 100644 
egs/voxceleb/v2.1/datapath.sh create mode 120000 egs/voxceleb/v2.1/default_config.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh create mode 120000 egs/voxceleb/v2.1/hyp_utils create mode 100755 egs/voxceleb/v2.1/path.sh create mode 100755 egs/voxceleb/v2.1/run_001_prepare_data.sh create mode 100755 egs/voxceleb/v2.1/run_002_compute_evad.sh create mode 100755 egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh create mode 100755 egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh create mode 100755 egs/voxceleb/v2.1/run_005_train_xvector.sh create mode 100755 egs/voxceleb/v2.1/run_006_extract_xvectors.sh create mode 100755 egs/voxceleb/v2.1/run_007_eval_be.sh diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md index 3b9eeaa9..efdb77c1 100644 --- a/egs/voxceleb/v1.1/README.md +++ b/egs/voxceleb/v1.1/README.md @@ -1,5 +1,7 @@ # VoxCeleb V1.1 +This recipe will be deprecated, use V1.2 + Recipe for the VoxCeleb Speaker Verification Task ## Differences w.r.t VoxCeleb V1 recipe diff --git a/egs/voxceleb/v1.2/README.md b/egs/voxceleb/v1.2/README.md index 1ee9468f..6e8ba07a 100644 --- a/egs/voxceleb/v1.2/README.md +++ b/egs/voxceleb/v1.2/README.md @@ -1,4 +1,4 @@ -# VoxCeleb V1.1 +# VoxCeleb V1.2 Recipe for the VoxCeleb Speaker Verification Task @@ -9,7 +9,7 @@ In recipe version V1: - Augmentation is performed using Kaldi scripts and wav-reverbate tool - Babble noise is created on-the-fly when computing features by mixing 3-7 single speaker files. -In this recipe: +In V1.1: - We compute speech augmentations and acoustic features are computed always on-the-fly, we don't dump any features to disk. - Augmentation is performed using Hyperin SpeechAugment class. @@ -18,6 +18,11 @@ In this recipe: which mimics the proportions of noise and RIR types, and SNRs used in the V1 or the recipe. - Babble noise is created offline by mixing 3-10 single speaker files. +In V1.2: + - Feature extractor is embedded into the pytorch model in classes derived from Wav2XVector base class. + - Kaldi format is replaced by new format based on pandas tables + - Kaldi style bash scripts are removed and replaced by python scripts + - Most python scripts are called using Hyperion entry points ## Citing @@ -30,13 +35,11 @@ In this recipe: ## Test data - Test data is VoxCeleb 1 - - We evaluate 6 conditions: + - We evaluate the 3 conditions (with cleaned lists): - VoxCeleb-O (Original): Original Voxceleb test set with 40 speakers - - Voxceleb-O-cleaned: VoxCeleb-O cleaned-up of some errors - VoxCeleb-E (Entire): List using all utterances of VoxCeleb1 - - Voxceleb-E-cleaned: VoxCeleb-E cleaned-up of some errors - VoxCeleb-H (Hard): List of hard trials between all utterances of VoxCeleb1, same gender and nationality trials.
- - Voxceleb-H-cleaned: VoxCeleb-H cleaned-up of some errors + ## Usage @@ -44,9 +47,9 @@ In this recipe: - By default it will use Light ResNet (16 base channels) - For better performance use full ResNet (64 base channels) using `config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` file as ```bash -run_011_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh -run_030_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true -run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +run_005_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +run_006_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true +run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh ``` - To train with mixed precision training use config file `config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` @@ -66,25 +69,26 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr - Creates Babble noise from MUSAN speech to be used by SpeechAugment class. - Prepares RIRs by compacting then into HDF5 files, to be used by SpeechAugment class. - - `run_010_prepare_xvec_train_data.sh` + - `run_004_prepare_xvec_train_data.sh` - Transforms all the audios that we are going to use to train the x-vector into a common format, e.g., .flac. - Removes silence from the audios - Removes utterances shorter than 4secs and speakers with less than 8 utterances. - Creates training and validation lists for x-vector training - - `run_011_train_xvector.sh` + - `run_005_train_xvector.sh` - Trains the x-vector network - - `run_030_extract_xvectors.sh` + - `run_006_extract_xvectors.sh` - Extracts x-vectors for VoxCeleb2 or VoxCeleb2+augmentation for PLDA training - Exctracts x-vectors for VoxCeleb1 test sets - - `run_040_eval_be.sh` + - `run_007_eval_be.sh` - Trains PLDA and evals PLDA and cosine scoring back-ends ## Results + ### VoxCeleb 1 Original-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | @@ -95,9 +99,28 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.68 | 0.052 | 0.088 | | | | | Cosine + AS-Norm | 0.63 | 0.049 | 0.083 | | | | | Cosine + QMF | 0.57 | 0.037 | 0.071 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | || | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.048 | 0.071 | +| | | | Cosine + AS-Norm | 0.70 | 0.039 | 0.048 | +| | | | Cosine + QMF | 0.62 | 0.034 | 0.042 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.76 | 0.048 | 0.071 | +| | | | Cosine + AS-Norm | 0.70 | 0.041 | 0.061 | +| | | | Cosine + QMF | 0.62 | 0.037 | 0.056 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.48 | 0.077 | +| | | | Cosine + AS-Norm | 0.68 | 0.040 | 0.062| +| | | | Cosine + QMF | 0.62 | 0.036 | 0.063 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.053 | 0.082 | +| | | | Cosine + AS-Norm | 0.70 | 0.043 | 0.076 | +| | | | Cosine + QMF | 0.63 | 0.042 | 0.071 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.051 | 0.095 | +| | | | Cosine + AS-Norm | 0.72 | 0.046 | 0.070 | +| | | | Cosine + QMF | 0.67 | 0.039 | 0.074 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.040 | 0.065 | +| | | | Cosine + AS-Norm | 0.52 | 0.033 | 0.045 | +| | | | Cosine + QMF | 0.45 | 0.027 | 0.043 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.60 | 0.043 | 0.071 | +| | | | Cosine + AS-Norm | 0.53 | 0.034 | 0.063 | +| | | | Cosine + QMF | 0.49 | 0.033 | 0.054 | + ### VoxCeleb 1 Entire-Clean trial list @@ -109,9 +132,27 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.85 | 0.055 | 0.100 | | | | | Cosine + AS-Norm | 0.80 | 0.050 | 0.087 | | | | | Cosine + QMF | 0.76 | 0.047 | 0.083 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.86 | 0.054 | 0.098 | +| | | | Cosine + AS-Norm | 0.81 | 0.049 | 0.087 | +| | | | Cosine + QMF | 0.77 | 0.046 | 0.082 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.89 | 0.058 | 0.098 | +| | | | Cosine + AS-Norm | 0.84 | 0.053 | 0.087| +| | | | Cosine + QMF | 0.80 | 0.050 | 0.081 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.83 | 0.053 | 0.098 | +| | | | Cosine + AS-Norm | 0.78 | 0.047| 0.085 | +| | | | Cosine + QMF | 0.74 | 0.045 | 0.081 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.91 | 0.057 | 0.100 | +| | | | Cosine + AS-Norm | 0.85 | 0.052 | 0.089 | +| | | | Cosine + QMF | 0.81 | 0.049 | 0.085 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.94 | 0.059 | 0.105 | +| | | | Cosine + AS-Norm | 0.88 | 0.053 | 0.093 | +| | | | Cosine + QMF | 0.84 | 0.051 | 0.088 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.71 | 0.044 | 0.076| +| | | | Cosine + AS-Norm | 0.66 | 0.040 | 0.069 | +| | | | Cosine + QMF | 0.63 | 0.037 | 0.067 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.75 | 0.047 | 0.077 | +| | | | Cosine + AS-Norm | 0.70 | 0.042 | 0.072 | +| | | | Cosine + QMF | 0.68 | 0.040 | 0.069 | ### VoxCeleb 1 Hard-Clean trial list @@ -123,9 +164,28 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 1.66 | 0.103 | 0.168 | | | | | Cosine + AS-Norm | 1.53 | 0.091 | 0.151 | | | | | Cosine + QMF | 1.44 | 0.087 | 0.145 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.62 | 0.098 | 0.164 | +| | | | Cosine + AS-Norm | 1.45 | 0.085 | 0.142 | +| | | | Cosine + QMF | 1.36 | 0.082 | 0.137 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.70 | 0.1 | 0.165 | +| | | | Cosine + AS-Norm | 1.50 | 0.086 | 0.138 | +| | | | Cosine + QMF | 1.44 | 0.085 | 0.139 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.59 | 0.096 | 0.165 | +| | | | Cosine + AS-Norm | 1.41 | 0.083 | 0.143 | +| | | | Cosine + QMF | 1.34 | 0.079 | 0.136 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.75 | 0.104 | 0.171 | +| | | | Cosine + AS-Norm | 1.56 | 0.091 | 0.152 | +| | | | Cosine + QMF | 1.50 | 0.087 | 0.145 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.76 | 0.104 | 0.174 | +| | | | Cosine + AS-Norm | 1.58 | 0.092 | 0.152 | +| | | | Cosine + QMF | 1.51 | 0.089 | 0.149 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.30 | 0.076 | 0.125 | +| | | | Cosine + AS-Norm | 1.15 | 0.066 | 0.109 | +| | | | Cosine + QMF | 1.11 | 0.065 | 0.103 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.41 | 0.081 | 0.132 | +| | | | Cosine + AS-Norm | 1.28 | 0.071 | 0.116 | +| | | | Cosine + QMF | 1.21 | 0.069 | 0.113 | + ### VoxSRC2022 dev @@ -137,127 +197,24 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 2.33 | 0.156 | 0.260 | | | | | Cosine + AS-Norm | 2.19 | 0.144 | 0.263 | | | | | Cosine + QMF | 2.06 | 0.137 | 0.251 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | || | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | - -## Results before 2023 - -### VoxCeleb 1 Original-Clean trial list - -| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | -| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.00 | 0.129 | 0.216 | -| | | | Cosine | 2.04 | 0.138 | 0.210 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.35 | 0.091 | 0.159 | -| | | | Cosine | 1.22 | 0.082 | 0.129 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.074 | 0.124 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.090 | 0.160 | -| | | | Cosine | 1.44 | 0.100 | 0.173 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.091 | 0.143 | -| | | | Cosine | 1.17 | 0.081 | 0.110 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 1.37 | 0.104 | 0.179 | -| | | | Cosine | 1.31 | 0.080 | 0.139 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 1.29 | 0.088 | 0.129 | -| | | | Cosine | 1.23 | 0.083 | 0.136 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.095 | 0.156 | -| | | | Cosine | 1.29 | 0.089 | 0.146 | -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.084 | 0.136 | -| | | | Cosine | 1.18 | 0.078 | 0.115 | -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.084 | 0.145 | -| | | | Cosine | 1.12 | 0.073 | 0.131 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 1.53 | 0.104 | 0.189 | -| | | | Cosine | 1.31 | 0.084 | 0.132 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 0.98 | 0.066 | 0.116 | -| | | | Cosine | 1.12 | 0.071 | 0.103 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.05 | 0.077 | 0.123 | -| | | | Cosine | 0.96 | 0.065 | 0.110 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.04 | 0.071 | 0.118 | -| | | | Cosine | 0.93 | 0.067 | 0.108 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 0.90 | 0.067 | 0.118 | -| | | | Cosine | 0.85 | 0.060 | 0.094 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.44 | 0.102 | 0.169 | -| | | | Cosine | 1.29 | 0.084 | 0.140 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.071 | 0.116 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.074 | 0.116 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.09 | 0.081 | 0.150 | - - -### VoxCeleb 1 Entire-Clean trial list - -| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | -| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.86 | 0.124 | 0.210 | -| | | | Cosine | 1.93 | 0.122 | 0.201 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.43 | 0.091 | 0.159 | -| | | | Cosine | 1.24 | 0.080 | 0.136 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.077 | 0.132 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.084 | 0.150 | -| | | | Cosine | 1.30 | 0.082 | 0.150 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.083 | 0.146 | -| | | | Cosine | 1.09 | 0.071 | 0.124 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 1.45 | 0.097 | 0.165 | -| | | | Cosine | 1.15 | 0.076 | 0.132 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 1.47 | 0.094 | 0.165 | -| | | | Cosine | 1.27 | 0.082 | 0.148 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.31 | 0.086 | 0.149 | -| | | | Cosine | 1.22 | 0.079 | 0.134 | -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.082 | 0.145 | -| | | | Cosine | 1.16 | 0.074 | 0.130 | -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.077 | 0.136 | -| | | | Cosine | 1.11 | 0.071 | 0.125 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 1.46 | 0.097 | 0.173 | -| | | | Cosine | 1.24 | 0.080 | 0.140 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.071 | 0.127 | -| | | | Cosine | 1.05 | 0.067 | 0.117 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.078 | 0.134 | -| | | | Cosine | 1.05 | 0.069 | 0.121 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.18 | 0.075 | 0.131 | -| | | | Cosine | 0.98 | 0.063 | 0.110 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | Res2Net50 width=26x8 | + SWA | PLDA | 1.17 | 0.072 | 0.123 | -| | | | Cosine | 0.94 | 0.061 | 0.107 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.56 | 0.095 | 0.166 | -| | | | Cosine | 1.27 | 0.079 | 0.142 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.19 | 0.077 | 0.137 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.073 | 0.129 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | TSE-Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.068 | 0.120 | - - -### VoxCeleb 1 Hard-Clean trial list - -| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | -| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 3.29 | 0.195 | 0.318 | -| | | | Cosine | 3.27 | 0.188 | 0.303 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.66 | 0.160 | 0.258 | -| | | | Cosine | 2.32 | 0.139 | 0.232 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 2.19 | 0.133 | 0.215 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 2.33 | 0.139 | 0.227 | -| | | | Cosine | 2.33 | 0.142 | 0.235 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.46 | 0.142 | 0.237 | -| | | | Cosine | 2.14 | 0.126 | 0.203 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 2.57 | 0.153 | 0.255 | -| | | | Cosine | 2.11 | 0.127 | 0.205 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 2.64 | 0.157 | 0.244 | -| | | | Cosine | 2.33 | 0.141 | 0.232 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 2.42 | 0.144 | 0.245 | -| | | | Cosine | 2.26 | 0.133 | 0.224 -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.39 | 0.141 | 0.235 | -| | | | Cosine | 2.17 | 0.128 | 0.215 -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.28 | 0.131 | 0.225 | -| | | | Cosine | 2.11 | 0.124 | 0.204 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 2.77 | 0.172 | 0.271 | -| | | | Cosine | 2.45 | 0.141 | 0.225 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 2.07 | 0.124 | 0.201 | -| | | | Cosine | 1.95 | 0.113 | 0.181 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 2.34 | 0.136 | 0.230 | -| | | | Cosine | 1.99 | 0.119 | 0.196 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 2.18 | 0.127 | 0.211 | -| | | | Cosine | 1.89 | 0.112 | 0.184 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 2.14 | 0.125 | 0.209 | -| | | | Cosine | 1.84 | 0.110 | 0.186 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 2.78 | 0.156 | 0.252 | -| | | | Cosine | 2.26 | 0.134 | 0.214 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 2.24 | 0.134 | 0.221 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.20 | 0.132 | 0.219 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.02 | 0.123 | 0.203 | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.19 | 0.142 | 0.242 | +| | | | Cosine + AS-Norm | 2.00 | 0.133 | 0.254 | +| | | | Cosine + QMF | 1.86 | 0.126 | 0.229 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.34 | 0.145 | 0.246 | +| | | | Cosine + AS-Norm | 2.10 | 0.135 | 0.248 | +| | | | Cosine + QMF | 2.01 | 0.127 | 0.218 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.239 | +| | | | Cosine + AS-Norm | 1.99 | 0.127 | 0.232 | +| | | | Cosine + QMF | 1.87 | 0.119 | 0.216 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.36 | 0.153 | 0.259 | +| | | | Cosine + AS-Norm | 2.18 | 0.139 | 0.249 | +| | | | Cosine + QMF | 2.08 | 0.128 | 0.222 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.49 | 0.158 | 0.265 | +| | | | Cosine + AS-Norm | 2.29 | 0.145 | 0.251 | +| | | | Cosine + QMF | 2.17 | 0.133 | 0.230 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.92 | 0.124 | 0.208 | +| | | | Cosine + AS-Norm | 1.71 | 0.109 | 0.212 | +| | | | Cosine + QMF | 1.62 | 0.103 | 0.192 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.96 | 0.124 | 0.211 | +| | | | Cosine + AS-Norm | 1.79 | 0.118 | 0.239 | +| | | | Cosine + QMF | 1.68 | 0.114 | 0.216 | diff --git a/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..f4306e2e --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: cfwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 32 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + 
epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..b5458f9d --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: seresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 32 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..01b2cc50 --- /dev/null +++ 
b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 4 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..74553395 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + 
min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseidrndresnet100 + in_channels: 1 + in_feats: 80 + conv_channels: 128 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.05 + se_r: 4 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..11d33ae2 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..6659b2f6 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + 
data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: resnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..58d22733 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: tseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + 
dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 256 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh new file mode 100644 index 00000000..56d18bd0 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# Channel-freq-wise-SE-ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_cfwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh new file mode 100644 index 
00000000..68849f78 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh @@ -0,0 +1,45 @@ +# Channel-wise ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_cwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_cwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + + +nnet_s2_base_cfg=conf/train_cwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh new file mode 100644 index 00000000..f962c2b3 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# Freq-wise-SE ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_fwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_fwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_fwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=false #true +do_qmf=false #true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh new file mode 100644 index 00000000..6ea334b4 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh @@ -0,0 +1,44 @@ +# IdRnd ResNet100 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_idrnd_resnet100.v3.0 + +nnet_s1_base_cfg=conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0029.pth + +nnet_s2_base_cfg=conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + 
plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh new file mode 100644 index 00000000..bb5d990c --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh @@ -0,0 +1,44 @@ +# ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_resnet34.v3.0 + +nnet_s1_base_cfg=conf/train_resnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name.kk2 +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh new file mode 100644 index 00000000..2528d13f --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# TSE-ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_tseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_tseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + +nnet_s2_base_cfg=conf/train_tseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=false #true +do_qmf=false #true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/cmd.sh b/egs/voxceleb/v2.1/cmd.sh new file mode 100755 index 00000000..040f458b --- /dev/null +++ b/egs/voxceleb/v2.1/cmd.sh @@ -0,0 +1,28 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/voxceleb/v2.1/conf/clsp.conf b/egs/voxceleb/v2.1/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf b/egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_long.conf b/egs/voxceleb/v2.1/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf b/egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff 
--git a/egs/voxceleb/v2.1/conf/coe_gpu_short.conf b/egs/voxceleb/v2.1/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_v100.conf b/egs/voxceleb/v2.1/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml b/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..ad991124 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + 
log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml new file mode 100644 index 00000000..0b1d0454 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..254ff796 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..52be6db5 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + 
sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..bd3e7f86 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..69a8322b --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + 
lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..3443591a --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..abe5da6e --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmlarge12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git 
a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..7287188c --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..3443591a --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml new file mode 100644 index 00000000..69a8322b --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + 
class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml new file mode 100644 index 00000000..5e1260ad --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..2addaa1e --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - 
class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmlarge_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..69a8322b --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..5e1260ad --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + 
margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/vad_16k.yaml b/egs/voxceleb/v2.1/conf/vad_16k.yaml new file mode 100644 index 00000000..5fb0111c --- /dev/null +++ b/egs/voxceleb/v2.1/conf/vad_16k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 16000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..c3466259 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml new file mode 100644 index 00000000..d9c9b782 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..dc3737e3 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 
4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..d7e3388f --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 9 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..b2430d97 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..5025f047 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + 
intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..0a6303f5 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/datapath.sh b/egs/voxceleb/v2.1/datapath.sh new file mode 100644 index 00000000..a7eb575c --- /dev/null +++ b/egs/voxceleb/v2.1/datapath.sh @@ -0,0 +1,23 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + # voxceleb1_root=/export/corpora5/VoxCeleb1_v1 #voxceleb1 v1 + voxceleb1_root=/export/corpora5/VoxCeleb1_v2 #voxceleb1 v2 + voxceleb2_root=/export/corpora5/VoxCeleb2 + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 + voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 + voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + voxsrc22_root=/exp/jvillalba/corpora/voxsrc22 + musan_root=/expscratch/dgromero/corpora-open/musan +else + echo "Put your database paths here" + exit 1 +fi + + diff --git a/egs/voxceleb/v2.1/default_config.sh b/egs/voxceleb/v2.1/default_config.sh new file mode 120000 index 00000000..f2d8812d --- /dev/null +++ b/egs/voxceleb/v2.1/default_config.sh @@ -0,0 +1 @@ +global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh \ No newline at end of file diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..67a4665e --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params layers 2-12 + +# hugging face model +hf_model_name=wav2vec2xlsr300m12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + 
+nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh new file mode 100644 index 00000000..b4130fad --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn1024x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..80ee785b --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + 
plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..c2b30f68 --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus9l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..373535c2 --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..530096cc 
--- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..1b276bcd --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/hyp_utils b/egs/voxceleb/v2.1/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/voxceleb/v2.1/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/voxceleb/v2.1/path.sh b/egs/voxceleb/v2.1/path.sh new file mode 100755 index 00000000..6994fdab --- /dev/null +++ b/egs/voxceleb/v2.1/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. 
$TOOLS_ROOT/path.sh diff --git a/egs/voxceleb/v2.1/run_001_prepare_data.sh b/egs/voxceleb/v2.1/run_001_prepare_data.sh new file mode 100755 index 00000000..563d3c2d --- /dev/null +++ b/egs/voxceleb/v2.1/run_001_prepare_data.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. datapath.sh +. $config_file + +if [ $stage -le 1 ];then + # Prepare the VoxCeleb2 dataset for training. + hyperion-prepare-data voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ + --cat-videos --use-kaldi-ids \ + --output-dir data/voxceleb2cat_train +fi + +if [ $stage -le 2 ];then + # prepare voxceleb1 for test + hyperion-prepare-data voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test +fi + +if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then + hyperion-prepare-data voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_dev +fi + +# if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + # hyperion-prepare-data voxsrc22 --subset test --corpus-dir $voxsrc22_root \ + # --vox1-corpus-dir $voxceleb1_root \ + # --output-dir data/voxsrc22_test +# fi + +if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then + # split vox2 into 2 parts, for cohort and qmf training + hyperion-split-dataset-into-trials-and-cohort --data-dir data/voxceleb2cat_train +fi diff --git a/egs/voxceleb/v2.1/run_002_compute_evad.sh b/egs/voxceleb/v2.1/run_002_compute_evad.sh new file mode 100755 index 00000000..acccace3 --- /dev/null +++ b/egs/voxceleb/v2.1/run_002_compute_evad.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=fs01 +vad_dir=`pwd`/exp/vad_e +vad_config=conf/vad_16k.yaml +nj=40 + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ -z "$vad_config" ];then + echo "We are not using VAD in this configuration" + exit 0 +fi + +if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" +fi + + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + hyp_utils/create_data_split_dirs.sh \ + $vad_dir/$name \ + $USER/hyp-data/voxceleb/v1.2/vad $nodes + done +fi + +#Train datasets +if [ $stage -le 2 ];then + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_data_split_links.sh $vad_dir/$name/vad.JOB.ark $nj + echo "compute vad for $name" + $train_cmd JOB=1:$nj $vad_dir/$name/log/vad.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-compute-energy-vad --cfg $vad_config \ + --recordings-file data/$name/recordings.csv \ + --output-spec ark,csv:$vad_dir/$name/vad.JOB.ark,$vad_dir/$name/vad.JOB.csv \ + --part-idx JOB --num-parts $nj || exit 1 + + hyperion-tables cat \ + --table-type features \ + --output-file $vad_dir/$name/vad.csv --num-tables $nj + hyperion-dataset add_features \ + --dataset data/$name \ + --features-name vad \ + --features-file $vad_dir/$name/vad.csv + done +fi + + diff --git a/egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..73c7ed82 --- /dev/null +++ b/egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nj=10 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +# We prepare the noise files and RIR for online speech augmentation +if [ $stage -le 1 ]; then + for name in noise music speech + do + hyperion-prepare-data musan \ + --corpus-dir $musan_root \ + --subset $name \ + --output-dir data/musan_$name + done +fi + +if [ $stage -le 2 ]; then + # # Prepare to distribute data over multiple machines + # # This only does something at CLSP grid + # hyp_utils/create_data_split_dirs.sh $vad_dir $USER/hyp-data/voxceleb/v1.2/vad $nodes + + for name in musan_noise musan_music + do + input_data_dir=data/$name + output_data_dir=data/${name}_proc_audio + output_dir=exp/proc_audio/$name + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-preprocess-audio-files \ + --audio-format flac \ + --part-idx JOB --num-parts $nj \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + hyperion-dataset set_recordings \ + --dataset $input_data_dir \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset $output_data_dir + + + done +fi + +if [ $stage -le 3 ]; then + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + input_data_dir=data/$name + output_data_dir=data/${name}_babble + output_dir=exp/proc_audio/${name}_babble + $train_cmd $output_dir/log/make_babble_noise_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-make-babble-noise-audio-files \ + --audio-format flac \ + --min-spks 3 --max-spks 10 --num-reuses 5 \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_data_dir/recordings.csv + hyperion-dataset make_from_recordings \ + --dataset $output_data_dir \ + --recordings-file $output_data_dir/recordings.csv + done +fi + +if [ $stage -le 4 ]; then + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + output_dir=exp/rirs/$rirs + data_dir=data/$rirs + $train_cmd $output_dir/log/pack_rirs_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-pack-wav-rirs ${args} --input $data_dir/recordings.csv \ + --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1; + hyperion-dataset add_features --dataset $data_dir \ + --features-name rirs --features-file $output_dir/rirs.csv + + done +fi + diff --git a/egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh b/egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh new file mode 100755 index 00000000..4e0c5b19 --- /dev/null +++ b/egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +nodes=b1 +nj=40 +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + hyp_utils/create_data_split_dirs.sh \ + exp/xvector_audios/$nnet_data \ + $USER/hyp-data/voxceleb/v1.2/xvector_audios/$nnet_data $nodes +fi + +if [ $stage -le 2 ];then + output_dir=exp/proc_audio/$nnet_data + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_audios_split_links.sh $output_dir data/$nnet_data/recordings.csv flac + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$nnet_data/vad.csv" + update_durs="--update-seg-durs" + fi + + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${nnet_data}.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-preprocess-audio-files \ + --audio-format flac --remove-dc-offset $vad_args \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$nnet_data/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + + hyperion-dataset set_recordings $update_durs \ + --dataset data/$nnet_data \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset data/${nnet_data}_proc_audio \ + --remove-features vad +fi + +if [ $stage -le 3 ];then + hyperion-dataset remove_short_segments \ + --dataset data/${nnet_data}_proc_audio \ + --output-dataset data/${nnet_data}_filtered \ + --length-name duration --min-length 2.0 + + hyperion-dataset remove_classes_few_segments \ + --dataset data/${nnet_data}_filtered \ + --class-name speaker --min-segs 4 +fi + +if [ $stage -le 4 ];then + hyperion-dataset split_train_val \ + --dataset data/${nnet_data}_filtered \ + --val-prob 0.03 \ + --joint-classes speaker --min-train-samples 1 \ + --seed 1123581321 \ + --train-dataset data/${nnet_data}_xvector_train \ + --val-dataset data/${nnet_data}_xvector_val +fi + diff --git a/egs/voxceleb/v2.1/run_005_train_xvector.sh b/egs/voxceleb/v2.1/run_005_train_xvector.sh new file mode 100755 index 00000000..2479d565 --- /dev/null +++ b/egs/voxceleb/v2.1/run_005_train_xvector.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_data_dir=data/${nnet_data}_xvector_train +val_data_dir=data/${nnet_data}_xvector_val + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-train-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu \ + +fi + + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-finetune-wav2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir \ + --num-gpus $ngpu \ + +fi diff --git a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh new file mode 100755 index 00000000..0dc58048 --- /dev/null +++ b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh +use_gpu=false +xvec_chunk_length=120.0 +. parse_options.sh || exit 1; +. 
$config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" + num_gpus=1 +else + xvec_cmd="$train_cmd --mem 12G" + num_gpus=0 +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +xvector_dir=exp/xvectors/$nnet_name + +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then + # Extract xvectors for training LDA/PLDA + nj=100 + for name in voxceleb2cat_train + do + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --random-utt-length --min-utt-length 2 --max-utt-length 30 \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + +if [ $stage -le 2 ]; then + # Extracts x-vectors for evaluation + nj=100 + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data + do + num_segs=$(wc -l data/$name/segments.csv | awk '{ print $1-1}') + nj=$(($num_segs < 100 ? $num_segs:100)) + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + + diff --git a/egs/voxceleb/v2.1/run_007_eval_be.sh b/egs/voxceleb/v2.1/run_007_eval_be.sh new file mode 100755 index 00000000..53621488 --- /dev/null +++ b/egs/voxceleb/v2.1/run_007_eval_be.sh @@ -0,0 +1,321 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +plda_label=${plda_type}y${plda_y_dim}_v1 +be_name=lda${lda_dim}_${plda_label}_${plda_data} + +xvector_dir=exp/xvectors/$nnet_name +be_dir=exp/be/$nnet_name/$be_name +score_dir=exp/scores/$nnet_name +score_plda_dir=$score_dir/${be_name}/plda +score_cosine_dir=$score_dir/cosine +score_cosine_snorm_dir=$score_dir/cosine_snorm +score_cosine_qmf_dir=$score_dir/cosine_qmf + +if [ $stage -le 3 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_dir/voxceleb1_scores.csv \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + done + wait + hyperion-merge-scores --output-file $score_cosine_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_dir/voxceleb1_results.csv + + cat $score_cosine_dir/voxceleb1_results.csv +fi + +if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring" + $train_cmd $score_cosine_dir/log/voxsrc22_dev.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_dir/voxsrc22_dev_scores.csv + + # $train_cmd $score_cosine_dir/log/voxsrc22_eval.log \ + # hyp_utils/conda_env.sh \ + # hyperion-eval-cosine-scoring-backend \ + # --feats-file csv:$xvector_dir/voxsrc22_eval/xvector.csv \ + # --ndx-file data/voxsrc22_eval/trials.csv \ + # --enroll-map-file data/voxsrc22_eval/enrollment.csv \ + # --score-file $score_cosine_dir/voxsrc22_eval_scores.csv + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxsrc22_dev.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_dir/voxsrc22_dev_results.csv + + cat $score_cosine_dir/voxsrc22_dev_results.csv + +fi + +if [ "$do_snorm" == "true" ];then + if [ $stage -le 5 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd --mem 22G $score_cosine_snorm_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + 
hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_snorm_dir/voxceleb1_results.csv + + cat $score_cosine_snorm_dir/voxceleb1_results.csv + fi + + if [ $stage -le 6 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring + AS-Norm" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_snorm_dir/log/voxsrc22_dev_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + sleep 5s + done + sleep 10s + done + wait + hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_snorm_dir/voxsrc22_dev_results.csv + + cat $score_cosine_snorm_dir/voxsrc22_dev_results.csv + + fi + +fi + +if [ "$do_qmf" == "true" ];then + if [ $stage -le 7 ];then + echo "Train QMF in Vox2" + echo "...Calculating quality measures for Vox2" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_qmf_dir/log/voxceleb2_trials_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend-with-qmf \ + --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --ndx-file data/voxceleb2cat_train_trials/trials.csv \ + --enroll-map-file data/voxceleb2cat_train_trials/enrollments.csv \ + --score-file $score_cosine_qmf_dir/voxceleb2_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j 
--num-test-parts $num_parts & + done + sleep 5s + done + wait + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + hyperion-train-qmf --score-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --key-file data/voxceleb2cat_train_trials/trials.csv \ + --model-file $score_cosine_qmf_dir/qmf.h5 + + fi + + if [ $stage -le 8 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm + QMF" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd --mem 22G $score_cosine_qmf_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend-with-qmf \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_qmf_dir/voxceleb1_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --qmf-file $score_cosine_qmf_dir/qmf.h5 \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + for suffix in "" .snorm .snorm.qmf + do + ( + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1$suffix.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_qmf_dir/voxceleb1_results$suffix.csv + + echo "$score_cosine_qmf_dir/voxceleb1_results$suffix.csv:" + cat $score_cosine_qmf_dir/voxceleb1_results$suffix.csv + ) & + done + wait + fi + + if [ $stage -le 9 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring + QMF" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_qmf_dir/log/voxsrc22_dev_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend-with-qmf \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_qmf_dir/voxsrc22_dev_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --qmf-file $score_cosine_qmf_dir/qmf.h5 \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + sleep 5s + done + sleep 10s + done + wait + for suffix in "" .snorm .snorm.qmf + do + ( + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxsrc22_dev$suffix.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file 
$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv + + echo "$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv:" + cat $score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv + ) & + done + wait + fi + +fi + diff --git a/egs/voxceleb/v2/README.md b/egs/voxceleb/v2/README.md index a005b6e8..0bafe85e 100644 --- a/egs/voxceleb/v2/README.md +++ b/egs/voxceleb/v2/README.md @@ -26,12 +26,12 @@ Recipe for the VoxCeleb Speaker Verification Task using Wav2Vec2, WavLM or Huber ## Usage - Run the run_0*.sh scripts in sequence - - By default it will use + - By default it will use config global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh - For better performance use ```bash -run_011_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh -run_030_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true -run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +run_011_train_xvector.sh --config-file global_conf/other_config.sh +run_030_extract_xvectors.sh --config-file global_conf/other_config.sh --use-gpu true +run_040_eval_be.sh --config-file global_conf/other_config.sh ``` @@ -155,7 +155,7 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | | | | Cosine + QMF | 2.38 | 0.159 | 0.266 | | config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.62 | 0.153 | 0.251 | | | | | Cosine + AS-Norm | 2.53 | 0.149 | 0.247 | -| | | | Cosine + QMF | 0.242 | 0.144 | 0.231 | +| | | | Cosine + QMF | 2.42 | 0.144 | 0.231 | | config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.225 | | | | | Cosine + AS-Norm | 2.01 | 0.125 | 0.209 | | | | | Cosine + QMF | 1.92 | 0.117 | 0.200 | diff --git a/egs/voxceleb/v2/default_config.sh b/egs/voxceleb/v2/default_config.sh index abcc2a2e..f2d8812d 120000 --- a/egs/voxceleb/v2/default_config.sh +++ b/egs/voxceleb/v2/default_config.sh @@ -1 +1 @@ -global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.0.sh \ No newline at end of file +global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh \ No newline at end of file From ed35173f534f98cb85b609642226b99d17163ddb Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 11 Sep 2023 12:12:49 -0400 Subject: [PATCH 79/89] vox/v2.1 recipe done, not tested --- egs/voxceleb/v2.1/run_005_train_xvector.sh | 27 ++++++++++++++++--- egs/voxceleb/v2.1/run_006_extract_xvectors.sh | 9 ++++--- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/egs/voxceleb/v2.1/run_005_train_xvector.sh b/egs/voxceleb/v2.1/run_005_train_xvector.sh index 2479d565..eb1c591e 100755 --- a/egs/voxceleb/v2.1/run_005_train_xvector.sh +++ b/egs/voxceleb/v2.1/run_005_train_xvector.sh @@ -44,7 +44,7 @@ if [ $stage -le 1 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - hyperion-train-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + hyperion-train-wav2vec2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ --data.train.dataset.segments-file $train_data_dir/segments.csv \ --data.train.dataset.class-files $train_data_dir/speaker.csv \ @@ -56,7 +56,7 @@ if [ $stage -le 1 ]; then fi -# Large Margin Fine-tuning +# Finetune full model if [ 
$stage -le 2 ]; then if [ "$use_wandb" == "true" ];then extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" @@ -65,7 +65,7 @@ if [ $stage -le 2 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - hyperion-finetune-wav2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ --data.train.dataset.segments-file $train_data_dir/segments.csv \ --data.train.dataset.class-files $train_data_dir/speaker.csv \ @@ -76,3 +76,24 @@ if [ $stage -le 2 ]; then --num-gpus $ngpu \ fi + +# Finetune full model +if [ $stage -le 3 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --in-model-file $nnet_s2 \ + --trainer.exp-path $nnet_s3_dir \ + --num-gpus $ngpu \ + +fi diff --git a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh index 0dc58048..2cfe27fe 100755 --- a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh +++ b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh @@ -8,15 +8,16 @@ set -e stage=1 -nnet_stage=2 +nnet_stage=3 config_file=default_config.sh use_gpu=false +hf_chunk_length=120.0 #seconds xvec_chunk_length=120.0 . parse_options.sh || exit 1; . 
$config_file if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu --chunk-length $xvec_chunk_length" + xvec_args="--use-gpu true --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" num_gpus=1 else @@ -58,7 +59,7 @@ if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qm echo "Extracting x-vectors for $name" $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + hyperion-extract-wav2vec2xvectors ${xvec_args} ${vad_args} \ --part-idx JOB --num-parts $nj \ --recordings-file data/$name/recordings.csv \ --random-utt-length --min-utt-length 2 --max-utt-length 30 \ @@ -88,7 +89,7 @@ if [ $stage -le 2 ]; then echo "Extracting x-vectors for $name" $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + hyperion-extract-wav2vec2xvectors ${xvec_args} ${vad_args} \ --part-idx JOB --num-parts $nj \ --recordings-file data/$name/recordings.csv \ --model-path $nnet \ From 8760d055520609a57bc69ac9fc05ef159e9f336a Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 12 Sep 2023 14:06:02 -0400 Subject: [PATCH 80/89] implemented lora in w2v2, not tested --- hyperion/io/bin_vad_reader.py | 4 +- hyperion/np/augment/noise_augment.py | 2 +- hyperion/torch/layers/__init__.py | 13 +- hyperion/torch/layers/lora.py | 80 +++++ .../models/wav2xvectors/hf_wav2xvector.py | 26 +- hyperion/torch/tpm/hf/hf_wav2vec2.py | 18 +- hyperion/torch/tpm/hf/hf_wav2vec_base.py | 320 ++++++++++++++---- hyperion/utils/dataset.py | 68 +++- requirements.txt | 4 +- 9 files changed, 425 insertions(+), 110 deletions(-) create mode 100644 hyperion/torch/layers/lora.py diff --git a/hyperion/io/bin_vad_reader.py b/hyperion/io/bin_vad_reader.py index 82e2a0c5..8ce91d15 100644 --- a/hyperion/io/bin_vad_reader.py +++ b/hyperion/io/bin_vad_reader.py @@ -59,7 +59,7 @@ def read( vad = self.r.read(keys) output_vad = [] for i in range(len(keys)): - vad_i = vad[i].astype(np.bool, copy=False) + vad_i = vad[i].astype(bool, copy=False) offset_i = offset[i] if offset_is_list else offset num_frames_i = num_frames[i] if num_frames_is_list else num_frames vad_i = self._get_bin_vad_slice(vad_i, offset_i, num_frames_i) @@ -77,7 +77,7 @@ def read_timestamps(self, keys, merge_tol=0.001): vad = self.r.read(keys) ts = [] for i in range(len(keys)): - vad_i = vad[i].astype(np.bool, copy=False) + vad_i = vad[i].astype(bool, copy=False) ts_i = bin_vad_to_timestamps( vad_i, self.frame_length / 1000, diff --git a/hyperion/np/augment/noise_augment.py b/hyperion/np/augment/noise_augment.py index 1cc1a0be..92bd57dd 100644 --- a/hyperion/np/augment/noise_augment.py +++ b/hyperion/np/augment/noise_augment.py @@ -55,7 +55,7 @@ def __init__( @staticmethod def _power(x): """Computes power of x in dB.""" - return 10 * np.log10((x ** 2).sum()) + return 10 * np.log10((x**2).sum() + 1e-10) @staticmethod def snr(x, n): diff --git a/hyperion/torch/layers/__init__.py b/hyperion/torch/layers/__init__.py index 6b508b0e..bea52c95 100644 --- a/hyperion/torch/layers/__init__.py +++ b/hyperion/torch/layers/__init__.py @@ -4,20 +4,23 @@ """ from .activation_factory import ActivationFactory -from .attention import (LocalScaledDotProdAttRelPosEncV1, - LocalScaledDotProdAttV1, ScaledDotProdAttRelPosEncV1, - ScaledDotProdAttV1) +from .attention import ( + 
LocalScaledDotProdAttRelPosEncV1, + LocalScaledDotProdAttV1, + ScaledDotProdAttRelPosEncV1, + ScaledDotProdAttV1, +) from .audio_feats import * from .audio_feats_factory import AudioFeatsFactory from .calibrators import LinBinCalibrator from .dropout import DropConnect1d, DropConnect2d, Dropout1d from .global_pool import * from .interpolate import Interpolate +from .lora import LoRAFactory from .margin_losses import ArcLossOutput, CosLossOutput, SubCenterArcLossOutput from .mvn import MeanVarianceNorm from .norm_layer_factory import NormLayer1dFactory, NormLayer2dFactory from .pool_factory import GlobalPool1dFactory -from .pos_encoder import (ConvPosEncoder, NoPosEncoder, PosEncoder, - RelPosEncoder) +from .pos_encoder import ConvPosEncoder, NoPosEncoder, PosEncoder, RelPosEncoder from .spec_augment import AxisMasker, SpecAugment, SpecWarper from .subpixel_convs import ICNR1d, ICNR2d, SubPixelConv1d, SubPixelConv2d diff --git a/hyperion/torch/layers/lora.py b/hyperion/torch/layers/lora.py new file mode 100644 index 00000000..1436caf5 --- /dev/null +++ b/hyperion/torch/layers/lora.py @@ -0,0 +1,80 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from typing import Union + +import loralib as lora +import torch.nn as nn +from loralib import * + + +class LoRAFactory: + def create_from_pretrained( + layer: Union[nn.Embedding, nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d], + r: int = 8, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + merge_weights: bool = True, + ): + if isinstance(layer, nn.Embedding): + lora_layer = lora.Embedding( + layer.num_embeddings, + layer.embedding_dim, + padding_idx=layer.padding_idx, + max_norm=layer.max_norm, + norm_type=layer.norm_type, + scale_grad_by_freq=layer.scale_grad_by_freq, + sparse=layer.sparse, + r=r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=merge_weights, + ) + lora_layer.weight.data = layer.weight.data + + elif isinstance(layer, nn.Linear): + bias = layer.bias is not None + lora_layer = lora.Linear( + layer.in_features, + layer.out_features, + bias=bias, + r=r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=merge_weights, + ) + lora_layer.weight.data = layer.weight.data + if bias: + lora_layer.bias.data = layer.bias.data + + elif isinstance(layer, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): + if isinstance(layer, nn.Conv1d): + lora_class = lora.Conv1d + elif isinstance(layer, nn.Conv2d): + lora_class = lora.Conv2d + elif isinstance(layer, nn.Conv3d): + lora_class = lora.Conv3d + + bias = layer.bias is not None + lora_layer = lora_class( + layer.in_channels, + layer.out_channels, + layer.kernel_size, + stride=layer.stride, + padding=layer.padding, + dilation=layer.dilation, + groups=layer.groups, + bias=bias, + padding_mode=layer.padding_mode, + r=r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=merge_weights, + ) + lora_layer.weight.data = layer.weight.data + if bias: + lora_layer.bias.data = layer.bias.data + + return lora_layer diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index 24ab5bbb..925f1172 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -5,10 +5,9 @@ import contextlib import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser 
from ...torch_model import TorchModel from ...utils import remove_silence @@ -29,7 +28,6 @@ class HFWav2XVector(TorchModel): def __init__( self, hf_feats, xvector, feat_fusion_start=0, feat_fusion_method="weighted-avg" ): - super().__init__() self.hf_feats = hf_feats self.xvector = xvector @@ -222,7 +220,6 @@ def extract_embed( embed_layer=None, detach_chunks=False, ): - if vad_samples is not None: x, x_lengths = remove_silence(x, vad_samples, x_lengths) @@ -256,6 +253,9 @@ def freeze_hf_feats(self): def freeze_hf_feature_encoder(self): self.hf_feats.freeze_feature_encoder() + def freeze_hf_except_lora(self, bias=None): + self.hf_feats.freeze_except_lora(bias) + def has_param_groups(self): return self.hf_feats.has_param_groups() @@ -296,6 +296,15 @@ def set_train_mode(self, mode): elif mode == "hf-feat-extractor-frozen": self.unfreeze() self.freeze_hf_feature_encoder() + elif mode == "hf-lora": + self.unfreeze() + self.freeze_hf_except_lora() + elif mode == "hf-all-bias-lora": + self.unfreeze() + self.freeze_hf_except_lora(bias="all") + elif mode == "hf-lora-with-bias": + self.unfreeze() + self.freeze_hf_except_lora(bias="lora_only") else: raise ValueError(f"invalid train_mode={mode}") @@ -310,7 +319,6 @@ def set_train_mode(self, mode): self._train_mode = mode def _train(self, train_mode: str): - if train_mode in ["full", "frozen"]: super()._train(train_mode) elif train_mode == "ft-embed-affine": @@ -322,6 +330,9 @@ def _train(self, train_mode: str): "ft-xvector-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", + "hf-lora", + "hf-all-bias-lora", + "hf-lora-with-bias", ]: self.hf_feats.train() self.xvector._train("full") @@ -339,6 +350,9 @@ def valid_train_modes(): "ft-xvector-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", + "hf-lora", + "hf-all-bias-lora", + "hf-lora-with-bias", ] @staticmethod @@ -353,7 +367,6 @@ def filter_args(**kwargs): return args def get_config(self): - hf_cfg = self.hf_feats.get_config() xvec_cfg = self.xvector.get_config() del hf_cfg["class_name"] @@ -375,7 +388,6 @@ def change_config(self, hf_feats, xvector): @staticmethod def add_class_args(parser, prefix=None, skip=set()): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index 26da7beb..dd5de2fe 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -6,11 +6,10 @@ import os from typing import Callable, List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import Wav2Vec2Config, Wav2Vec2Model - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import Wav2Vec2Config, Wav2Vec2Model from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase @@ -204,8 +203,13 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, ): - super().__init__( pretrained_model_path=pretrained_model_path, normalize_input=normalize_input, @@ -223,6 +227,12 @@ def __init__( sample_frequency=sample_frequency, feat_extract_lr=feat_extract_lr, encoder_lr=encoder_lr, + use_lora=use_lora, + lora_components=lora_components, + 
lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, ) if pretrained_model_path is not None and not ignore_pretrained: diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index a9c4ddef..2c8d239f 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -8,12 +8,13 @@ from turtle import right from typing import List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor +from ....utils.misc import filter_func_args +from ...layers import LoRAFactory from ...torch_model import TorchModel from ...utils import scale_seq_lengths, seq_lengths_to_mask from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs @@ -55,6 +56,12 @@ class HFWav2VecBase(TorchModel): sample_frequency: (`int`) waveform sample frequency used to train the model. feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. """ def __init__( @@ -75,6 +82,12 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, ): super().__init__() self.pretrained_model_path = pretrained_model_path @@ -90,6 +103,12 @@ def __init__( self.left_encoder_context = left_encoder_context self.feat_extract_lr = feat_extract_lr self.encoder_lr = encoder_lr + self.use_lora = use_lora + self.lora_components = lora_components + self.lora_rank = lora_rank + self.lora_alpha = lora_alpha + self.lora_dropout = lora_dropout + self.lora_merge_weights = lora_merge_weights if pretrained_model_path is not None and not ignore_pretrained: rank = ddp_get_rank() @@ -153,6 +172,16 @@ def __init__( self._feature_encoder_context = None self._frame_shift = None + self.hf_model = None + + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) def __deepcopy__(self, memo): """Reimplementation of deepcopy for Hugging Face models. 
@@ -225,18 +254,36 @@ def change_config( self, override_dropouts: bool, override_spec_augment: bool, + override_lora: bool, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, **kwargs, ): if override_spec_augment: - logging.info("overriding speech augment") + logging.info(f"overriding speech augment with args={kwargs}") self.change_spec_augment(**kwargs) if override_dropouts: - logging.info("overriding hf model dropouts") + logging.info(f"overriding hf model dropouts with args={kwargs}") self.change_dropouts(**kwargs) + if override_lora: + logging.info("overriding LoRA config") + self.change_lora( + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, + ) + self.feat_extract_lr = feat_extract_lr self.encoder_lr = encoder_lr @@ -259,12 +306,109 @@ def change_spec_augment( self.hf_model.config.mask_feature_length = mask_feature_length self.hf_model.config.mask_feature_min_masks = mask_feature_min_masks + def change_lora( + self, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, + ): + if not self.use_lora: + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + pass + else: + # TODO + pass + else: + if use_lora: + # TODO + pass + else: + # TODO + pass + + self.use_lora = use_lora + self.lora_components = lora_components + self.lora_rank = lora_rank + self.lora_alpha = lora_alpha + self.lora_dropout = lora_dropout + self.lora_merge_weights = lora_merge_weights + + def _make_lora_layers( + self, + lora_components: List[str], + lora_rank: int, + lora_alpha: int, + lora_dropout: float, + lora_merge_weights: bool, + ): + counts = {k: 0 for k in lora_components} + self._recursive_replace_layer_by_lora( + self.hf_model, + counts, + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + for k, v in counts.items(): + logging.info("count of LoRA layers for %s = %d", k, v) + assert v > 0, f"did not make any {k} LoRA" + + @staticmethod + def _recursive_replace_layer_by_lora( + model: nn.Module, + counts: dict, + lora_components: List[str], + lora_rank: int, + lora_alpha: int, + lora_dropout: float, + lora_merge_weights: bool, + ): + for name, module in model.named_children(): + if len(list(module.children())) > 0: + HFWav2VecBase._recursive_replace_layer_by_lora( + module, + counts, + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + if isinstance(module, nn.Linear) and name in lora_components: + lora_layer = LoRAFactory.create_from_pretrained( + module, + r=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=lora_merge_weights, + ) + setattr(model, name, lora_layer) + counts[name] += 1 + def change_dropouts(self, **kwargs): pass # needs to be overloaded def freeze_feature_encoder(self): self.hf_model.freeze_feature_encoder() + def freeze_except_lora(self, bias=None): + bias = "none" if bias is None else bias + from ...layers.lora import mark_only_lora_as_trainable + + mark_only_lora_as_trainable(self.hf_model, bias=bias) + def has_param_groups(self): return 
self.feat_extract_lr is not None or self.encoder_lr is not None @@ -302,14 +446,14 @@ def _normalize(self, x, x_mask=None): """Normalizes the audio to have zero mean and unit variance.""" if x_mask is None: x = x - x.mean(dim=1, keepdim=True) - std = torch.sqrt((x ** 2).mean(dim=1, keepdim=True) + 1e-7) + std = torch.sqrt((x**2).mean(dim=1, keepdim=True) + 1e-7) x = x / std else: x_mask = x_mask.to(dtype=x.dtype) x_samples = torch.mean(x_mask, dim=1, keepdim=True) x_mean = torch.mean(x * x_mask, dim=1, keepdim=True) / x_samples - x2_mean = torch.mean(x ** 2 * x_mask, dim=1, keepdim=True) / x_samples - std = torch.sqrt(x2_mean - x_mean ** 2 + 1e-7) + x2_mean = torch.mean(x**2 * x_mask, dim=1, keepdim=True) / x_samples + std = torch.sqrt(x2_mean - x_mean**2 + 1e-7) x = (x - x_mean) / std return x @@ -544,14 +688,6 @@ def forward_long_impl( else scale_seq_lengths(x_lengths, max_out_length, max_in_length) ) output["hidden_states_lengths"] = feat_lengths - # print( - # "lens", - # mol0, - # max_out_length, - # output.last_hidden_state.size(1), - # output.hidden_states[0].size(1), - # flush=True, - # ) return output def get_config(self): @@ -572,6 +708,14 @@ def get_config(self): "left_encoder_context": self.left_encoder_context, "right_encoder_context": self.right_encoder_context, "sample_frequency": self.sample_frequency, + "feat_extract_lr": self.feat_extract_lr, + "encoder_lr": self.encoder_lr, + "use_lora": self.use_lora, + "lora_components": self.lora_components, + "lora_rank": self.lora_rank, + "lora_alpha": self.lora_alpha, + "lora_dropout": self.lora_dropout, + "lora_merge_weights": self.lora_merge_weights, } base_config = super().get_config() @@ -584,24 +728,78 @@ def save(self, file_path: str): @staticmethod def filter_args(**kwargs): - valid_args = ( - "pretrained_model_path", - "normalize_input", - "use_input_attention_mask", - "cache_dir", - "force_download", - "resume_download", - "revision", - "drop_layers_gt", - "ignore_pretrained", - "override_dropouts", - "override_spec_augment", - "left_encoder_context", - "right_encoder_context", - "sample_frequency", + return filter_func_args(HFWav2VecBase.__init__, **kwargs) + # valid_args = ( + # "pretrained_model_path", + # "normalize_input", + # "use_input_attention_mask", + # "cache_dir", + # "force_download", + # "resume_download", + # "revision", + # "drop_layers_gt", + # "ignore_pretrained", + # "override_dropouts", + # "override_spec_augment", + # "left_encoder_context", + # "right_encoder_context", + # "sample_frequency", + # ) + # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + # return args + + @staticmethod + def _add_lr_args(parser): + parser.add_argument( + "--feat-extractor-lr", + default=None, + type=float, + help=( + "lr for conv feature extractor, it serves to set a lr " + "different than the global one." + ), + ) + parser.add_argument( + "--encoder-lr", + default=None, + type=float, + help=( + "lr for transformer encoder, it serves to set a lr " + "different than the global one." 
+ ), + ) + + @staticmethod + def _add_lora_args(parser): + parser.add_argument( + "--use-lora", + default=False, + action=ActionYesNo, + help="use low-rank adapters", + ) + parser.add_argument( + "--lora-components", + default=["q_proj", "v_proj"], + nargs="+", + choices=[ + "k_proj", + "q_proj", + "v_proj", + "out_proj", + "intermediate_dense", + "output_dense", + ], + help="list of components where we apply LoRA, eg [Wq, Wv]", + ) + parser.add_argument("--lora-rank", default=4, help="rank of LoRA") + parser.add_argument("--lora-alpha", default=1.0, help="scale for LoRA") + parser.add_argument("--lora-dropout", default=0.0, help="dropout rate for LoRA") + parser.add_argument( + "--lora-merge-weights", + default=True, + action=ActionYesNo, + help="lora weights are merged with the pretrained weights at inference.", ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - return args @staticmethod def add_class_args(parser, prefix=None, skip=set()): @@ -703,36 +901,22 @@ def add_class_args(parser, prefix=None, skip=set()): "when the signal is evaluated chunk by chunk." ), ) - parser.add_argument( - "--feat-extractor-lr", - default=None, - type=float, - help=( - "lr for conv feature extractor, it serves to set a lr " - "different than the global one." - ), - ) - parser.add_argument( - "--encoder-lr", - default=None, - type=float, - help=( - "lr for transformer encoder, it serves to set a lr " - "different than the global one." - ), - ) + + HFWav2VecBase._add_lr_args(parser) + HFWav2VecBase._add_lora_args(parser) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @staticmethod def filter_finetune_args(**kwargs): - valid_args = ( - "override_dropouts", - "override_spec_augment", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - return args + return filter_func_args(HFWav2VecBase.change_config, **kwargs) + # valid_args = ( + # "override_dropouts", + # "override_spec_augment", + # ) + # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + # return args @staticmethod def add_finetune_args(parser, prefix=None, skip=set()): @@ -759,23 +943,13 @@ def add_finetune_args(parser, prefix=None, skip=set()): ), ) parser.add_argument( - "--feat-extractor-lr", - default=None, - type=float, - help=( - "lr for conv feature extractor, it serves to set a lr " - "different than the global one." - ), - ) - parser.add_argument( - "--encoder-lr", - default=None, - type=float, - help=( - "lr for transformer encoder, it serves to set a lr " - "different than the global one." 
- ), + "--override-lora", + default=False, + action=ActionYesNo, + help=("whether to change the config of LoRA layers in the model."), ) + HFWav2VecBase._add_lr_args(parser) + HFWav2VecBase._add_lora_args(parser) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index dd446576..51f0f37a 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -55,7 +55,6 @@ def __init__( sparse_trials: bool = False, table_sep: Optional[str] = None, ): - if isinstance(segments, SegmentSet): self._segments = segments self._segments_path = None @@ -82,10 +81,12 @@ def __init__( features, FeatureSet ) self._enrollments, self._enrollments_paths = self._parse_dict_args( - enrollments, EnrollmentMap, + enrollments, + EnrollmentMap, ) self._trials, self._trials_paths = self._parse_dict_args( - trials, (TrialKey, TrialNdx, SparseTrialKey), + trials, + (TrialKey, TrialNdx, SparseTrialKey), ) self.sparse_trials = sparse_trials @@ -711,7 +712,8 @@ def add_features(self, features_name: str, features: Union[PathLike, FeatureSet] raise ValueError() def set_segments( - self, segments: Union[PathLike, SegmentSet], update_seg_durs: bool, + self, + segments: Union[PathLike, SegmentSet], ): if isinstance(segments, (str, Path)): self._segments = None @@ -723,7 +725,9 @@ def set_segments( raise ValueError() def set_recordings( - self, recordings: Union[PathLike, RecordingSet], update_seg_durs: bool, + self, + recordings: Union[PathLike, RecordingSet], + update_seg_durs: bool = False, ): if isinstance(recordings, (str, Path)): self._recordings = None @@ -753,7 +757,9 @@ def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]): raise ValueError() def add_enrollments( - self, enrollments_name: str, enrollments: Union[PathLike, EnrollmentMap], + self, + enrollments_name: str, + enrollments: Union[PathLike, EnrollmentMap], ): if self._enrollments is None: self._enrollments = {} @@ -793,7 +799,9 @@ def remove_features(self, features_name: str): del self._features[features_name] del self._features_paths[features_name] - def remove_recordings(self,): + def remove_recordings( + self, + ): if self._recordings_path is not None: self._files_to_delete.append(self._recordings_path) @@ -820,7 +828,8 @@ def remove_classes(self, classes_name: str): del self._classes_paths[classes_name] def remove_enrollments( - self, enrollments_name: str, + self, + enrollments_name: str, ): if self._enrollments_paths[enrollments_name] is not None: self._files_to_delete.append(self._enrollments_paths[enrollments_name]) @@ -829,7 +838,8 @@ def remove_enrollments( del self._enrollments_paths[enrollments_name] def remove_trials( - self, trials_name: str, + self, + trials_name: str, ): if self._trials_paths[trials_name] is not None: self._files_to_delete.append(self._trials_paths[trials_name]) @@ -981,14 +991,20 @@ def split_into_trials_and_cohort( segments_male = SegmentSet(segments[segments["gender"] == "m"]) segments_female = SegmentSet(segments[segments["gender"] == "f"]) trials_male, enroll_male, cohort_male = self._split_into_trials_and_cohort( - segments_male, num_tar_trials, num_trial_speakers, seed, + segments_male, + num_tar_trials, + num_trial_speakers, + seed, ) ( trials_female, enroll_female, cohort_female, ) = self._split_into_trials_and_cohort( - segments_female, num_tar_trials, num_trial_speakers, seed, + segments_female, + num_tar_trials, + num_trial_speakers, + seed, ) trials = 
TrialKey.merge([trials_male, trials_female]) enroll = EnrollmentMap.cat([enroll_male, enroll_female]) @@ -996,7 +1012,10 @@ def split_into_trials_and_cohort( else: segments = self.segments() trials, enroll, cohort = self._split_into_trials_and_cohort( - segments, num_tar_trials, num_trial_speakers, seed, + segments, + num_tar_trials, + num_trial_speakers, + seed, ) dataset_trials = self.clone() @@ -1019,7 +1038,10 @@ def remove_short_segments(self, min_length: float, length_name: str = "duration" self.clean() def remove_classes_few_segments( - self, class_name: str, min_segs: int, rebuild_idx: bool = False, + self, + class_name: str, + min_segs: int, + rebuild_idx: bool = False, ): segments = self.segments() classes, counts = np.unique(segments[class_name], return_counts=True) @@ -1082,7 +1104,10 @@ def _segments_split_joint_classes( return train_segs, val_segs def _segments_split_disjoint_classes( - self, val_prob: float, disjoint_classes: List[str], rng: np.random.Generator, + self, + val_prob: float, + disjoint_classes: List[str], + rng: np.random.Generator, ): segments = self.segments() classes = segments[disjoint_classes].apply("-".join, axis=1) @@ -1165,15 +1190,24 @@ def split_train_val( train_segs, val_segs = self._segments_split(val_prob, rng) elif joint_classes is not None and disjoint_classes is None: train_segs, val_segs = self._segments_split_joint_classes( - val_prob, joint_classes, min_train_samples, rng, + val_prob, + joint_classes, + min_train_samples, + rng, ) elif joint_classes is None and disjoint_classes is not None: train_segs, val_segs = self._segments_split_disjoint_classes( - val_prob, disjoint_classes, rng, + val_prob, + disjoint_classes, + rng, ) else: train_segs, val_segs = self._segments_split_joint_and_disjoint_classes( - val_prob, joint_classes, disjoint_classes, min_train_samples, rng, + val_prob, + joint_classes, + disjoint_classes, + min_train_samples, + rng, ) train_ds = self.clone() diff --git a/requirements.txt b/requirements.txt index c3410829..1e1aea9b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,6 @@ memory_profiler gdown fairscale==0.4.4 tensorboard>=2.5.0 -yapf jsonargparse>=3.5.0 wandb>=0.10.30 librosa>=0.8.1 @@ -22,3 +21,6 @@ twine wheel transformers>=4.16.2 sentencepiece>=0.1.97 +loralib +lhotse + From 71f629d94aa981ea39a87b1a9a0afe8ab257b2a5 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Tue, 12 Sep 2023 22:24:37 -0400 Subject: [PATCH 81/89] add lora into ASR (haven't tested) --- egs/commonvoice/v1/cmd.sh | 2 +- ...v2vec2base_rnnt_k2_pruned_stage1_v6.0.yaml | 88 +++++++++++++++++++ egs/commonvoice/v1/datapath.sh | 2 +- .../config_pruned_transducer_v6.0_13langs.sh | 44 ++++++++++ .../wav2transducer/hf_wav2rnn_transducer.py | 18 ++++ 5 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v6.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_transducer_v6.0_13langs.sh diff --git a/egs/commonvoice/v1/cmd.sh b/egs/commonvoice/v1/cmd.sh index cedd70f9..697d5219 100755 --- a/egs/commonvoice/v1/cmd.sh +++ b/egs/commonvoice/v1/cmd.sh @@ -18,7 +18,7 @@ if [ "$(hostname -d)" == "cm.gemini" ];then export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 10G" #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" -elif [ "$(hostname -d)" == "rockfish.cluster" ];then +elif [ "$(hostname -d)" == "cm.cluster" ];then export 
train_cmd="slurm.pl --config conf/slurm.conf --mem 4G" export cuda_cmd="slurm.pl --config conf/slurm.conf --mem 20G" export cuda_eval_cmd="$train_cmd" diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v6.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v6.0.yaml new file mode 100644 index 00000000..54ccd48e --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v6.0.yaml @@ -0,0 +1,88 @@ +# for LoRA ASR +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.3 + + data_loader: + num_workers: 1 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 1 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + use_lora: true + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-lora + + \ No newline at end of file diff --git a/egs/commonvoice/v1/datapath.sh b/egs/commonvoice/v1/datapath.sh index 56b242ed..a1430c8b 100644 --- a/egs/commonvoice/v1/datapath.sh +++ b/egs/commonvoice/v1/datapath.sh @@ -9,7 +9,7 @@ if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then musan_root=/export/corpora5/JHU/musan echo "Put your database paths here" exit 1 -elif [ "$(hostname --domain)" == "rockfish.cluster" ];then +elif [ "$(hostname --domain)" == "cm.cluster" ];then commonvoice_root=/data/jvillal7/corpora/commonvoice musan_root=/data/jvillal7/corpora/musan elif [ "$(hostname --domain)" == "cm.gemini" ];then diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v6.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v6.0_13langs.sh new file mode 100644 index 00000000..cce21f4c --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v6.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio 
+test_data="ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio" + +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v6.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v6.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0015.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v6.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0015.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py index 1d16675c..8fc59a3d 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py @@ -226,6 +226,9 @@ def freeze_feat_fuser(self): def freeze_hf_feats(self): self.hf_feats.freeze() + def freeze_hf_except_lora(self, bias=None): + self.hf_feats.freeze_except_lora(bias) + def freeze_hf_feature_encoder(self): self.hf_feats.freeze_feature_encoder() @@ -247,6 +250,15 @@ def set_train_mode(self, mode): elif mode == "hf-feat-extractor-frozen": self.unfreeze() self.freeze_hf_feature_encoder() + elif mode == "hf-lora": + self.unfreeze() + self.freeze_hf_except_lora() + elif mode == "hf-all-bias-lora": + self.unfreeze() + self.freeze_hf_except_lora(bias="all") + elif mode == "hf-lora-with-bias": + self.unfreeze() + self.freeze_hf_except_lora(bias="lora_only") else: raise ValueError(f"invalid train_mode={mode}") @@ -270,6 +282,9 @@ def _train(self, train_mode: str): "ft-transducer-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", + "hf-lora", + "hf-all-bias-lora", + "hf-lora-with-bias", ]: self.hf_feats.train() self.transducer._train("full") @@ -287,6 +302,9 @@ def valid_train_modes(): "ft-transducer-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", + "hf-lora", + "hf-all-bias-lora", + "hf-lora-with-bias", ] @staticmethod From a75610ee27acf2cd15ecc38151f5efff6fa09623 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 13 Sep 2023 10:59:46 -0400 Subject: [PATCH 82/89] vox2.1 working and lora --- egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml | 13 ++-- ...lsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml | 13 ++-- ...c2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...baseplus9l_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...lmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...lmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml | 13 ++-- ...lmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml | 13 ++-- ...lmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...lmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml | 13 ++-- 
...lmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml | 13 ++-- ...avlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml | 13 ++-- ...avlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml | 13 ++-- ...wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml | 13 ++-- ...wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml | 13 ++-- ...rge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml | 71 ++++++++++++++++++ ...rge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml | 74 +++++++++++++++++++ ...vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh | 55 ++++++++++++++ hyperion/torch/layers/lora.py | 52 +++++++++++-- hyperion/torch/models/xvectors/xvector.py | 36 +-------- hyperion/torch/tpm/hf/hf_wav2vec_base.py | 22 +++++- 22 files changed, 378 insertions(+), 140 deletions(-) create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh diff --git a/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml b/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml index 4fdf8068..86f55073 100644 --- a/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml +++ b/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml @@ -4,32 +4,31 @@ reverb_aug: rir_types: smallroom: weight: 1 - rir_path: scp:data/rirs_smallroom/rirs.scp + rir_path: csv:data/rirs_smallroom/rirs.csv rir_norm: max mediumroom: weight: 1 - rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_path: csv:data/rirs_mediumroom/rirs.csv rir_norm: max realroom: weight: 1 - rir_path: scp:data/rirs_real/rirs.scp + rir_path: csv:data/rirs_real/rirs.csv rir_norm: max noise_aug: noise_prob: 0.7 noise_types: noise: weight: 1 - noise_path: data/musan_noise_proc_audio/wav.scp + noise_path: data/musan_noise_proc_audio/recordings.csv min_snr: 0 max_snr: 18 music: weight: 1 - noise_path: data/musan_music_proc_audio/wav.scp + noise_path: data/musan_music_proc_audio/recordings.csv min_snr: 3 max_snr: 18 babble: weight: 1 - noise_path: data/musan_speech_babble/wav.scp + noise_path: data/musan_speech_babble/recordings.csv min_snr: 3 max_snr: 18 - diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml index ad991124..ffd2f374 100644 --- a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git 
a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml index 0b1d0454..7dcc56ef 100644 --- a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml index 254ff796..3f5c46bc 100644 --- a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml index 52be6db5..9e1d0928 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: 
min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml index bd3e7f86..0d0dc398 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wavlmbaseplus_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml index 69a8322b..8504db9e 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: @@ -60,4 +60,5 @@ trainer: log_interval: 1000 epochs: 8 eff_batch_size: 512 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml index 3443591a..dda0c632 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -15,7 +15,7 @@ data: max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker 
weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -25,11 +25,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -38,7 +38,7 @@ data: max_chunk_length: 3.0 min_chunk_length: 3.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -70,4 +70,5 @@ trainer: log_interval: 1000 epochs: 4 eff_batch_size: 256 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml index abe5da6e..46ee7d18 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wavlmlarge12l_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml index 7287188c..db36f8ee 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 64 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 64 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: @@ -60,4 +60,5 @@ trainer: log_interval: 1000 epochs: 8 eff_batch_size: 512 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml index 3443591a..dda0c632 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - 
conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -15,7 +15,7 @@ data: max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -25,11 +25,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -38,7 +38,7 @@ data: max_chunk_length: 3.0 min_chunk_length: 3.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -70,4 +70,5 @@ trainer: log_interval: 1000 epochs: 4 eff_batch_size: 256 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml index 69a8322b..8504db9e 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: @@ -60,4 +60,5 @@ trainer: log_interval: 1000 epochs: 8 eff_batch_size: 512 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml index 5e1260ad..ad56e80d 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -15,7 +15,7 @@ data: max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -25,11 +25,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -38,7 +38,7 @@ data: max_chunk_length: 3.0 min_chunk_length: 3.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -70,4 +70,5 @@ trainer: log_interval: 1000 epochs: 4 eff_batch_size: 256 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml index 
2addaa1e..40341a27 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wavlmlarge_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml index 69a8322b..8504db9e 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: @@ -60,4 +60,5 @@ trainer: log_interval: 1000 epochs: 8 eff_batch_size: 512 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml index 5e1260ad..ad56e80d 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -15,7 +15,7 @@ data: max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -25,11 +25,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -38,7 +38,7 @@ data: max_chunk_length: 3.0 min_chunk_length: 3.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -70,4 +70,5 @@ trainer: log_interval: 1000 epochs: 4 
eff_batch_size: 256 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..b5b9b6b6 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: + hf_feats: + override_lora: true + use_lora: true + lora_rank: 4 + lora_components: + - q_proj + - v_proj + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + target_key: speaker + train_mode: hf-lora diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..a39445ff --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + target_key: speaker + train_mode: hf-lora diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh 
b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..96ef76c5 --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,55 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_name=${hf_model_name}_loraqv_ecapatdnn512x3_v2.0 +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/hyperion/torch/layers/lora.py b/hyperion/torch/layers/lora.py index 1436caf5..18401669 100644 --- a/hyperion/torch/layers/lora.py +++ b/hyperion/torch/layers/lora.py @@ -7,7 +7,47 @@ import loralib as lora import torch.nn as nn -from loralib import * +from loralib import mark_only_lora_as_trainable + + +def repr_lora(self, str_base): + if isinstance(self.lora_dropout, nn.Dropout): + lora_dropout = self.lora_dropout.p + else: + lora_dropout = 0 + + str_lora = f", r={self.r}, alpha={self.lora_alpha}, dropout={lora_dropout}, merge_weights={self.merge_weights})" + return str_base[:-1] + str_lora + + +class LinearLoRA(lora.Linear): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class EmbeddingLoRA(lora.Embedding): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class Conv1dLoRA(lora.Conv1d): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class Conv2dLoRA(lora.Conv2d): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class Conv3dLoRA(lora.Conv3d): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) class LoRAFactory: @@ -19,7 +59,7 @@ def create_from_pretrained( merge_weights: bool = True, ): if isinstance(layer, nn.Embedding): - lora_layer = lora.Embedding( + lora_layer = EmbeddingLoRA( layer.num_embeddings, layer.embedding_dim, padding_idx=layer.padding_idx, @@ -36,7 +76,7 @@ def create_from_pretrained( elif isinstance(layer, nn.Linear): bias = layer.bias is not None - lora_layer = lora.Linear( + lora_layer = LinearLoRA( layer.in_features, layer.out_features, bias=bias, @@ -51,11 +91,11 @@ def create_from_pretrained( elif isinstance(layer, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): if isinstance(layer, nn.Conv1d): - lora_class = lora.Conv1d + lora_class = Conv1dLoRA elif isinstance(layer, nn.Conv2d): - lora_class = lora.Conv2d + lora_class = 
Conv2dLoRA elif isinstance(layer, nn.Conv3d): - lora_class = lora.Conv3d + lora_class = Conv3dLoRA bias = layer.bias is not None lora_layer = lora_class( diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index d67785d2..9ccd0d31 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -6,10 +6,9 @@ from enum import Enum from typing import Optional -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ....utils.misc import filter_func_args from ...layer_blocks import TDNNBlock @@ -52,7 +51,6 @@ def __init__( in_feats=None, proj_feats=None, ): - super().__init__() # encoder network @@ -407,7 +405,6 @@ def extract_embed_slidwin( embed_layer=None, detach_chunks=False, ): - if feat_frame_shift is not None: # assume win_length/shift are in secs, transform to frames # pass feat times from msecs to secs @@ -464,7 +461,6 @@ def compute_slidwin_timestamps( feat_frame_shift=10, feat_snip_edges=False, ): - P = self.compute_slidwin_left_padding( win_length, win_shift, @@ -495,7 +491,6 @@ def compute_slidwin_left_padding( feat_frame_shift=10, feat_snip_edges=False, ): - # pass feat times from msecs to secs feat_frame_shift = feat_frame_shift / 1000 feat_frame_length = feat_frame_length / 1000 @@ -526,7 +521,6 @@ def compute_slidwin_left_padding( return P1 + P2 def get_config(self): - enc_cfg = self.encoder_net.get_config() pool_cfg = PF.get_config(self.pool_net) @@ -694,42 +688,14 @@ def valid_train_modes(): @staticmethod def filter_args(**kwargs): - # get arguments for pooling pool_args = PF.filter_args(**kwargs["pool_net"]) args = filter_func_args(ClassifHead.__init__, kwargs) args["pool_net"] = pool_args return args - # valid_args = ( - # "num_classes", - # "embed_dim", - # "num_embed_layers", - # "hid_act", - # "loss_type", - # "cos_scale", - # "margin", - # "margin_warmup_epochs", - # "intertop_k", - # "intertop_margin", - # "num_subcenters", - # "use_norm", - # "norm_before", - # "in_feats", - # "proj_feats", - # "dropout_rate", - # "norm_layer", - # "head_norm_layer", - # "head_use_in_norm", - # ) - # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - # args["pool_net"] = pool_args - # return args - @staticmethod def add_class_args(parser, prefix=None, skip=set()): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index 2c8d239f..a981d1ec 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -545,6 +545,24 @@ def forward_impl( """ max_in_length = x.size(-1) x, x_mask = self._preprocess(x, x_lengths) + if ddp_get_rank() == 0: + lora_layer = self.hf_model.encoder.layers[0].attention.v_proj + # print( + # "lora\nw=", + # lora_layer.weight[:3, :3], + # "\na=", + # lora_layer.lora_A[:3, :3], + # "\nb=", + # lora_layer.lora_B[:3, :3], + # "\n", + # "merged=", + # lora_layer.merged, + # "training=", + # lora_layer.training, + # flush=True, + # ) + assert self.training == lora_layer.training + assert self.training == (not lora_layer.merged) output = self.hf_model( x, x_mask, @@ -728,7 +746,7 @@ def save(self, file_path: str): @staticmethod def filter_args(**kwargs): - return filter_func_args(HFWav2VecBase.__init__, **kwargs) + return filter_func_args(HFWav2VecBase.__init__, kwargs) # valid_args = 
( # "pretrained_model_path", # "normalize_input", @@ -910,7 +928,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - return filter_func_args(HFWav2VecBase.change_config, **kwargs) + return filter_func_args(HFWav2VecBase.change_config, kwargs) # valid_args = ( # "override_dropouts", # "override_spec_augment", From c23103ee406a833726516ff8ac35b3a06382e97e Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 13 Sep 2023 19:32:11 -0400 Subject: [PATCH 83/89] lora in wavlm and hubert --- hyperion/torch/tpm/hf/hf_hubert.py | 25 ++++++++++++++++++++----- hyperion/torch/tpm/hf/hf_wav2vec2.py | 8 +++++++- hyperion/torch/tpm/hf/hf_wavlm.py | 25 ++++++++++++++++++++----- 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py index 2957e433..32355bf6 100644 --- a/hyperion/torch/tpm/hf/hf_hubert.py +++ b/hyperion/torch/tpm/hf/hf_hubert.py @@ -6,11 +6,10 @@ import os from typing import Callable, List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import HubertConfig, HubertModel - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import HubertConfig, HubertModel from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase @@ -137,6 +136,12 @@ class HFHubert(HFWav2VecBase): sample_frequency: (`int`) waveform sample frequency used to train the model. feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. """ def __init__( @@ -186,8 +191,12 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, ): - super().__init__( pretrained_model_path=pretrained_model_path, normalize_input=normalize_input, @@ -205,6 +214,12 @@ def __init__( sample_frequency=sample_frequency, feat_extract_lr=feat_extract_lr, encoder_lr=encoder_lr, + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -618,7 +633,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - args_base = HFWav2VecBase.filter_args(**kwargs) + args_base = HFWav2VecBase.filter_finetune_args(**kwargs) valid_args = ( "hidden_dropout", "activation_dropout", diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index dd5de2fe..bc98f460 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -149,6 +149,12 @@ class HFWav2Vec2(HFWav2VecBase): sample_frequency: (`int`) waveform sample frequency used to train the model. 
feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. """ def __init__( @@ -697,7 +703,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - args_base = HFWav2VecBase.filter_args(**kwargs) + args_base = HFWav2VecBase.filter_finetune_args(**kwargs) valid_args = ( "hidden_dropout", "activation_dropout", diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py index e1b67d81..400e6a8b 100644 --- a/hyperion/torch/tpm/hf/hf_wavlm.py +++ b/hyperion/torch/tpm/hf/hf_wavlm.py @@ -6,11 +6,10 @@ import os from typing import Callable, List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import WavLMConfig, WavLMModel - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import WavLMConfig, WavLMModel from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase @@ -150,6 +149,12 @@ class HFWavLM(HFWav2VecBase): sample_frequency: (`int`) waveform sample frequency used to train the model. feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. 
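The lora_* options documented above configure the standard low-rank adaptation update. A minimal sketch of what they control is given below; it assumes a plain nn.Linear being adapted and uses illustrative names (LoRALinearSketch), not the hyperion or loralib classes:

import math

import torch
import torch.nn as nn


class LoRALinearSketch(nn.Module):
    """Frozen base weight W plus a trainable low-rank update (lora_alpha / r) * B @ A."""

    def __init__(self, in_features, out_features, r=4, lora_alpha=1, lora_dropout=0.0):
        super().__init__()
        # the pretrained weight stays frozen; only lora_A and lora_B are trained
        self.weight = nn.Parameter(torch.randn(out_features, in_features), requires_grad=False)
        self.lora_A = nn.Parameter(torch.zeros(r, in_features))
        self.lora_B = nn.Parameter(torch.zeros(out_features, r))
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))  # B starts at zero, so the update starts at zero
        self.scaling = lora_alpha / r
        self.dropout = nn.Dropout(lora_dropout)

    def forward(self, x):
        base = x @ self.weight.t()
        update = self.dropout(x) @ self.lora_A.t() @ self.lora_B.t() * self.scaling
        return base + update

With lora_merge_weights, the folded matrix W + scaling * B @ A would replace the two-term sum at inference; with the configs above (lora_rank: 4 on q_proj and v_proj, train_mode: hf-lora), only these low-rank factors receive gradients.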
""" def __init__( @@ -204,8 +209,12 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, ): - super().__init__( pretrained_model_path=pretrained_model_path, normalize_input=normalize_input, @@ -223,6 +232,12 @@ def __init__( sample_frequency=sample_frequency, feat_extract_lr=feat_extract_lr, encoder_lr=encoder_lr, + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -687,7 +702,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - args_base = HFWav2VecBase.filter_args(**kwargs) + args_base = HFWav2VecBase.filter_finetune_args(**kwargs) valid_args = ( "hidden_dropout", "activation_dropout", From 81c540b1492ec7b42299f0ebb871f6af66d11304 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 15 Sep 2023 12:35:56 -0400 Subject: [PATCH 84/89] fix bug in w2v constructors with lora --- ...v2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh | 6 +++--- egs/voxceleb/v2.1/run_006_extract_xvectors.sh | 2 +- hyperion/torch/tpm/hf/hf_hubert.py | 10 ++++++++++ hyperion/torch/tpm/hf/hf_wav2vec2.py | 9 +++++++++ hyperion/torch/tpm/hf/hf_wav2vec_base.py | 17 ++++------------- hyperion/torch/tpm/hf/hf_wavlm.py | 10 ++++++++++ 6 files changed, 37 insertions(+), 17 deletions(-) diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh index 96ef76c5..1985b8e6 100644 --- a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh @@ -37,9 +37,9 @@ nnet_s3=$nnet_s3_dir/model_ep0004.pth # back-end do_plda=false -do_snorm=true -do_qmf=true -do_voxsrc22=true +#do_snorm=true +#do_qmf=true +#do_voxsrc22=true plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 diff --git a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh index 2cfe27fe..72b019cd 100755 --- a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh +++ b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh @@ -17,7 +17,7 @@ xvec_chunk_length=120.0 . 
$config_file if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu true --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" + xvec_args="--use-gpu --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" num_gpus=1 else diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py index 32355bf6..638bf561 100644 --- a/hyperion/torch/tpm/hf/hf_hubert.py +++ b/hyperion/torch/tpm/hf/hf_hubert.py @@ -191,6 +191,7 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + use_lora: bool = False, lora_components: List[str] = ["q_proj", "v_proj"], lora_rank: int = 4, lora_alpha: int = 1, @@ -298,6 +299,15 @@ def __init__( if drop_layers_gt is not None: self.drop_upper_layers(drop_layers_gt) + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + self.ignore_pretrained = True @property diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index bc98f460..5b59d79a 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -322,6 +322,15 @@ def __init__( if drop_layers_gt is not None: self.drop_upper_layers(drop_layers_gt) + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + self.ignore_pretrained = True @property diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index a981d1ec..e0bcee1c 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -174,15 +174,6 @@ def __init__( self._frame_shift = None self.hf_model = None - if use_lora: - self._make_lora_layers( - lora_components, - lora_rank, - lora_alpha, - lora_dropout, - lora_merge_weights, - ) - def __deepcopy__(self, memo): """Reimplementation of deepcopy for Hugging Face models. The weight_norm in the Conv. Pos. Encoder of Wav2Vec models make the default deepcopy to fail. 
@@ -545,8 +536,8 @@ def forward_impl( """ max_in_length = x.size(-1) x, x_mask = self._preprocess(x, x_lengths) - if ddp_get_rank() == 0: - lora_layer = self.hf_model.encoder.layers[0].attention.v_proj + # if ddp_get_rank() == 0: + # lora_layer = self.hf_model.encoder.layers[0].attention.v_proj # print( # "lora\nw=", # lora_layer.weight[:3, :3], @@ -561,8 +552,8 @@ def forward_impl( # lora_layer.training, # flush=True, # ) - assert self.training == lora_layer.training - assert self.training == (not lora_layer.merged) + # assert self.training == lora_layer.training + # assert self.training == (not lora_layer.merged) output = self.hf_model( x, x_mask, diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py index 400e6a8b..1db5fa23 100644 --- a/hyperion/torch/tpm/hf/hf_wavlm.py +++ b/hyperion/torch/tpm/hf/hf_wavlm.py @@ -209,6 +209,7 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + use_lora: bool = False, lora_components: List[str] = ["q_proj", "v_proj"], lora_rank: int = 4, lora_alpha: int = 1, @@ -321,6 +322,15 @@ def __init__( if drop_layers_gt is not None: self.drop_upper_layers(drop_layers_gt) + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + self.ignore_pretrained = True @property From a54c963d8d40dbcb49604ef3febb909768e02790 Mon Sep 17 00:00:00 2001 From: ylu125 Date: Sat, 23 Sep 2023 17:01:27 -0400 Subject: [PATCH 85/89] update default argument of lora_merge_weights to false --- hyperion/torch/layers/lora.py | 2 +- hyperion/torch/tpm/hf/hf_wav2vec2.py | 5 ++++- hyperion/torch/tpm/hf/hf_wav2vec_base.py | 10 +++++++--- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/hyperion/torch/layers/lora.py b/hyperion/torch/layers/lora.py index 18401669..91279119 100644 --- a/hyperion/torch/layers/lora.py +++ b/hyperion/torch/layers/lora.py @@ -56,7 +56,7 @@ def create_from_pretrained( r: int = 8, lora_alpha: int = 1, lora_dropout: float = 0.0, - merge_weights: bool = True, + merge_weights: bool = False, ): if isinstance(layer, nn.Embedding): lora_layer = EmbeddingLoRA( diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index 5b59d79a..901c5072 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -204,6 +204,7 @@ def __init__( ignore_pretrained: bool = False, override_dropouts: bool = False, override_spec_augment: bool = False, + override_lora: bool = False, left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, @@ -214,7 +215,7 @@ def __init__( lora_rank: int = 4, lora_alpha: int = 1, lora_dropout: float = 0.0, - lora_merge_weights: bool = True, + lora_merge_weights: bool = False, ): super().__init__( pretrained_model_path=pretrained_model_path, @@ -228,6 +229,7 @@ def __init__( ignore_pretrained=ignore_pretrained, override_dropouts=override_dropouts, override_spec_augment=override_spec_augment, + override_lora=override_lora, left_encoder_context=left_encoder_context, right_encoder_context=right_encoder_context, sample_frequency=sample_frequency, @@ -269,6 +271,7 @@ def __init__( self.change_config( override_dropouts=self.override_dropouts, override_spec_augment=self.override_spec_augment, + override_lora=self.override_lora, hidden_dropout=hidden_dropout, activation_dropout=activation_dropout, attention_dropout=attention_dropout, diff --git 
a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index e0bcee1c..21dbcd54 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -15,6 +15,7 @@ from ....utils.misc import filter_func_args from ...layers import LoRAFactory +import loralib as lora from ...torch_model import TorchModel from ...utils import scale_seq_lengths, seq_lengths_to_mask from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs @@ -77,6 +78,7 @@ def __init__( ignore_pretrained: bool = False, override_dropouts: bool = False, override_spec_augment: bool = False, + override_lora: bool = False, left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, @@ -87,7 +89,7 @@ def __init__( lora_rank: int = 4, lora_alpha: int = 1, lora_dropout: float = 0.0, - lora_merge_weights: bool = True, + lora_merge_weights: bool = False, ): super().__init__() self.pretrained_model_path = pretrained_model_path @@ -99,6 +101,7 @@ def __init__( self.ignore_pretrained = ignore_pretrained self.override_dropouts = override_dropouts self.override_spec_augment = override_spec_augment + self.override_lora = override_lora self.right_encoder_context = right_encoder_context self.left_encoder_context = left_encoder_context self.feat_extract_lr = feat_extract_lr @@ -253,7 +256,7 @@ def change_config( lora_rank: int = 4, lora_alpha: int = 1, lora_dropout: float = 0.0, - lora_merge_weights: bool = True, + lora_merge_weights: bool = False, **kwargs, ): if override_spec_augment: @@ -304,7 +307,7 @@ def change_lora( lora_rank: int = 4, lora_alpha: int = 1, lora_dropout: float = 0.0, - lora_merge_weights: bool = True, + lora_merge_weights: bool = False, ): if not self.use_lora: if use_lora: @@ -714,6 +717,7 @@ def get_config(self): "ignore_pretrained": self.ignore_pretrained, "override_dropouts": self.override_dropouts, "override_spec_augment": self.override_spec_augment, + "override_lora": self.override_lora, "left_encoder_context": self.left_encoder_context, "right_encoder_context": self.right_encoder_context, "sample_frequency": self.sample_frequency, From 6a72173026af1a7d57cb1cc0dfb99cd62ba2975c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 25 Sep 2023 00:42:36 +0000 Subject: [PATCH 86/89] update config for 4 langs experiment --- ...ase_rnnt_k2_pruned_4langs_stage1_v4.0.yaml | 87 +++++++++++++++++++ .../config_pruned_transducer_v4.0_4langs.sh | 46 ++++++++++ 2 files changed, 133 insertions(+) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_4langs_stage1_v4.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_4langs.sh diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_4langs_stage1_v4.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_4langs_stage1_v4.0.yaml new file mode 100644 index 00000000..465cfcdb --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_k2_pruned_4langs_stage1_v4.0.yaml @@ -0,0 +1,87 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 1.0 + num_chunks_per_seg_epoch: 0.6 + + data_loader: + 
num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 50 + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + transducer: + decoder: + prune_range: 15 + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + reduction: mean + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 45000 + hold_steps: 30000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_4langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_4langs.sh new file mode 100644 index 00000000..424c2649 --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_transducer_v4.0_4langs.sh @@ -0,0 +1,46 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=4_langs_train_proc_audio +dev_data=4_langs_dev_proc_audio + +test_data="tr_test_proc_audio fr_test_proc_audio de_test_proc_audio it_test_proc_audio" + + +lans="tr de fr it" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_4langs_stage1_v4.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v4.0_4_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0015.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage2_v4.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0015.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From e15b227c66e80cb69e600ddde9a0b56ef32bd389 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 30 Sep 2023 17:01:48 +0000 Subject: [PATCH 87/89] Add FiLM inside the Wav2vec2 --- ...2base_rnnt_film_k2_pruned_stage3_v7.0.yaml | 98 + ...g_pruned_filmed_transducer_v7.0_13langs.sh | 44 + hyperion/torch/tpm/hf/hf_wav2vec2.py | 19 +- hyperion/torch/tpm/hf/hf_wav2vec_base.py | 164 +- .../tpm/hf/wav2vec2/modeling_wav2vec2.py | 2477 +++++++++++++++++ 5 files changed, 2788 
insertions(+), 14 deletions(-) create mode 100644 egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v7.0.yaml create mode 100644 egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v7.0_13langs.sh create mode 100644 hyperion/torch/tpm/hf/wav2vec2/modeling_wav2vec2.py diff --git a/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v7.0.yaml b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v7.0.yaml new file mode 100644 index 00000000..9ab275a6 --- /dev/null +++ b/egs/commonvoice/v1/conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v7.0.yaml @@ -0,0 +1,98 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: false + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 0.1 + + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + - language + sampler: + #sampler_type: 'bucketing_seg_sampler' + sampler_type: 'class_weighted_random_bucketing_seg_sampler' + max_batch_length: 20 + max_audio_length: 15. + min_batch_size: 1 + drop_last: true + # for class_weighted_random_bucketing_seg_sampler + base_sampler_type: class_weighted_seg_sampler + weight_mode: "data-prior" + class_name: "language" + weight_exponent: 0.3 + num_chunks_per_seg_epoch: 1.0 + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + # override_condition: true + use_condition: true + condition_size: 128 + condition_components: + - attention + condition_type: "one-hot" + transducer: + decoder: + prune_range: 15 + reduction: mean + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + condition_size: 128 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: film-fused-feature + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.0002 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.6 + decay_steps: 30000 + hold_steps: 20000 + min_lr: 4e-5 + warmup_steps: 6000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: full + + diff --git a/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v7.0_13langs.sh b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v7.0_13langs.sh new file mode 100644 index 00000000..b101854c --- /dev/null +++ b/egs/commonvoice/v1/global_conf/config_pruned_filmed_transducer_v7.0_13langs.sh @@ -0,0 +1,44 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=13_langs_train_proc_audio +dev_data=13_langs_dev_proc_audio +test_data="en_test_proc_audio ca_test_proc_audio" +#ga-IE_test_proc_audio br_test_proc_audio sl_test_proc_audio cv_test_proc_audio tt_test_proc_audio tr_test_proc_audio cy_test_proc_audio it_test_proc_audio 
kab_test_proc_audio fr_test_proc_audio de_test_proc_audio ca_test_proc_audio en_test_proc_audio +lans="sl ga-IE cv br tr cy tt ca kab de fr it en" +language=13_langs_weighted + +# bpe_model=data/13_langs_lang_bpe_4000/bpe.model +bpe_model=data/13_langs_weighted_lang_bpe_8000/bpe.model +# bpe_model=data/13_langs_weighted_lang_bpe_16000/bpe.model + + + +nnet_type=hf_wav2vec2rnn_filmed_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage3_v7.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned_film.v7.0_13_langs_weighted_8000_bpe +nnet_s1_name=$nnet_name.s3 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0015.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage4_v7.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s4 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0005.pth + +nnet_s3_base_cfg=conf/train_wav2vec2base_rnnt_film_k2_pruned_stage5_v7.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s5 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0011.pth diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index 901c5072..d2638acd 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -13,7 +13,7 @@ from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase - +from .wav2vec2.modeling_wav2vec2 import Wav2Vec2CondModel class HFWav2Vec2(HFWav2VecBase): r"""This is wrapper over HuggingFace Wav2Vec2 model. @@ -205,6 +205,7 @@ def __init__( override_dropouts: bool = False, override_spec_augment: bool = False, override_lora: bool = False, + override_condition: bool = False, left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, @@ -216,6 +217,10 @@ def __init__( lora_alpha: int = 1, lora_dropout: float = 0.0, lora_merge_weights: bool = False, + use_condition: bool = False, + condition_size: int = 128, + condition_components: List[str] = ["attention"], + condition_type: str = "one-hot", ): super().__init__( pretrained_model_path=pretrained_model_path, @@ -230,6 +235,7 @@ def __init__( override_dropouts=override_dropouts, override_spec_augment=override_spec_augment, override_lora=override_lora, + override_condition=override_condition, left_encoder_context=left_encoder_context, right_encoder_context=right_encoder_context, sample_frequency=sample_frequency, @@ -241,6 +247,10 @@ def __init__( lora_alpha=lora_alpha, lora_dropout=lora_dropout, lora_merge_weights=lora_merge_weights, + use_condition=use_condition, + condition_size=condition_size, + condition_components=condition_components, + condition_type=condition_type, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -272,6 +282,7 @@ def __init__( override_dropouts=self.override_dropouts, override_spec_augment=self.override_spec_augment, override_lora=self.override_lora, + override_condition=self.override_condition, hidden_dropout=hidden_dropout, activation_dropout=activation_dropout, attention_dropout=attention_dropout, @@ -333,6 +344,12 @@ def __init__( lora_dropout, lora_merge_weights, ) + if use_condition: + self._make_condition_layers( + condition_size, + condition_components, + condition_type, + ) self.ignore_pretrained = True diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index 21dbcd54..9f799ded 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ 
b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -19,6 +19,7 @@ from ...torch_model import TorchModel from ...utils import scale_seq_lengths, seq_lengths_to_mask from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs +from .wav2vec2.modeling_wav2vec2 import Wav2Vec2CondModel class HFWav2VecBase(TorchModel): @@ -79,6 +80,7 @@ def __init__( override_dropouts: bool = False, override_spec_augment: bool = False, override_lora: bool = False, + override_condition: bool = False, left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, @@ -90,6 +92,10 @@ def __init__( lora_alpha: int = 1, lora_dropout: float = 0.0, lora_merge_weights: bool = False, + use_condition: bool = False, + condition_size: int = 128, + condition_components: List[str] = ["attention"], + condition_type: str = "one-hot", ): super().__init__() self.pretrained_model_path = pretrained_model_path @@ -102,6 +108,7 @@ def __init__( self.override_dropouts = override_dropouts self.override_spec_augment = override_spec_augment self.override_lora = override_lora + self.override_condition = override_condition self.right_encoder_context = right_encoder_context self.left_encoder_context = left_encoder_context self.feat_extract_lr = feat_extract_lr @@ -112,6 +119,10 @@ def __init__( self.lora_alpha = lora_alpha self.lora_dropout = lora_dropout self.lora_merge_weights = lora_merge_weights + self.use_condition = use_condition + self.condition_size = condition_size + self.condition_components = condition_components + self.condition_type = condition_type if pretrained_model_path is not None and not ignore_pretrained: rank = ddp_get_rank() @@ -249,14 +260,19 @@ def change_config( override_dropouts: bool, override_spec_augment: bool, override_lora: bool, + override_condition: bool, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, use_lora: bool = False, + use_condition: bool = False, lora_components: List[str] = ["q_proj", "v_proj"], lora_rank: int = 4, lora_alpha: int = 1, lora_dropout: float = 0.0, lora_merge_weights: bool = False, + condition_size: int = 128, + condition_components: List[str] = ["attention"], + condition_type: str = "one-hot", **kwargs, ): if override_spec_augment: @@ -278,6 +294,15 @@ def change_config( lora_merge_weights=lora_merge_weights, ) + if override_condition: + logging.info(f"overriding Condition config") + self.change_condition( + use_condition=use_condition, + condition_size=condition_size, + condition_components=condition_components, + condition_type=condition_type, + ) + self.feat_extract_lr = feat_extract_lr self.encoder_lr = encoder_lr @@ -391,6 +416,53 @@ def _recursive_replace_layer_by_lora( setattr(model, name, lora_layer) counts[name] += 1 + def change_condition(self, + use_condition: bool = False, + condition_size: int = 128, + condition_components: List[str] = ["attention"], + condition_type: str = "one-hot", + ): + if not self.use_condition: + if use_condition: + self._make_condition_layers( + condition_size, + condition_components, + condition_type, + ) + else: + pass + else: + if use_condition: + pass + else: + pass + self.use_condition = use_condition + self.condition_size = condition_size + self.condition_components = condition_components + self.condition_type = condition_type + + def _make_condition_layers(self, + condition_size: int, + condition_components: List[str], + condition_type: str, + ): + # TODO: copy weight from self.hf_model to self.hf_model_with_condition + config = self.hf_model.config + config.condition_size = 
condition_size + config.condition_components = condition_components + config.condition_type = condition_type + + hf_model_with_condition = Wav2Vec2CondModel(config) + self._copy_condition_weights(self.hf_model, hf_model_with_condition) + # TODO: make weight for the FiLM layers (0,1) + self.hf_model = hf_model_with_condition + + + def _copy_condition_weights(self, hf_model, hf_model_with_condition): + for name, param in hf_model.named_parameters(): + if name in hf_model_with_condition.state_dict(): + hf_model_with_condition.state_dict()[name].data.copy_(param.data) + def change_dropouts(self, **kwargs): pass # needs to be overloaded @@ -466,6 +538,7 @@ def forward( self, x: torch.Tensor, x_lengths: Optional[torch.LongTensor] = None, + condition_features: Optional[torch.Tensor] = None, return_attentions: bool = False, return_hid_states: bool = False, chunk_length: float = 0, @@ -496,11 +569,12 @@ def forward( (tuple(torch.FloatTensor)). """ if chunk_length == 0 or x.size(1) < chunk_length * self.sample_frequency: - return self.forward_impl(x, x_lengths, return_attentions, return_hid_states) + return self.forward_impl(x, x_lengths, condition_features, return_attentions, return_hid_states) else: return self.forward_long_impl( x, x_lengths, + condition_features, return_attentions, return_hid_states, chunk_length, @@ -511,6 +585,7 @@ def forward_impl( self, x: torch.Tensor, x_lengths: Optional[torch.LongTensor] = None, + condition_features: Optional[torch.Tensor] = None, return_attentions: bool = False, return_hid_states: bool = False, ): @@ -557,12 +632,23 @@ def forward_impl( # ) # assert self.training == lora_layer.training # assert self.training == (not lora_layer.merged) - output = self.hf_model( - x, - x_mask, - output_attentions=return_attentions, - output_hidden_states=return_hid_states, - ) + + if condition_features is not None: + output = self.hf_model( + x, + condition_features, + x_mask, + output_attentions=return_attentions, + output_hidden_states=return_hid_states, + ) + + else: + output = self.hf_model( + x, + x_mask, + output_attentions=return_attentions, + output_hidden_states=return_hid_states, + ) max_out_length = output.last_hidden_state.size(1) feat_lengths = ( None @@ -577,6 +663,7 @@ def forward_long_impl( self, x: torch.Tensor, x_lengths: Optional[torch.LongTensor] = None, + condition_features: Optional[torch.Tensor] = None, return_attentions: bool = False, return_hid_states: bool = False, chunk_length: float = 120.0, @@ -633,12 +720,21 @@ def forward_long_impl( stop_i = min(start + chunk_length + right_context, x.size(1)) x_i = x[:, start_i:stop_i] x_mask_i = None if x_mask is None else x_mask[start_i:stop_i] - output_i = self.hf_model( - x_i, - x_mask_i, - output_attentions=return_attentions, - output_hidden_states=return_hid_states, - ) + if condition_features is not None: + output_i = self.hf_model( + x_i, + x_mask_i, + condition_features=condition_features, + output_attentions=return_attentions, + output_hidden_states=return_hid_states, + ) + else: + output_i = self.hf_model( + x_i, + x_mask_i, + output_attentions=return_attentions, + output_hidden_states=return_hid_states, + ) if i < num_chunks - 1: start_out_i = max( @@ -718,6 +814,7 @@ def get_config(self): "override_dropouts": self.override_dropouts, "override_spec_augment": self.override_spec_augment, "override_lora": self.override_lora, + "override_condition": self.override_condition, "left_encoder_context": self.left_encoder_context, "right_encoder_context": self.right_encoder_context, "sample_frequency": 
self.sample_frequency, @@ -729,6 +826,10 @@ def get_config(self): "lora_alpha": self.lora_alpha, "lora_dropout": self.lora_dropout, "lora_merge_weights": self.lora_merge_weights, + "use_condition": self.use_condition, + "condition_size": self.condition_size, + "condition_components": self.condition_components, + "condition_type": self.condition_type, } base_config = super().get_config() @@ -814,6 +915,34 @@ def _add_lora_args(parser): help="lora weights are merged with the pretrained weights at inference.", ) + def _add_condition_args(parser): + parser.add_argument( + "--use-condition", + default=False, + action=ActionYesNo, + help="use condition", + ) + parser.add_argument( + "--condition-size", + default=128, + type=int, + help="size of the condition", + ) + parser.add_argument( + "--condition-components", + default=["attention"], + nargs="+", + choices=["attention"], + help="list of components where we apply condition, eg [attention]", + ) + parser.add_argument( + "--condition-type", + default="one-hot", + choices=["one-hot", "learned"], + help="type of condition", + ) + + @staticmethod def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: @@ -917,6 +1046,7 @@ def add_class_args(parser, prefix=None, skip=set()): HFWav2VecBase._add_lr_args(parser) HFWav2VecBase._add_lora_args(parser) + HFWav2VecBase._add_condition_args(parser) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @@ -962,7 +1092,15 @@ def add_finetune_args(parser, prefix=None, skip=set()): help=("whether to change the config of LoRA layers in the model."), ) + parser.add_argument( + "--override-condition", + default=False, + action=ActionYesNo, + help=("whether to change the config of condition layers in the model."), + ) + HFWav2VecBase._add_lr_args(parser) HFWav2VecBase._add_lora_args(parser) + HFWav2VecBase._add_condition_args(parser) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/tpm/hf/wav2vec2/modeling_wav2vec2.py b/hyperion/torch/tpm/hf/wav2vec2/modeling_wav2vec2.py new file mode 100644 index 00000000..ceeda9a9 --- /dev/null +++ b/hyperion/torch/tpm/hf/wav2vec2/modeling_wav2vec2.py @@ -0,0 +1,2477 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
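The use_condition / condition_size / condition_components options introduced in this commit gate FiLM-style conditioning of the encoder. A minimal sketch of the feature-wise linear modulation being wired in, with illustrative names rather than the exact interface of the hyperion FiLM layer block:

import torch
import torch.nn as nn


class FiLMSketch(nn.Module):
    """Feature-wise linear modulation: h -> gamma(c) * h + beta(c)."""

    def __init__(self, hidden_size, condition_size):
        super().__init__()
        self.to_gamma = nn.Linear(condition_size, hidden_size)
        self.to_beta = nn.Linear(condition_size, hidden_size)

    def forward(self, hidden, condition):
        # hidden: (batch, time, hidden_size); condition: (batch, condition_size),
        # e.g. a one-hot or learned language vector projected to condition_size=128
        gamma = self.to_gamma(condition).unsqueeze(1)
        beta = self.to_beta(condition).unsqueeze(1)
        return gamma * hidden + beta

With condition_components: [attention], as in the v7.0 config, a block of this form would modulate the attention output of each conditioned encoder layer.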
+""" PyTorch Wav2Vec2 model.""" + +import math +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from transformers.activations import ACT2FN +# from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +from transformers.models.wav2vec2.modeling_wav2vec2 import is_deepspeed_zero3_enabled +from transformers.modeling_outputs import ( + BaseModelOutput, + CausalLMOutput, + MaskedLMOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + Wav2Vec2BaseModelOutput, + XVectorOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + cached_file, + is_safetensors_available, + logging, + replace_return_docstrings, +) + +from transformers import Wav2Vec2Config + +from ....layer_blocks import FiLM + +WAV2VEC2_ADAPTER_PT_FILE = "adapter.{}.bin" +WAV2VEC2_ADAPTER_SAFE_FILE = "adapter.{}.safetensors" + +if is_safetensors_available(): + from safetensors.torch import load_file as safe_load_file + + +logger = logging.get_logger(__name__) + + +_HIDDEN_STATES_START_POSITION = 2 + +# General docstring +_CONFIG_FOR_DOC = "Wav2Vec2Config" + +# Base docstring +_CHECKPOINT_FOR_DOC = "facebook/wav2vec2-base-960h" +_EXPECTED_OUTPUT_SHAPE = [1, 292, 768] + +# CTC docstring +_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'" +_CTC_EXPECTED_LOSS = 53.48 + +# Audio class docstring +_SEQ_CLASS_CHECKPOINT = "superb/wav2vec2-base-superb-ks" +_SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'" +_SEQ_CLASS_EXPECTED_LOSS = 6.54 + +# Frame class docstring +_FRAME_CLASS_CHECKPOINT = "anton-l/wav2vec2-base-superb-sd" +_FRAME_EXPECTED_OUTPUT = [0, 0] + +# Speaker Verification docstring +_XVECTOR_CHECKPOINT = "anton-l/wav2vec2-base-superb-sv" +_XVECTOR_EXPECTED_OUTPUT = 0.98 + + +WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/wav2vec2-base-960h", + "facebook/wav2vec2-large-960h", + "facebook/wav2vec2-large-960h-lv60", + "facebook/wav2vec2-large-960h-lv60-self", + # See all Wav2Vec2 models at https://huggingface.co/models?filter=wav2vec2 +] + + +@dataclass +class Wav2Vec2ForPreTrainingOutput(ModelOutput): + """ + Output type of [`Wav2Vec2ForPreTraining`], with potential hidden states and attentions. + + Args: + loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official + paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss. + projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked + projected quantized states. + projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive + target vectors for contrastive loss. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . + diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . + """ + + loss: Optional[torch.FloatTensor] = None + projected_states: torch.FloatTensor = None + projected_quantized_states: torch.FloatTensor = None + codevector_perplexity: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + contrastive_loss: Optional[torch.FloatTensor] = None + diversity_loss: Optional[torch.FloatTensor] = None + + +def _compute_mask_indices( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + attention_mask: Optional[torch.LongTensor] = None, + min_masks: int = 0, +) -> np.ndarray: + """ + Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for + ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on + CPU as part of the preprocessing during training. + + Args: + shape: The shape for which to compute masks. This should be of a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. + mask_length: size of the mask + min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension. 
+ """ + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" + ) + + # epsilon is used for probabilistic rounding + epsilon = np.random.rand(1).item() + + def compute_num_masked_span(input_length): + """Given input length, compute how many spans should be masked""" + num_masked_span = int(mask_prob * input_length / mask_length + epsilon) + num_masked_span = max(num_masked_span, min_masks) + + # make sure num masked span <= sequence_length + if num_masked_span * mask_length > sequence_length: + num_masked_span = sequence_length // mask_length + + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + + return num_masked_span + + # compute number of masked spans in batch + input_lengths = ( + attention_mask.sum(-1).detach().tolist() + if attention_mask is not None + else [sequence_length for _ in range(batch_size)] + ) + + # SpecAugment mask to fill + spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool) + spec_aug_mask_idxs = [] + + max_num_masked_span = compute_num_masked_span(sequence_length) + + if max_num_masked_span == 0: + return spec_aug_mask + + for input_length in input_lengths: + # compute num of masked spans for this input + num_masked_span = compute_num_masked_span(input_length) + + # get random indices to mask + spec_aug_mask_idx = np.random.choice( + np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False + ) + + # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. 
+ if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] + + spec_aug_mask_idx = np.concatenate( + [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] + ) + spec_aug_mask_idxs.append(spec_aug_mask_idx) + + spec_aug_mask_idxs = np.array(spec_aug_mask_idxs) + + # expand masked indices to masked spans + spec_aug_mask_idxs = np.broadcast_to( + spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length) + ) + spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) + + # add offset to the starting indexes so that indexes now create a span + offsets = np.arange(mask_length)[None, None, :] + offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( + batch_size, max_num_masked_span * mask_length + ) + spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + + # scatter indices to mask + np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) + + return spec_aug_mask + + +def _sample_negative_indices( + features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None +): + """ + Sample `num_negatives` vectors from feature vectors. + """ + batch_size, sequence_length = features_shape + + # generate indices of the positive vectors themselves, repeat them `num_negatives` times + sequence_length_range = np.arange(sequence_length) + + # get `num_negatives` random vector indices from the same utterance + sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32) + + mask_time_indices = ( + mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool) + ) + + for batch_idx in range(batch_size): + high = mask_time_indices[batch_idx].sum() - 1 + mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]] + + feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives)) + sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives)) + # avoid sampling the same positive vector, but keep the distribution uniform + sampled_indices[sampled_indices >= feature_indices] += 1 + + # remap to actual indices + sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices] + + # correct for batch size + sampled_negative_indices[batch_idx] += batch_idx * sequence_length + + return sampled_negative_indices + + +class Wav2Vec2NoLayerNormConvLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.activation(hidden_states) + return 
hidden_states + + +class Wav2Vec2LayerNormConvLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + + hidden_states = hidden_states.transpose(-2, -1) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose(-2, -1) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Wav2Vec2GroupNormConvLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.activation = ACT2FN[config.feat_extract_activation] + + self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Wav2Vec2PositionalConvEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.hidden_size, + config.hidden_size, + kernel_size=config.num_conv_pos_embeddings, + padding=config.num_conv_pos_embeddings // 2, + groups=config.num_conv_pos_embedding_groups, + ) + + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0): + self.conv = weight_norm(self.conv, name="weight", dim=2) + deepspeed.zero.register_external_parameter(self, self.conv.weight_v) + deepspeed.zero.register_external_parameter(self, self.conv.weight_g) + else: + self.conv = weight_norm(self.conv, name="weight", dim=2) + + self.padding = Wav2Vec2SamePadLayer(config.num_conv_pos_embeddings) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = hidden_states.transpose(1, 2) + + hidden_states = self.conv(hidden_states) + hidden_states = self.padding(hidden_states) + hidden_states = self.activation(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Wav2Vec2SamePadLayer(nn.Module): + def __init__(self, num_conv_pos_embeddings): + super().__init__() + self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0 + + def forward(self, hidden_states): + if self.num_pad_remove > 0: + hidden_states = hidden_states[:, :, : -self.num_pad_remove] + return hidden_states + + +class Wav2Vec2FeatureEncoder(nn.Module): + """Construct the features from raw audio waveform""" + + def __init__(self, config): + super().__init__() + + if config.feat_extract_norm == "group": + conv_layers = [Wav2Vec2GroupNormConvLayer(config, layer_id=0)] 
+ [ + Wav2Vec2NoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1) + ] + elif config.feat_extract_norm == "layer": + conv_layers = [ + Wav2Vec2LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers) + ] + else: + raise ValueError( + f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']" + ) + self.conv_layers = nn.ModuleList(conv_layers) + self.gradient_checkpointing = False + self._requires_grad = True + + def _freeze_parameters(self): + for param in self.parameters(): + param.requires_grad = False + self._requires_grad = False + + def forward(self, input_values): + hidden_states = input_values[:, None] + + # make sure hidden_states require grad for gradient_checkpointing + if self._requires_grad and self.training: + hidden_states.requires_grad = True + + for conv_layer in self.conv_layers: + if self._requires_grad and self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(conv_layer), + hidden_states, + ) + else: + hidden_states = conv_layer(hidden_states) + + return hidden_states + + +class Wav2Vec2FeatureExtractor(Wav2Vec2FeatureEncoder): + def __init__(self, config): + super().__init__(config) + warnings.warn( + f"The class `{self.__class__.__name__}` has been depreciated " + "and will be removed in Transformers v5. " + f"Use `{self.__class__.__bases__[0].__name__}` instead.", + FutureWarning, + ) + + +class Wav2Vec2FeatureProjection(nn.Module): + def __init__(self, config): + super().__init__() + self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) + self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) + self.dropout = nn.Dropout(config.feat_proj_dropout) + + def forward(self, hidden_states): + # non-projected hidden states are needed for quantization + norm_hidden_states = self.layer_norm(hidden_states) + hidden_states = self.projection(norm_hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states, norm_hidden_states + + +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Wav2Vec2 +class Wav2Vec2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. 
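+        # Shape sketch with assumed sizes (bsz=2, num_heads=12, tgt_len=50, head_dim=64,
+        # so embed_dim=768): attn_output has gone (2*12, 50, 64) -> (2, 12, 50, 64)
+        # -> (2, 50, 12, 64) above, and the reshape below yields (2, 50, 768).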
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class Wav2Vec2FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + self.intermediate_dropout = nn.Dropout(config.activation_dropout) + + self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.output_dropout = nn.Dropout(config.hidden_dropout) + + def forward(self, hidden_states): + hidden_states = self.intermediate_dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.intermediate_dropout(hidden_states) + + hidden_states = self.output_dense(hidden_states) + hidden_states = self.output_dropout(hidden_states) + return hidden_states + + +class Wav2Vec2CondEncoderLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = Wav2Vec2Attention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feed_forward = Wav2Vec2FeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.condition_type = config.condition_type + self.condition_layer = FiLM(config.hidden_size, config.condition_size, "linear") + + + def forward(self, hidden_states, condition_features, attention_mask=None, output_attentions=False): + attn_residual = hidden_states + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = self.condition_layer(hidden_states, condition_features) + hidden_states = attn_residual + hidden_states + + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states + self.feed_forward(hidden_states) + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Wav2Vec2CondEncoderLayerStableLayerNorm(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = Wav2Vec2Attention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feed_forward = Wav2Vec2FeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if getattr(config, "adapter_attn_dim", None) is not None: + self.adapter_layer = Wav2Vec2AttnAdapterLayer(config) + else: + self.adapter_layer = None + + self.condition_type = config.condition_type + self.condition_layer = FiLM(config.hidden_size, config.condition_size, "linear") + + def forward( + self, + hidden_states: torch.Tensor, + condition_features: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ): + attn_residual = hidden_states + hidden_states = self.layer_norm(hidden_states) + 
hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = self.condition_layer(hidden_states, condition_features) + hidden_states = attn_residual + hidden_states + hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states)) + + if self.adapter_layer is not None: + hidden_states = hidden_states + self.adapter_layer(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Wav2Vec2CondEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.ModuleList([Wav2Vec2CondEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.tensor, + condition_features: torch.tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens output 0 + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0 + + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer or deepspeed_zero3_is_enabled: + # under deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + # create gradient checkpointing function + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = layer( + hidden_states, condition_features, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not 
None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class Wav2Vec2CondEncoderStableLayerNorm(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.ModuleList( + [Wav2Vec2CondEncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + condition_features, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens are not attended to + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0 + + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.dropout(hidden_states) + + deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer or deepspeed_zero3_is_enabled: + # under deepspeed zero3 all gpus must run in sync + # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication + if self.gradient_checkpointing and self.training: + # create gradient checkpointing function + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = layer( + hidden_states, condition_features, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class Wav2Vec2GumbelVectorQuantizer(nn.Module): + """ + Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH + GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information. 
+ """ + + def __init__(self, config): + super().__init__() + self.num_groups = config.num_codevector_groups + self.num_vars = config.num_codevectors_per_group + + if config.codevector_dim % self.num_groups != 0: + raise ValueError( + f"`config.codevector_dim {config.codevector_dim} must be divisible " + f"by `config.num_codevector_groups` {self.num_groups} for concatenation" + ) + + # storage for codebook variables (codewords) + self.codevectors = nn.Parameter( + torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups) + ) + self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars) + + # can be decayed for training + self.temperature = 2 + + @staticmethod + def _compute_perplexity(probs, mask=None): + if mask is not None: + mask_extended = mask.flatten()[:, None, None].expand(probs.shape) + probs = torch.where(mask_extended, probs, torch.zeros_like(probs)) + marginal_probs = probs.sum(dim=0) / mask.sum() + else: + marginal_probs = probs.mean(dim=0) + + perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum() + return perplexity + + def forward(self, hidden_states, mask_time_indices=None): + batch_size, sequence_length, hidden_size = hidden_states.shape + + # project to codevector dim + hidden_states = self.weight_proj(hidden_states) + hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1) + + if self.training: + # sample code vector probs via gumbel in differentiateable way + codevector_probs = nn.functional.gumbel_softmax( + hidden_states.float(), tau=self.temperature, hard=True + ).type_as(hidden_states) + + # compute perplexity + codevector_soft_dist = torch.softmax( + hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1 + ) + perplexity = self._compute_perplexity(codevector_soft_dist, mask_time_indices) + else: + # take argmax in non-differentiable way + # comptute hard codevector distribution (one hot) + codevector_idx = hidden_states.argmax(dim=-1) + codevector_probs = hidden_states.new_zeros(hidden_states.shape).scatter_( + -1, codevector_idx.view(-1, 1), 1.0 + ) + codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1) + + perplexity = self._compute_perplexity(codevector_probs, mask_time_indices) + + codevector_probs = codevector_probs.view(batch_size * sequence_length, -1) + # use probs to retrieve codevectors + codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors + codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1) + codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1) + + return codevectors, perplexity + + +class Wav2Vec2Adapter(nn.Module): + def __init__(self, config): + super().__init__() + + # feature dim might need to be down-projected + if config.output_hidden_size != config.hidden_size: + self.proj = nn.Linear(config.hidden_size, config.output_hidden_size) + self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size) + else: + self.proj = self.proj_layer_norm = None + + self.layers = nn.ModuleList(Wav2Vec2AdapterLayer(config) for _ in range(config.num_adapter_layers)) + self.layerdrop = config.layerdrop + + def forward(self, hidden_states): + # down project hidden_states if necessary + if self.proj is not None and self.proj_layer_norm is not None: + hidden_states = self.proj(hidden_states) + hidden_states = self.proj_layer_norm(hidden_states) + + hidden_states = 
hidden_states.transpose(1, 2) + + for layer in self.layers: + layerdrop_prob = np.random.random() + if not self.training or (layerdrop_prob > self.layerdrop): + hidden_states = layer(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Wav2Vec2AdapterLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.output_hidden_size, + 2 * config.output_hidden_size, + config.adapter_kernel_size, + stride=config.adapter_stride, + padding=1, + ) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = nn.functional.glu(hidden_states, dim=1) + + return hidden_states + + +class Wav2Vec2AttnAdapterLayer(nn.Module): + def __init__(self, config): + """ + Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed + up training throughput. + """ + super().__init__() + self.input_dim = config.adapter_attn_dim + self.hidden_dim = config.hidden_size + + self.norm = nn.LayerNorm(self.hidden_dim) + self.linear_1 = nn.Linear(self.hidden_dim, self.input_dim) + self.act_fn = nn.ReLU() + self.linear_2 = nn.Linear(self.input_dim, self.hidden_dim) + + def forward(self, hidden_states: torch.FloatTensor): + hidden_states = self.norm(hidden_states) + + hidden_states = self.linear_1(hidden_states) + hidden_states = self.act_fn(hidden_states) + hidden_states = self.linear_2(hidden_states) + + return hidden_states + + +class Wav2Vec2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = Wav2Vec2Config + base_model_prefix = "wav2vec2" + main_input_name = "input_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + # Wav2Vec2ForPreTraining last 2 linear layers need standard Linear init. 
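+        # Descriptive note: reset_parameters() applies the default nn.Linear init here,
+        # and _is_hf_initialized marks both projections so the model-wide weight-init
+        # pass does not overwrite them with the generic normal_(std=initializer_range)
+        # branch further down.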
+ if isinstance(module, Wav2Vec2ForPreTraining): + module.project_hid.reset_parameters() + module.project_q.reset_parameters() + module.project_hid._is_hf_initialized = True + module.project_q._is_hf_initialized = True + # gumbel softmax requires special init + elif isinstance(module, Wav2Vec2GumbelVectorQuantizer): + module.weight_proj.weight.data.normal_(mean=0.0, std=1) + module.weight_proj.bias.data.zero_() + nn.init.uniform_(module.codevectors) + elif isinstance(module, Wav2Vec2PositionalConvEmbedding): + nn.init.normal_( + module.conv.weight, + mean=0, + std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), + ) + nn.init.constant_(module.conv.bias, 0) + elif isinstance(module, Wav2Vec2FeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) + + def _get_feat_extract_output_lengths( + self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None + ): + """ + Computes the output length of the convolutional layers + """ + + add_adapter = self.config.add_adapter if add_adapter is None else add_adapter + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1 + + for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + if add_adapter: + for _ in range(self.config.num_adapter_layers): + input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride) + + return input_lengths + + def _get_feature_vector_attention_mask( + self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None + ): + # Effectively attention_mask.sum(-1), but not inplace to be able to run + # on inference mode. 
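+        # Illustrative example (assumed sizes): for an utterance whose features reduce to
+        # output_length 3 with feature_vector_length 5, a 1 is written at index 2 and the
+        # flip/cumsum/flip below turns [0, 0, 1, 0, 0] into the mask [1, 1, 1, 0, 0].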
+ non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1] + + output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) + output_lengths = output_lengths.to(torch.long) + + batch_size = attention_mask.shape[0] + + attention_mask = torch.zeros( + (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device + ) + # these two operations makes sure that all values before the output lengths idxs are attended to + attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1 + attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + return attention_mask + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (Wav2Vec2CondEncoder, Wav2Vec2CondEncoderStableLayerNorm, Wav2Vec2FeatureEncoder)): + module.gradient_checkpointing = value + + def _get_adapters(self): + if self.config.adapter_attn_dim is None: + raise ValueError(f"{self.__class__} has no adapter layers. Make sure to define `config.adapter_attn_dim`.") + + adapter_weights = {} + for name, module in self.named_modules(): + if isinstance(module, Wav2Vec2AttnAdapterLayer): + for param_name, param in module.named_parameters(): + adapter_weights[".".join([name, param_name])] = param + + if isinstance(self, Wav2Vec2ForCTC): + for name, param in self.lm_head.named_parameters(): + adapter_weights[".".join(["lm_head", name])] = param + + return adapter_weights + + def init_adapter_layers(self): + """ + (Re-)initialize attention adapter layers and lm head for adapter-only fine-tuning + """ + # init attention adapters + for module in self.modules(): + if isinstance(module, Wav2Vec2AttnAdapterLayer): + self._init_weights(module) + + # init lm head + if isinstance(self, Wav2Vec2ForCTC): + self._init_weights(self.lm_head) + + def load_adapter(self, target_lang: str, force_load=True, **kwargs): + r""" + Load a language adapter model from a pre-trained adapter model. + + Parameters: + target_lang (`str`): + Has to be a language id of an existing adapter weight. Adapter weights are stored in the format + adapter..safetensors or adapter..bin + force_load (`bool`, defaults to `True`): + Whether the weights shall be loaded even if `target_lang` matches `self.target_lang`. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + token (`str` or `bool`, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use + the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). 
+ revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + + + + To test a pull request you made on the Hub, you can pass `revision="refs/pr/". + + + + mirror (`str`, *optional*): + Mirror source to accelerate downloads in China. If you are from China and have an accessibility + problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. + Please refer to the mirror site for more information. + + + + Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to + use this method in a firewalled environment. + + + + Examples: + + ```python + >>> from transformers import Wav2Vec2ForCTC, AutoProcessor + + >>> ckpt = "facebook/mms-1b-all" + >>> processor = AutoProcessor.from_pretrained(ckpt) + >>> model = Wav2Vec2ForCTC.from_pretrained(ckpt, target_lang="eng") + >>> # set specific language + >>> processor.tokenizer.set_target_lang("spa") + >>> model.load_adapter("spa") + ``` + """ + if self.config.adapter_attn_dim is None: + raise ValueError(f"Cannot load_adapter for {target_lang} if `config.adapter_attn_dim` is not defined.") + + if target_lang == self.target_lang and not force_load: + logger.warning(f"Adapter weights are already set to {target_lang}.") + return + + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + token = kwargs.pop("token", None) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) + + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + model_path_or_id = self.config._name_or_path + state_dict = None + + # 1. Let's first try loading a safetensors adapter weight + if use_safetensors is not False: + filepath = WAV2VEC2_ADAPTER_SAFE_FILE.format(target_lang) + + try: + weight_path = cached_file( + model_path_or_id, + filename=filepath, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + cache_dir=cache_dir, + ) + + state_dict = safe_load_file(weight_path) + + except EnvironmentError: + if use_safetensors: + # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted + # to the original exception. + raise + + except Exception: + # For any other exception, we throw a generic error. + if use_safetensors: + raise EnvironmentError( + f"Can't load the model for '{model_path_or_id}'. If you were trying to load it" + " from 'https://huggingface.co/models', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{model_path_or_id}' is the correct path to a" + f" directory containing a file named {filepath}." + ) + + # 2. 
If this didn't work let's try loading a PyTorch adapter weight + if state_dict is None: + filepath = WAV2VEC2_ADAPTER_PT_FILE.format(target_lang) + + try: + weight_path = cached_file( + model_path_or_id, + filename=filepath, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + cache_dir=cache_dir, + ) + + state_dict = torch.load(weight_path, map_location="cpu") + + except EnvironmentError: + # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted + # to the original exception. + raise + + except Exception: + # For any other exception, we throw a generic error. + raise EnvironmentError( + f"Can't load the model for '{model_path_or_id}'. If you were trying to load it" + " from 'https://huggingface.co/models', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{model_path_or_id}' is the correct path to a" + f" directory containing a file named {filepath}." + ) + + adapter_weights = self._get_adapters() + unexpected_keys = set(state_dict.keys()) - set(adapter_weights.keys()) + missing_keys = set(adapter_weights.keys()) - set(state_dict.keys()) + + if len(unexpected_keys) > 0: + raise ValueError(f"The adapter weights {weight_path} has unexpected keys: {', '.join(unexpected_keys)}.") + elif len(missing_keys) > 0: + raise ValueError(f"The adapter weights {weight_path} has missing keys: {', '.join(missing_keys)}.") + + # make sure now vocab size is correct + target_vocab_size = state_dict["lm_head.weight"].shape[0] + if target_vocab_size != self.config.vocab_size: + self.lm_head = nn.Linear( + self.config.output_hidden_size, target_vocab_size, device=self.device, dtype=self.dtype + ) + self.config.vocab_size = target_vocab_size + + # make sure that adapter weights are put in exactly the same precision and device placement and overwritten adapter weights + state_dict = {k: v.to(adapter_weights[k]) for k, v in state_dict.items()} + self.load_state_dict(state_dict, strict=False) + + # set target language corectly + self.target_lang = target_lang + + +WAV_2_VEC_2_START_DOCSTRING = r""" + Wav2Vec2 was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech + Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael + Auli. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving etc.). + + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`Wav2Vec2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +WAV_2_VEC_2_INPUTS_DOCSTRING = r""" + Args: + input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file + into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install + soundfile`). 
To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and + conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details. + attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, + 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + + + `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask == + True`. For all models whose processor has `config.return_attention_mask == False`, such as + [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), `attention_mask` should **not** be + passed to avoid degraded performance when doing batched inference. For such models `input_values` should + simply be padded with 0 and passed without `attention_mask`. Be aware that these models also yield slightly + different results depending on whether `input_values` is padded or not. + + + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Wav2Vec2 Model transformer outputting raw hidden-states without any specific head on top.", + WAV_2_VEC_2_START_DOCSTRING, +) +class Wav2Vec2CondModel(Wav2Vec2PreTrainedModel): + def __init__(self, config: Wav2Vec2Config): + super().__init__(config) + self.config = config + self.feature_extractor = Wav2Vec2FeatureEncoder(config) + self.feature_projection = Wav2Vec2FeatureProjection(config) + + # model only needs masking vector if mask prob is > 0.0 + if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: + self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_()) + + if config.do_stable_layer_norm: + self.encoder = Wav2Vec2CondEncoderStableLayerNorm(config) + else: + self.encoder = Wav2Vec2CondEncoder(config) + + self.adapter = Wav2Vec2Adapter(config) if config.add_adapter else None + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. 
+ """ + self.feature_extractor._freeze_parameters() + + def _mask_hidden_states( + self, + hidden_states: torch.FloatTensor, + mask_time_indices: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + ): + """ + Masks extracted features along time axis and/or along feature axis according to + [SpecAugment](https://arxiv.org/abs/1904.08779). + """ + + # `config.apply_spec_augment` can set masking to False + if not getattr(self.config, "apply_spec_augment", True): + return hidden_states + + # generate indices & apply SpecAugment along time axis + batch_size, sequence_length, hidden_size = hidden_states.size() + + if mask_time_indices is not None: + # apply SpecAugment along time axis with given mask_time_indices + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + elif self.config.mask_time_prob > 0 and self.training: + mask_time_indices = _compute_mask_indices( + (batch_size, sequence_length), + mask_prob=self.config.mask_time_prob, + mask_length=self.config.mask_time_length, + attention_mask=attention_mask, + min_masks=self.config.mask_time_min_masks, + ) + mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + + if self.config.mask_feature_prob > 0 and self.training: + # generate indices & apply SpecAugment along feature axis + mask_feature_indices = _compute_mask_indices( + (batch_size, hidden_size), + mask_prob=self.config.mask_feature_prob, + mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, + ) + mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) + mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + hidden_states[mask_feature_indices] = 0 + + return hidden_states + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Wav2Vec2BaseModelOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + input_values: Optional[torch.Tensor], + condition_features: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + mask_time_indices: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Wav2Vec2BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + extract_features = self.feature_extractor(input_values) + extract_features = extract_features.transpose(1, 2) + + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + hidden_states, extract_features = self.feature_projection(extract_features) + hidden_states = self._mask_hidden_states( + hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask + ) + + encoder_outputs = self.encoder( + hidden_states, + 
condition_features=condition_features, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + + if self.adapter is not None: + hidden_states = self.adapter(hidden_states) + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] + + return Wav2Vec2BaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings("""Wav2Vec2 Model with a quantizer and `VQ` head on top.""", WAV_2_VEC_2_START_DOCSTRING) +class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel): + def __init__(self, config: Wav2Vec2Config): + super().__init__(config) + self.wav2vec2 = Wav2Vec2Model(config) + self.dropout_features = nn.Dropout(config.feat_quantizer_dropout) + + self.quantizer = Wav2Vec2GumbelVectorQuantizer(config) + + self.project_hid = nn.Linear(config.hidden_size, config.proj_codevector_dim) + self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim) + + # Initialize weights and apply final processing + self.post_init() + + def set_gumbel_temperature(self, temperature: int): + """ + Set the Gumbel softmax temperature to a given value. Only necessary for training + """ + self.quantizer.temperature = temperature + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2.feature_extractor._freeze_parameters() + + @staticmethod + def compute_contrastive_logits( + target_features: torch.FloatTensor, + negative_features: torch.FloatTensor, + predicted_features: torch.FloatTensor, + temperature: int = 0.1, + ): + """ + Compute logits for contrastive loss based using cosine similarity as the distance measure between + `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied. 
+ """ + target_features = torch.cat([target_features, negative_features], dim=0) + + logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as( + target_features + ) + + # apply temperature + logits = logits / temperature + return logits + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Wav2Vec2ForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + mask_time_indices: Optional[torch.BoolTensor] = None, + sampled_negative_indices: Optional[torch.BoolTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Wav2Vec2ForPreTrainingOutput]: + r""" + mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict + masked extracted features in *config.proj_codevector_dim* space. + sampled_negative_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*): + Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss. + Required input for pre-training. + + Returns: + + Example: + + ```python + >>> import torch + >>> from transformers import AutoFeatureExtractor, Wav2Vec2ForPreTraining + >>> from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices, _sample_negative_indices + >>> from datasets import load_dataset + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") + >>> model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1 + + >>> # compute masked indices + >>> batch_size, raw_sequence_length = input_values.shape + >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item() + >>> mask_time_indices = _compute_mask_indices( + ... shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2 + ... ) + >>> sampled_negative_indices = _sample_negative_indices( + ... features_shape=(batch_size, sequence_length), + ... num_negatives=model.config.num_negatives, + ... mask_time_indices=mask_time_indices, + ... ) + >>> mask_time_indices = torch.tensor(data=mask_time_indices, device=input_values.device, dtype=torch.long) + >>> sampled_negative_indices = torch.tensor( + ... data=sampled_negative_indices, device=input_values.device, dtype=torch.long + ... ) + + >>> with torch.no_grad(): + ... outputs = model(input_values, mask_time_indices=mask_time_indices) + + >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) + >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) + + >>> # show that cosine similarity is much higher than random + >>> cosine_sim[mask_time_indices.to(torch.bool)].mean() > 0.5 + tensor(True) + + >>> # for contrastive loss training model should be put into train mode + >>> model = model.train() + >>> loss = model( + ... input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices + ... 
).loss + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if mask_time_indices is not None: + mask_time_indices = mask_time_indices.to(torch.bool) + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + mask_time_indices=mask_time_indices, + return_dict=return_dict, + ) + + # 1. project all transformed features (including masked) to final vq dim + transformer_features = self.project_hid(outputs[0]) + + # 2. quantize all (unmasked) extracted features and project to final vq dim + extract_features = self.dropout_features(outputs[1]) + + if attention_mask is not None: + # compute reduced attention_mask correponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + quantized_features, codevector_perplexity = self.quantizer( + extract_features, mask_time_indices=mask_time_indices + ) + quantized_features = self.project_q(quantized_features) + + loss = contrastive_loss = diversity_loss = None + if sampled_negative_indices is not None: + batch_size, sequence_length, hidden_size = quantized_features.shape + + # for training, we sample negatives + # 3. sample K negatives (distractors) quantized states for contrastive loss + # if attention_mask is passed, make sure that padded feature vectors cannot be sampled + # sample negative quantized vectors BTC => (BxT)C + negative_quantized_features = quantized_features.view(-1, hidden_size)[ + sampled_negative_indices.long().view(-1) + ] + negative_quantized_features = negative_quantized_features.view( + batch_size, sequence_length, -1, hidden_size + ).permute(2, 0, 1, 3) + + # 4. compute logits, corresponding to `logs = sim(c_t, [q_t, \sim{q}_t]) / \kappa` + # of equation (3) in https://arxiv.org/pdf/2006.11477.pdf + logits = self.compute_contrastive_logits( + quantized_features[None, :], + negative_quantized_features, + transformer_features, + self.config.contrastive_logits_temperature, + ) + + # 5. if a negative vector is identical to the positive (i.e. when codebook utilization is low), + # its cosine similarity will be masked + neg_is_pos = (quantized_features == negative_quantized_features).all(-1) + + if neg_is_pos.any(): + logits[1:][neg_is_pos] = float("-inf") + + # 6. compute contrastive loss \mathbf{L}_m = cross_entropy(logs) = + # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa)) + logits = logits.transpose(0, 2).reshape(-1, logits.size(0)) + target = ((1 - mask_time_indices.long()) * -100).transpose(0, 1).flatten() + + contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum") + # 7. compute diversity loss: \mathbf{L}_d + num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups + diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum() + + # 8. 
\mathbf{L} = \mathbf{L}_m + \alpha * \mathbf{L}_d + loss = contrastive_loss + self.config.diversity_loss_weight * diversity_loss + + if not return_dict: + if loss is not None: + return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + + return Wav2Vec2ForPreTrainingOutput( + loss=loss, + projected_states=transformer_features, + projected_quantized_states=quantized_features, + codevector_perplexity=codevector_perplexity, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + contrastive_loss=contrastive_loss, + diversity_loss=diversity_loss, + ) + + +@add_start_docstrings("""Wav2Vec2 Model with a `language modeling` head on top.""", WAV_2_VEC_2_START_DOCSTRING) +class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + warnings.warn( + "The class `Wav2Vec2ForMaskedLM` is deprecated. Please use `Wav2Vec2ForCTC` instead.", FutureWarning + ) + + self.wav2vec2 = Wav2Vec2Model(config) + self.dropout = nn.Dropout(config.final_dropout) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + def forward( + self, + input_values: torch.FloatTensor, + attention_mask: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[Tuple, MaskedLMOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.wav2vec2( + input_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.lm_head(hidden_states) + + if not return_dict: + output = (logits,) + outputs[2:] + return output + + return MaskedLMOutput(logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions) + + +@add_start_docstrings( + """Wav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", + WAV_2_VEC_2_START_DOCSTRING, +) +class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel): + def __init__(self, config, target_lang: Optional[str] = None): + super().__init__(config) + + self.wav2vec2 = Wav2Vec2Model(config) + self.dropout = nn.Dropout(config.final_dropout) + + self.target_lang = target_lang + + if config.vocab_size is None: + raise ValueError( + f"You are trying to instantiate {self.__class__} with a configuration that " + "does not define the vocabulary size of the language model head. Please " + "instantiate the model as follows: `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)`. " + "or define `vocab_size` of your model's configuration." + ) + output_hidden_size = ( + config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size + ) + self.lm_head = nn.Linear(output_hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + def tie_weights(self): + """ + This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when + passing `target_lang=...` to `from_pretrained(...)`. 
+ + This method is **not** supposed to be called by the user and is prone to be changed in the future. + """ + + # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to + # correctly load adapter layers for Wav2Vec2 so that we do not have to introduce a new API to + # [`PreTrainedModel`]. While slightly hacky, Wav2Vec2 never has to tie input and output embeddings, so that it is + # ok to repurpose this function here. + target_lang = self.target_lang + + if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None: + raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.") + elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None: + logger.info("By default `target_lang` is set to 'eng'.") + elif target_lang is not None: + self.load_adapter(target_lang, force_load=True) + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.wav2vec2.parameters(): + param.requires_grad = False + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_CTC_EXPECTED_OUTPUT, + expected_loss=_CTC_EXPECTED_LOSS, + ) + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[Tuple, CausalLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*): + Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to + the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. + All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size - 1]`. 
+ """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states) + + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + if labels.max() >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + + # retrieve loss input_lengths from attention_mask + attention_mask = ( + attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long) + ) + input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long) + + # assuming that padded tokens are filled with -100 + # when not being attended to + labels_mask = labels >= 0 + target_lengths = labels_mask.sum(-1) + flattened_targets = labels.masked_select(labels_mask) + + # ctc_loss doesn't support fp16 + log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1) + + with torch.backends.cudnn.flags(enabled=False): + loss = nn.functional.ctc_loss( + log_probs, + flattened_targets, + input_lengths, + target_lengths, + blank=self.config.pad_token_id, + reduction=self.config.ctc_loss_reduction, + zero_infinity=self.config.ctc_zero_infinity, + ) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +@add_start_docstrings( + """ + Wav2Vec2 Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like + SUPERB Keyword Spotting. + """, + WAV_2_VEC_2_START_DOCSTRING, +) +class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Sequence classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)" + ) + self.wav2vec2 = Wav2Vec2Model(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) + self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. 
+ """ + self.wav2vec2.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.wav2vec2.parameters(): + param.requires_grad = False + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_SEQ_CLASS_CHECKPOINT, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, + ) + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[Tuple, SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + hidden_states = self.projector(hidden_states) + if attention_mask is None: + pooled_output = hidden_states.mean(dim=1) + else: + padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) + hidden_states[~padding_mask] = 0.0 + pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Wav2Vec2 Model with a frame classification head on top for tasks like Speaker Diarization. 
+ """, + WAV_2_VEC_2_START_DOCSTRING, +) +class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Audio frame classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)" + ) + self.wav2vec2 = Wav2Vec2Model(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.num_labels = config.num_labels + + self.init_weights() + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.wav2vec2.parameters(): + param.requires_grad = False + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_FRAME_CLASS_CHECKPOINT, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_FRAME_EXPECTED_OUTPUT, + ) + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1)) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class AMSoftmaxLoss(nn.Module): + def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4): + super(AMSoftmaxLoss, self).__init__() + self.scale = scale + self.margin = margin + self.num_labels = num_labels + self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True) + self.loss = nn.CrossEntropyLoss() + + def forward(self, hidden_states, labels): + labels = labels.flatten() + weight = nn.functional.normalize(self.weight, dim=0) + hidden_states = nn.functional.normalize(hidden_states, dim=1) + cos_theta = torch.mm(hidden_states, weight) + psi = cos_theta - self.margin + + onehot = nn.functional.one_hot(labels, self.num_labels) + logits = self.scale * torch.where(onehot.bool(), psi, cos_theta) + loss = self.loss(logits, labels) + + return loss + + +class TDNNLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id] + self.out_conv_dim = config.tdnn_dim[layer_id] + self.kernel_size = config.tdnn_kernel[layer_id] + self.dilation = config.tdnn_dilation[layer_id] + + self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim) + self.activation = nn.ReLU() + + def forward(self, hidden_states): + hidden_states = hidden_states.unsqueeze(1) + hidden_states = nn.functional.unfold( + hidden_states, + (self.kernel_size, self.in_conv_dim), + stride=(1, self.in_conv_dim), + dilation=(self.dilation, 1), + ) + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.kernel(hidden_states) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +@add_start_docstrings( + """ + Wav2Vec2 Model with an XVector feature extraction head on top for tasks like Speaker Verification. 
+ """, + WAV_2_VEC_2_START_DOCSTRING, +) +class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.wav2vec2 = Wav2Vec2Model(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0]) + + tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))] + self.tdnn = nn.ModuleList(tdnn_layers) + + self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim) + self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim) + + self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels) + + self.init_weights() + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.wav2vec2.parameters(): + param.requires_grad = False + + def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]): + """ + Computes the output length of the TDNN layers + """ + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return (input_length - kernel_size) // stride + 1 + + for kernel_size in self.config.tdnn_kernel: + input_lengths = _conv_out_length(input_lengths, kernel_size, 1) + + return input_lengths + + @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_XVECTOR_CHECKPOINT, + output_type=XVectorOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_XVECTOR_EXPECTED_OUTPUT, + ) + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[Tuple, XVectorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + hidden_states = self.projector(hidden_states) + + for tdnn_layer in self.tdnn: + hidden_states = tdnn_layer(hidden_states) + + # Statistic Pooling + if attention_mask is None: + mean_features = hidden_states.mean(dim=1) + std_features = hidden_states.std(dim=1) + else: + feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1)) + tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths) + mean_features = [] + std_features = [] + for i, length in enumerate(tdnn_output_lengths): + mean_features.append(hidden_states[i, :length].mean(dim=0)) + std_features.append(hidden_states[i, :length].std(dim=0)) + mean_features = torch.stack(mean_features) + std_features = torch.stack(std_features) + statistic_pooling = torch.cat([mean_features, std_features], dim=-1) + + output_embeddings = self.feature_extractor(statistic_pooling) + logits = self.classifier(output_embeddings) + + loss = None + if labels is not None: + loss = self.objective(logits, labels) + + if not return_dict: + output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return XVectorOutput( + loss=loss, + logits=logits, + embeddings=output_embeddings, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) \ No newline at end of file From 9022d8af75c030098477e797d9fce85edd4ea778 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 30 Sep 2023 19:08:07 +0000 Subject: [PATCH 88/89] update FiLM Wav2vec2 --- .../wav2transducer/hf_wav2rnn_film_transducer.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py index 77579c94..b0a0bfea 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_film_transducer.py @@ -81,12 +81,12 @@ def _make_fuser(self): layer_dim, bias=False) - def _fuse_hid_feats(self, hid_feats, lang): + def _fuse_hid_feats(self, hid_feats, lang_condition): """Fuses the hidden features from the Wav2Vec model. Args: hid_feats: list of hidden features Tensors from Wav2Vec model. - lang: language id Tensor. + lang_condition: language condition Tensor. 
        Returns:
          Tensor of fused features (batch, channels, time)
 
@@ -95,7 +95,6 @@ def _fuse_hid_feats(self, hid_feats, lang):
             # There is only one layer of features
             return hid_feats[0]
 
-        lang_condition = self.transducer.decoder.lang_embedding(lang)
         hid_feats = hid_feats[self.feat_fusion_start:]
         if self.feat_fusion_method == "film-weighted-avg":
             film_hid_feats = tuple(self.films[i](hid_feats[i], lang_condition) for i in range(len(self.films)))
@@ -129,12 +128,17 @@ def forward_feats(self,
                       return_feat_layers=None,
                       chunk_length=0,
                       detach_chunks=False):
+
+
+        lang_condition = self.transducer.decoder.lang_embedding(lang)
+
         return_hid_states = (False if return_feat_layers is None
                              and self.feat_fusion_method == "last" else True)
         with self._hf_context:
             hf_output = self.hf_feats(
                 x,
                 x_lengths,
+                condition_features=lang_condition,
                 return_hid_states=return_hid_states,
                 chunk_length=chunk_length,
                 detach_chunks=detach_chunks,
@@ -142,7 +146,7 @@ def forward_feats(self,
         feat_lengths = hf_output["hidden_states_lengths"]
         if return_hid_states:
             hid_feats = hf_output["hidden_states"]
-            feats = self._fuse_hid_feats(hid_feats, lang)
+            feats = self._fuse_hid_feats(hid_feats, lang_condition)
         else:
             hid_feats = None
             feats = hf_output["last_hidden_state"]

From 27fffa03aaa69c59eb4fd21e653db48777efd609 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Sat, 7 Oct 2023 01:21:15 +0000
Subject: [PATCH 89/89] add character-based model for ASR

---
 hyperion/bin/decode_wav2vec2rnn_transducer.py | 14 ++++++--
 hyperion/torch/data/audio_dataset.py          | 21 +++++++++---
 hyperion/torch/data/char_piece.py             | 34 +++++++++++++++++++
 3 files changed, 61 insertions(+), 8 deletions(-)
 create mode 100644 hyperion/torch/data/char_piece.py

diff --git a/hyperion/bin/decode_wav2vec2rnn_transducer.py b/hyperion/bin/decode_wav2vec2rnn_transducer.py
index 33aea8c3..b1af102b 100755
--- a/hyperion/bin/decode_wav2vec2rnn_transducer.py
+++ b/hyperion/bin/decode_wav2vec2rnn_transducer.py
@@ -27,6 +27,7 @@
 from hyperion.io import SequentialAudioReader as AR
 from hyperion.np.augment import SpeechAugment
 from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.data.char_piece import CharPieceProcessor
 from hyperion.torch.models import HFWav2Vec2RNNTransducer
 from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search
 from hyperion.torch.narchs import AudioFeatsMVN as AF
@@ -133,9 +134,16 @@ def decode_transducer(
     device = init_device(use_gpu)
     model = load_model(model_path, device)
 
-    logging.info("bpe-model=%s", bpe_model)
-    sp = spm.SentencePieceProcessor()
-    sp.load(bpe_model)
+
+
+    if bpe_model.endswith(".txt"):
+        logging.info("loading char piece file %s", bpe_model)
+        sp = CharPieceProcessor()
+        sp.load(open(bpe_model).read().split())
+    else:
+        logging.info("bpe-model=%s", bpe_model)
+        sp = spm.SentencePieceProcessor()
+        sp.load(bpe_model)
 
     infer_args = HFWav2Vec2RNNTransducer.filter_infer_args(**infer_args)
     logging.info(f"infer-args={infer_args}")
diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py
index 45526284..5e604e6a 100644
--- a/hyperion/torch/data/audio_dataset.py
+++ b/hyperion/torch/data/audio_dataset.py
@@ -25,7 +25,7 @@
 from ...utils.segment_set import SegmentSet
 from ...utils.text import read_text
 from ..torch_defs import floatstr_torch
-
+from .char_piece import CharPieceProcessor
 
 class AudioDataset(Dataset):
     def __init__(
@@ -115,10 +115,21 @@ def _load_legacy_durations(self, time_durs_file):
         ].class_id.values.astype(float, copy=False)
 
     def _load_bpe_model(self, bpe_model,
                         is_val):
-        if self.rank == 0:
-            logging.info("loading bpe file %s", bpe_model)
-        self.sp = spm.SentencePieceProcessor()
-        self.sp.load(bpe_model)
+        # if bpe_model ends with .txt, it is a char piece model
+        # if bpe_model ends with .model, it is a sentence piece model
+        if bpe_model.endswith(".txt"):
+            if self.rank == 0:
+                logging.info("loading char piece file %s", bpe_model)
+            self.sp = CharPieceProcessor()
+            self.sp.load(open(bpe_model).read().split())
+        else:
+            if self.rank == 0:
+                logging.info("loading bpe file %s", bpe_model)
+            self.sp = spm.SentencePieceProcessor()
+            self.sp.load(bpe_model)
+
+
+
         blank_id = self.sp.piece_to_id("<blk>")
         vocab_size = self.sp.get_piece_size()
diff --git a/hyperion/torch/data/char_piece.py b/hyperion/torch/data/char_piece.py
new file mode 100644
index 00000000..43c07619
--- /dev/null
+++ b/hyperion/torch/data/char_piece.py
@@ -0,0 +1,34 @@
+import logging
+
+class CharPieceProcessor:
+    def __init__(self):
+        self.token2id = {}
+        self.id2token = {}
+
+    def load(self, token_list):
+        for idx, token in enumerate(token_list):
+            self.token2id[token] = idx
+            self.id2token[idx] = token
+        logging.info("Loaded {} tokens".format(len(self.token2id)))
+        logging.info("First 10 tokens: {}".format(list(self.token2id.keys())[:10]))
+        return True
+
+
+    def piece_to_id(self, token):
+        return self.token2id.get(token, self.token2id["<unk>"])
+
+    def id_to_piece(self, idx):
+        return self.id2token.get(idx, "<unk>")
+
+    def encode_as_pieces(self, text):
+        return [char for char in text]
+
+    def encode(self, text, out_type=int):
+        assert out_type in [int]
+        return [self.piece_to_id(char) for char in text]
+
+    def decode(self, ids):
+        return ''.join([self.id_to_piece(idx) for idx in ids])
+
+    def get_piece_size(self):
+        return len(self.token2id)
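
Note (not part of the patch): the snippet below is a minimal usage sketch of the new CharPieceProcessor. It builds a whitespace-separated character vocabulary file and round-trips a short string through encode()/decode(). The char_vocab.txt file name, the <blk>/<unk> special tokens, and the toy transcripts are illustrative assumptions; the recipe produces its own vocabulary file elsewhere.

    # Usage sketch (illustrative only): build a character vocabulary file and
    # round-trip a sentence through the new CharPieceProcessor.
    # The file name, the <blk>/<unk> special tokens, and the toy transcripts are
    # assumptions for this example, not taken from the recipe.
    from hyperion.torch.data.char_piece import CharPieceProcessor

    transcripts = ["hola mundo", "buenos dias"]  # stand-in for CommonVoice transcripts

    # Whitespace characters cannot survive the whitespace-based loader below,
    # so only non-space characters go into the vocabulary.
    chars = sorted({c for text in transcripts for c in text if not c.isspace()})
    tokens = ["<blk>", "<unk>"] + chars  # special tokens first

    with open("char_vocab.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(tokens))

    sp = CharPieceProcessor()
    sp.load(open("char_vocab.txt", encoding="utf-8").read().split())

    ids = sp.encode("hola", out_type=int)   # character ids, depend on vocabulary order
    print(ids)
    print(sp.decode(ids))                   # -> "hola"
    print(sp.piece_to_id("<blk>"), sp.get_piece_size())

Because both the decode script and the dataset loader read the vocabulary with open(...).read().split(), a literal space character cannot appear as a token, which is worth keeping in mind if word boundaries are meant to be modeled explicitly.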